@@ -325,3 +325,121 @@ def test_triton_kernel_mode_off(self):
325325 edge_program_manager ,
326326 "SDPA kernel export with triton_kernel_mode=OFF failed" ,
327327 )
328+
def test_device_info_propagated_to_cuda_delegate_outputs(self):
    """
    Check that CUDA device info survives from export through emission.

    The flow under test, as described by the pipeline this test targets:
    1. CudaPartitioner attaches a target_device="cuda:0" CompileSpec
    2. PropagateDevicePass marks delegate-output TensorSpecs as CUDA
    3. The emitter writes that into ExtraTensorInfo.device_type
    4. The emitted tensors therefore report DeviceType.CUDA

    Note: this only verifies the device *tag*. Tensor memory is still
    CPU-resident at this stage; the CUDA backend is expected to move data
    to the GPU at runtime.
    """
    from executorch.exir import schema

    class AddModule(torch.nn.Module):
        def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
            return x + y

    def tagged_as_cuda(tensor) -> bool:
        # A tensor without extra_tensor_info carries no device tag at all.
        info = tensor.extra_tensor_info
        return info is not None and info.device_type == schema.DeviceType.CUDA

    model = AddModule().eval()
    example_inputs = (torch.randn(2, 3), torch.randn(2, 3))

    # Run the full export + lowering pipeline for the CUDA backend.
    lowered = self._export_to_cuda_with_lower(model, example_inputs)
    self.assertIsNotNone(lowered, "CUDA export failed")

    # Emit the ExecuTorch program and take its first execution plan.
    emitted_program = lowered.to_executorch()._emitter_output.program
    first_plan = emitted_program.execution_plan[0]

    # Without a delegate there is nothing that could produce CUDA outputs.
    self.assertGreater(
        len(first_plan.delegates),
        0,
        "Expected at least one delegate in the execution plan",
    )

    # Gather every emitted tensor value that is tagged with the CUDA device.
    cuda_tagged = [
        entry.val
        for entry in first_plan.values
        if isinstance(entry.val, schema.Tensor) and tagged_as_cuda(entry.val)
    ]

    # The add result is a delegate output, so at least one tensor must
    # carry the CUDA tag.
    self.assertGreater(
        len(cuda_tagged),
        0,
        "Expected at least 1 tensor with CUDA device type for delegate output. "
        "Device info should be propagated from CudaPartitioner through "
        "PropagateDevicePass to the serialized tensor.",
    )
390+
def test_input_tensors_remain_cpu_device(self):
    """
    Test that input tensors (not delegate outputs) remain on CPU device.

    Input tensors are provided by the user and are not produced by delegates,
    so they should not be tagged with CUDA device info. Only delegate outputs
    should have device info propagated.

    Besides the coarse "some CPU tensors and some CUDA tensors exist" checks,
    this test inspects the values referenced by the execution plan's
    ``inputs`` list directly, so that it fails if a program input is ever
    tagged as CUDA.
    """
    from executorch.exir import schema

    class AddModule(torch.nn.Module):
        def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
            return x + y

    def is_cuda_tagged(tensor) -> bool:
        # No extra_tensor_info means no device tag, i.e. the CPU default.
        info = tensor.extra_tensor_info
        return info is not None and info.device_type == schema.DeviceType.CUDA

    module = AddModule()
    module.eval()
    inputs = (torch.randn(2, 3), torch.randn(2, 3))

    # Export to CUDA; fail with a clear message instead of an
    # AttributeError further down if the export returned nothing.
    edge_program_manager = self._export_to_cuda_with_lower(module, inputs)
    self.assertIsNotNone(edge_program_manager, "CUDA export failed")

    et_prog = edge_program_manager.to_executorch()
    program = et_prog._emitter_output.program

    plan = program.execution_plan[0]

    # Split all emitted tensors by device tag.
    tensors = [
        value.val for value in plan.values if isinstance(value.val, schema.Tensor)
    ]
    cuda_tensors = [tensor for tensor in tensors if is_cuda_tagged(tensor)]
    # Either no extra_tensor_info or device_type is CPU (default)
    cpu_tensors = [tensor for tensor in tensors if not is_cuda_tagged(tensor)]

    # We should have both CPU tensors (inputs) and CUDA tensors (delegate outputs).
    # The exact count depends on the model structure.
    self.assertGreater(
        len(cpu_tensors),
        0,
        "Expected CPU tensors for model inputs",
    )
    self.assertGreater(
        len(cuda_tensors),
        0,
        "Expected CUDA tensors for delegate outputs",
    )

    # The checks above would still pass if an input were tagged CUDA while
    # some unrelated tensor stayed on CPU, so look at the plan inputs
    # themselves. plan.inputs holds indices into plan.values.
    input_tensor_indices = [
        index
        for index in plan.inputs
        if isinstance(plan.values[index].val, schema.Tensor)
    ]
    self.assertGreater(
        len(input_tensor_indices),
        0,
        "Expected the execution plan to expose tensor inputs",
    )
    cuda_input_indices = [
        index
        for index in input_tensor_indices
        if is_cuda_tagged(plan.values[index].val)
    ]
    self.assertEqual(
        cuda_input_indices,
        [],
        "Program input tensors must not be tagged with CUDA device type",
    )
0 commit comments