diff --git a/extension/tensor/targets.bzl b/extension/tensor/targets.bzl
index c8bf2847dcf..6a5c40f9857 100644
--- a/extension/tensor/targets.bzl
+++ b/extension/tensor/targets.bzl
@@ -24,6 +24,7 @@ def define_common_targets():
             ],
             visibility = ["PUBLIC"],
             deps = [
+                "//executorch/runtime/core:device_allocator",
                 "//executorch/runtime/core/exec_aten/util:dim_order_util" + aten_suffix,
                 "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix,
             ],
diff --git a/extension/tensor/tensor_ptr.cpp b/extension/tensor/tensor_ptr.cpp
index bb76311bd67..a070716037f 100644
--- a/extension/tensor/tensor_ptr.cpp
+++ b/extension/tensor/tensor_ptr.cpp
@@ -10,6 +10,7 @@
 
 #include <numeric>
 
+#include <executorch/runtime/core/device_allocator.h>
 #include <executorch/runtime/core/exec_aten/util/tensor_util.h>
 
 namespace executorch {
@@ -61,7 +62,9 @@ TensorPtr make_tensor_ptr(
     std::vector<executorch::aten::StridesType> strides,
     executorch::aten::ScalarType type,
     executorch::aten::TensorShapeDynamism dynamism,
-    std::function<void(void*)> deleter) {
+    std::function<void(void*)> deleter,
+    runtime::etensor::DeviceType device_type,
+    runtime::etensor::DeviceIndex device_index) {
   const auto dim = sizes.size();
   ET_CHECK_MSG(
       dim_order.empty() || dim_order.size() == dim,
@@ -101,6 +104,7 @@ TensorPtr make_tensor_ptr(
 
   strides = std::move(computed_strides);
 
+  TensorPtr cpu_tensor;
 #ifndef USE_ATEN_LIB
   executorch::aten::TensorImpl tensor_impl(
       type,
@@ -116,9 +120,9 @@ TensorPtr make_tensor_ptr(
       std::move(dim_order),
       std::move(strides),
       std::move(deleter));
-  const auto tensor_ptr = &storage->tensor;
-  return std::shared_ptr<executorch::aten::Tensor>(
-      std::move(storage), tensor_ptr);
+  const auto raw_tensor_ptr = &storage->tensor;
+  cpu_tensor = std::shared_ptr<executorch::aten::Tensor>(
+      std::move(storage), raw_tensor_ptr);
 #else
   auto options = c10::TensorOptions()
                      .dtype(c10::scalarTypeToTypeMeta(type))
@@ -136,8 +140,13 @@ TensorPtr make_tensor_ptr(
       c10::DispatchKeySet(c10::DispatchKey::CPU),
       options.dtype());
   tensor_impl->set_sizes_and_strides(sizes, strides);
-  return std::make_shared<executorch::aten::Tensor>(std::move(tensor_impl));
+  cpu_tensor =
+      std::make_shared<executorch::aten::Tensor>(std::move(tensor_impl));
 #endif // USE_ATEN_LIB
+  if (device_type != runtime::etensor::DeviceType::CPU) {
+    return clone_tensor_ptr_to_device(cpu_tensor, device_type, device_index);
+  }
+  return cpu_tensor;
 }
 
 TensorPtr make_tensor_ptr(
@@ -146,7 +155,9 @@ TensorPtr make_tensor_ptr(
     std::vector<executorch::aten::DimOrderType> dim_order,
     std::vector<executorch::aten::StridesType> strides,
     executorch::aten::ScalarType type,
-    executorch::aten::TensorShapeDynamism dynamism) {
+    executorch::aten::TensorShapeDynamism dynamism,
+    runtime::etensor::DeviceType device_type,
+    runtime::etensor::DeviceIndex device_index) {
   ET_CHECK_MSG(
       data.size() ==
           executorch::aten::compute_numel(sizes.data(), sizes.size()) *
@@ -161,7 +172,9 @@ TensorPtr make_tensor_ptr(
       type,
       dynamism,
       // Data is moved into the deleter and is destroyed together with Storage.
-      [data = std::move(data)](void*) {});
+      [data = std::move(data)](void*) {},
+      device_type,
+      device_index);
 }
 
 TensorPtr clone_tensor_ptr(
@@ -248,5 +261,179 @@ runtime::Error resize_tensor_ptr(
           sizes.data(), sizes.size()));
 }
 
+// ---- Device tensor helpers ----
+
+namespace {
+
+#ifndef USE_ATEN_LIB
+struct DeviceStorage final { // Tensor + its impl, metadata vectors and deleter
+  executorch::aten::TensorImpl tensor_impl;
+  executorch::aten::Tensor tensor; // wraps &tensor_impl (set in the constructor)
+  std::vector<executorch::aten::SizesType> sizes;
+  std::vector<executorch::aten::DimOrderType> dim_order;
+  std::vector<executorch::aten::StridesType> strides;
+  std::function<void(void*)> deleter; // called with the data pointer on teardown
+  // Forwards every argument with std::move into the member of the same name.
+  DeviceStorage(
+      executorch::aten::TensorImpl&& tensor_impl,
+      std::vector<executorch::aten::SizesType>&& sizes,
+      std::vector<executorch::aten::DimOrderType>&& dim_order,
+      std::vector<executorch::aten::StridesType>&& strides,
+      std::function<void(void*)>&& deleter)
+      : tensor_impl(std::move(tensor_impl)),
+        tensor(&this->tensor_impl), // the member, not the same-named parameter
+        sizes(std::move(sizes)),
+        dim_order(std::move(dim_order)),
+        strides(std::move(strides)),
+        deleter(std::move(deleter)) {}
+  // Gives the deleter, if one was supplied, the tensor's data pointer.
+  ~DeviceStorage() {
+    if (deleter) {
+      deleter(tensor_impl.mutable_data());
+    }
+  }
+};
+#endif // USE_ATEN_LIB
+
+TensorPtr make_tensor_ptr_with_device( // file-local: wraps `data` for a device
+    std::vector<executorch::aten::SizesType> sizes,
+    void* data,
+    executorch::aten::ScalarType type,
+    runtime::etensor::DeviceType device_type,
+    runtime::etensor::DeviceIndex device_index,
+    std::function<void(void*)> deleter) {
+  const auto dim = sizes.size();
+  std::vector<executorch::aten::DimOrderType> dim_order(dim);
+  std::iota(dim_order.begin(), dim_order.end(), 0); // 0, 1, ..., dim - 1
+  // Strides are derived from sizes and that dim order; 0-dim tensors skip this.
+  std::vector<executorch::aten::StridesType> strides(dim);
+  if (dim > 0) {
+    auto error = runtime::dim_order_to_stride(
+        sizes.data(), dim_order.data(), dim, strides.data());
+    ET_CHECK_MSG(error == runtime::Error::Ok, "Failed to compute strides.");
+  }
+  // Non-ATen: the impl gets pointers into vectors later moved into the storage.
+#ifndef USE_ATEN_LIB
+  executorch::aten::TensorImpl tensor_impl(
+      type,
+      dim,
+      sizes.data(),
+      data,
+      dim_order.data(),
+      strides.data(),
+      dim > 0 ? executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND
+              : executorch::aten::TensorShapeDynamism::STATIC,
+      device_type,
+      device_index);
+  auto storage = std::make_shared<DeviceStorage>(
+      std::move(tensor_impl),
+      std::move(sizes),
+      std::move(dim_order),
+      std::move(strides),
+      std::move(deleter));
+  const auto tensor_ptr = &storage->tensor; // aliasing ptr shares storage's life
+  return std::shared_ptr<executorch::aten::Tensor>(
+      std::move(storage), tensor_ptr);
+#else
+  (void)device_type; // unused: the ATen build falls back to make_tensor_ptr
+  (void)device_index; // unused for the same reason
+  return make_tensor_ptr(
+      std::move(sizes),
+      data,
+      type,
+      executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND,
+      std::move(deleter));
+#endif // USE_ATEN_LIB
+}
+
+} // namespace
+
+TensorPtr clone_tensor_ptr_to_device(
+    const TensorPtr& cpu_tensor,
+    runtime::etensor::DeviceType device_type,
+    runtime::etensor::DeviceIndex device_index) {
+  // Validate every input before anything is allocated on the device.
+  ET_CHECK_MSG(cpu_tensor != nullptr, "Source tensor pointer is null.");
+  ET_CHECK_MSG(
+      device_type != runtime::etensor::DeviceType::CPU,
+      "Target device must not be CPU; use clone_tensor_ptr for CPU-to-CPU copies.");
+  auto* allocator = runtime::get_device_allocator(device_type);
+  ET_CHECK_MSG(
+      allocator != nullptr,
+      "No device allocator registered for device type %d",
+      static_cast<int>(device_type));
+  const auto nbytes = cpu_tensor->nbytes();
+  const auto* cpu_data = cpu_tensor->const_data_ptr();
+  ET_CHECK_MSG(cpu_data != nullptr, "Source tensor has no data.");
+
+  auto result = allocator->allocate(nbytes, device_index);
+  ET_CHECK_MSG(result.ok(), "Failed to allocate device memory.");
+  void* device_data = result.get();
+  auto err = allocator->copy_host_to_device(
+      device_data, cpu_data, nbytes, device_index);
+  ET_CHECK_MSG(err == runtime::Error::Ok, "Host-to-device copy failed.");
+
+  // NOTE(review): only sizes and dtype are forwarded, so the clone is built
+  // with the default dim order -- confirm sources never use a custom layout.
+  std::vector<executorch::aten::SizesType> sizes(
+      cpu_tensor->sizes().begin(), cpu_tensor->sizes().end());
+  return make_tensor_ptr_with_device(
+      std::move(sizes),
+      device_data,
+      cpu_tensor->scalar_type(),
+      device_type,
+      device_index,
+      [allocator, device_index](void* ptr) {
+        allocator->deallocate(ptr, device_index);
+      });
+}
+
+TensorPtr clone_tensor_ptr_to_cpu(const TensorPtr& device_tensor) {
+  ET_CHECK_MSG(device_tensor != nullptr, "Source tensor pointer is null.");
+  const auto nbytes = device_tensor->nbytes();
+  const auto* device_data = device_tensor->const_data_ptr();
+  ET_CHECK_MSG(device_data != nullptr, "Source device tensor has no data.");
+  // Determine which device, and therefore which allocator, holds the bytes.
+#ifndef USE_ATEN_LIB
+  const auto device_type = device_tensor->unsafeGetTensorImpl()->device_type();
+  const auto device_index =
+      device_tensor->unsafeGetTensorImpl()->device_index();
+#else
+  const auto& aten_device = device_tensor->device();
+  ET_CHECK_MSG(!aten_device.is_cpu(), "Source tensor is already on CPU.");
+  // NOTE(review): only CUDA is mapped; any other ATen device stays CPU here
+  // and is then rejected by the "already on CPU" check below.
+  auto device_type = runtime::etensor::DeviceType::CPU;
+  if (aten_device.is_cuda()) {
+    device_type = runtime::etensor::DeviceType::CUDA;
+  }
+  const auto device_index =
+      static_cast<runtime::etensor::DeviceIndex>(aten_device.index());
+#endif
+  ET_CHECK_MSG(
+      device_type != runtime::etensor::DeviceType::CPU,
+      "Source tensor is already on CPU.");
+  auto* allocator = runtime::get_device_allocator(device_type);
+  ET_CHECK_MSG(
+      allocator != nullptr,
+      "No device allocator registered for device type %d",
+      static_cast<int>(device_type));
+
+  // Stage the bytes in a host vector that the returned tensor takes over.
+  std::vector<uint8_t> cpu_data(nbytes);
+  auto err = allocator->copy_device_to_host(
+      cpu_data.data(), device_data, nbytes, device_index);
+  ET_CHECK_MSG(err == runtime::Error::Ok, "Device-to-host copy failed.");
+
+  std::vector<executorch::aten::SizesType> sizes(
+      device_tensor->sizes().begin(), device_tensor->sizes().end());
+  return make_tensor_ptr(
+      std::move(sizes),
+      std::move(cpu_data),
+      {},
+      {},
+      device_tensor->scalar_type());
+}
+
 } // namespace extension
 } // namespace executorch
diff --git a/extension/tensor/tensor_ptr.h b/extension/tensor/tensor_ptr.h
index 47124bdeca6..22f7fe7d8c1 100644
--- a/extension/tensor/tensor_ptr.h
+++ b/extension/tensor/tensor_ptr.h
@@ -17,6 +17,7 @@
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/core/exec_aten/exec_aten.h>
 #include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
+#include <executorch/runtime/core/portable_type/device.h>
 
 C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-enum")
 
@@ -40,6 +41,8 @@ using TensorPtr = std::shared_ptr<executorch::aten::Tensor>;
  * @param deleter A custom deleter function for managing the lifetime of the
  * data buffer. If provided, this deleter will be called when the managed Tensor
  * object is destroyed.
+ * @param device_type The target device type (default CPU, meaning no copy).
+ * @param device_index The target device index (default 0).
  * @return A TensorPtr that manages the newly created Tensor.
  */
 TensorPtr make_tensor_ptr(
@@ -51,7 +54,10 @@ TensorPtr make_tensor_ptr(
         executorch::aten::ScalarType::Float,
     const executorch::aten::TensorShapeDynamism dynamism =
         executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND,
-    std::function<void(void*)> deleter = nullptr);
+    std::function<void(void*)> deleter = nullptr,
+    runtime::etensor::DeviceType device_type =
+        runtime::etensor::DeviceType::CPU,
+    runtime::etensor::DeviceIndex device_index = 0);
 
 /**
  * Creates a TensorPtr that manages a Tensor with the specified properties.
@@ -63,6 +69,8 @@ TensorPtr make_tensor_ptr(
  * @param deleter A custom deleter function for managing the lifetime of the
  * data buffer. If provided, this deleter will be called when the managed Tensor
  * object is destroyed.
+ * @param device_type The target device type (default CPU, meaning no copy).
+ * @param device_index The target device index (default 0).
  * @return A TensorPtr that manages the newly created Tensor.
  */
 inline TensorPtr make_tensor_ptr(
@@ -72,9 +80,20 @@ inline TensorPtr make_tensor_ptr(
         executorch::aten::ScalarType::Float,
     const executorch::aten::TensorShapeDynamism dynamism =
         executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND,
-    std::function<void(void*)> deleter = nullptr) {
+    std::function<void(void*)> deleter = nullptr,
+    runtime::etensor::DeviceType device_type =
+        runtime::etensor::DeviceType::CPU,
+    runtime::etensor::DeviceIndex device_index = 0) {
   return make_tensor_ptr(
-      std::move(sizes), data, {}, {}, type, dynamism, std::move(deleter));
+      std::move(sizes),
+      data,
+      {},
+      {},
+      type,
+      dynamism,
+      std::move(deleter),
+      device_type,
+      device_index);
 }
 
 /**
@@ -95,6 +114,8 @@ inline TensorPtr make_tensor_ptr(
  * @param type The scalar type of the tensor elements. If it differs from the
  * deduced type, the data will be cast to this type if allowed.
  * @param dynamism Specifies the mutability of the tensor's shape.
+ * @param device_type The target device type (default CPU, meaning no copy).
+ * @param device_index The target device index (default 0).
  * @return A TensorPtr that manages the newly created TensorImpl.
  */
 template <
@@ -108,7 +129,10 @@ inline TensorPtr make_tensor_ptr(
     std::vector<executorch::aten::StridesType> strides = {},
     executorch::aten::ScalarType type = deduced_type,
     executorch::aten::TensorShapeDynamism dynamism =
-        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND,
+    runtime::etensor::DeviceType device_type =
+        runtime::etensor::DeviceType::CPU,
+    runtime::etensor::DeviceIndex device_index = 0) {
   ET_CHECK_MSG(
       data.size() ==
           executorch::aten::compute_numel(sizes.data(), sizes.size()),
@@ -144,7 +168,9 @@ inline TensorPtr make_tensor_ptr(
         std::move(strides),
         type,
         dynamism,
-        [data_ptr = std::move(data_ptr)](void*) {});
+        [data_ptr = std::move(data_ptr)](void*) {},
+        device_type,
+        device_index);
   }
   const auto raw_data_ptr = data.data();
   auto data_ptr = std::make_shared<std::vector<T>>(std::move(data));
@@ -155,7 +181,9 @@ inline TensorPtr make_tensor_ptr(
       std::move(strides),
       type,
       dynamism,
-      [data_ptr = std::move(data_ptr)](void*) {});
+      [data_ptr = std::move(data_ptr)](void*) {},
+      device_type,
+      device_index);
 }
 
 /**
@@ -173,6 +201,8 @@ inline TensorPtr make_tensor_ptr(
  * @param type The scalar type of the tensor elements. If it differs from the
  * deduced type, the data will be cast to this type if allowed.
  * @param dynamism Specifies the mutability of the tensor's shape.
+ * @param device_type The target device type (default CPU, meaning no copy).
+ * @param device_index The target device index (default 0).
  * @return A TensorPtr that manages the newly created TensorImpl.
  */
 template <
@@ -183,11 +213,21 @@ inline TensorPtr make_tensor_ptr(
     std::vector<T> data,
     executorch::aten::ScalarType type = deduced_type,
     executorch::aten::TensorShapeDynamism dynamism =
-        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND,
+    runtime::etensor::DeviceType device_type =
+        runtime::etensor::DeviceType::CPU,
+    runtime::etensor::DeviceIndex device_index = 0) {
   std::vector<executorch::aten::SizesType> sizes{
       executorch::aten::SizesType(data.size())};
   return make_tensor_ptr(
-      std::move(sizes), std::move(data), {0}, {1}, type, dynamism);
+      std::move(sizes),
+      std::move(data),
+      {0},
+      {1},
+      type,
+      dynamism,
+      device_type,
+      device_index);
 }
 
 /**
@@ -210,6 +250,8 @@ inline TensorPtr make_tensor_ptr(
  * @param type The scalar type of the tensor elements. If it differs from the
  * deduced type, the data will be cast to this type if allowed.
  * @param dynamism Specifies the mutability of the tensor's shape.
+ * @param device_type The target device type (default CPU, meaning no copy).
+ * @param device_index The target device index (default 0).
  * @return A TensorPtr that manages the newly created TensorImpl.
  */
 template <
@@ -223,14 +265,19 @@ inline TensorPtr make_tensor_ptr(
     std::vector<executorch::aten::StridesType> strides = {},
     executorch::aten::ScalarType type = deduced_type,
     executorch::aten::TensorShapeDynamism dynamism =
-        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND,
+    runtime::etensor::DeviceType device_type =
+        runtime::etensor::DeviceType::CPU,
+    runtime::etensor::DeviceIndex device_index = 0) {
   return make_tensor_ptr(
       std::move(sizes),
       std::vector<T>(std::move(list)),
       std::move(dim_order),
       std::move(strides),
       type,
-      dynamism);
+      dynamism,
+      device_type,
+      device_index);
 }
 
 /**
@@ -250,6 +297,8 @@ inline TensorPtr make_tensor_ptr(
  * @param type The scalar type of the tensor elements. If it differs from the
  * deduced type, the data will be cast to this type if allowed.
  * @param dynamism Specifies the mutability of the tensor's shape.
+ * @param device_type The target device type (default CPU, meaning no copy).
+ * @param device_index The target device index (default 0).
  * @return A TensorPtr that manages the newly created TensorImpl.
  */
 template <
@@ -260,11 +309,21 @@ inline TensorPtr make_tensor_ptr(
     std::initializer_list<T> list,
     executorch::aten::ScalarType type = deduced_type,
     executorch::aten::TensorShapeDynamism dynamism =
-        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND,
+    runtime::etensor::DeviceType device_type =
+        runtime::etensor::DeviceType::CPU,
+    runtime::etensor::DeviceIndex device_index = 0) {
   std::vector<executorch::aten::SizesType> sizes{
       executorch::aten::SizesType(list.size())};
   return make_tensor_ptr(
-      std::move(sizes), std::move(list), {0}, {1}, type, dynamism);
+      std::move(sizes),
+      std::move(list),
+      {0},
+      {1},
+      type,
+      dynamism,
+      device_type,
+      device_index);
 }
 
 /**
@@ -293,6 +352,8 @@ inline TensorPtr make_tensor_ptr(T value) {
  * @param strides A vector specifying the strides of each dimension.
  * @param type The scalar type of the tensor elements.
  * @param dynamism Specifies the mutability of the tensor's shape.
+ * @param device_type The target device type (default CPU, meaning no copy).
+ * @param device_index The target device index (default 0).
  * @return A TensorPtr managing the newly created Tensor.
  */
 TensorPtr make_tensor_ptr(
@@ -302,7 +363,10 @@ TensorPtr make_tensor_ptr(
     std::vector<executorch::aten::StridesType> strides,
     executorch::aten::ScalarType type = executorch::aten::ScalarType::Float,
     executorch::aten::TensorShapeDynamism dynamism =
-        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND);
+        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND,
+    runtime::etensor::DeviceType device_type =
+        runtime::etensor::DeviceType::CPU,
+    runtime::etensor::DeviceIndex device_index = 0);
 
 /**
  * Creates a TensorPtr that manages a Tensor with the specified properties.
@@ -315,6 +379,8 @@ TensorPtr make_tensor_ptr(
  * @param data A vector containing the raw memory for the tensor's data.
  * @param type The scalar type of the tensor elements.
  * @param dynamism Specifies the mutability of the tensor's shape.
+ * @param device_type The target device type (default CPU, meaning no copy).
+ * @param device_index The target device index (default 0).
  * @return A TensorPtr managing the newly created Tensor.
  */
 inline TensorPtr make_tensor_ptr(
@@ -322,9 +388,19 @@ inline TensorPtr make_tensor_ptr(
     std::vector<uint8_t> data,
     executorch::aten::ScalarType type = executorch::aten::ScalarType::Float,
     executorch::aten::TensorShapeDynamism dynamism =
-        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND,
+    runtime::etensor::DeviceType device_type =
+        runtime::etensor::DeviceType::CPU,
+    runtime::etensor::DeviceIndex device_index = 0) {
   return make_tensor_ptr(
-      std::move(sizes), std::move(data), {}, {}, type, dynamism);
+      std::move(sizes),
+      std::move(data),
+      {},
+      {},
+      type,
+      dynamism,
+      device_type,
+      device_index);
 }
 
 /**
@@ -388,27 +464,59 @@ inline TensorPtr make_tensor_ptr(
       std::move(deleter));
 }
 
+/**
+ * Clones a CPU TensorPtr to a device TensorPtr.
+ *
+ * Allocates memory on the specified device and copies the tensor data from
+ * host to device using the DeviceAllocator registered for the given device
+ * type. The returned TensorPtr owns the device memory and will free it via
+ * the allocator when destroyed.
+ *
+ * Declared ahead of the make_tensor_ptr overload below because it calls this.
+ *
+ * @param cpu_tensor The source CPU tensor whose data will be copied.
+ * @param device_type The target device type (e.g., DeviceType::CUDA).
+ * @param device_index The target device index (default 0).
+ * @return A TensorPtr backed by device memory containing the copied data.
+ */
+TensorPtr clone_tensor_ptr_to_device(
+    const TensorPtr& cpu_tensor,
+    runtime::etensor::DeviceType device_type,
+    runtime::etensor::DeviceIndex device_index = 0);
+
 /**
  * Convenience overload identical to make_tensor_ptr(*tensor_ptr, ...).
  * Keeps the original TensorPtr alive until the returned TensorPtr is destroyed.
+ * When device_type is not CPU, the tensor data is additionally copied to the
+ * specified device.
  *
  * @param tensor_ptr The source tensor pointer to alias.
  * @param sizes Optional sizes override.
  * @param dim_order Optional dimension order override.
  * @param strides Optional strides override.
- * @return A TensorPtr aliasing the same storage with requested metadata.
+ * @param device_type The target device type (default CPU, meaning no copy).
+ * @param device_index The target device index (default 0).
+ * @return A TensorPtr aliasing the same storage with requested metadata, or a
+ * device TensorPtr if device_type is not CPU.
  */
 inline TensorPtr make_tensor_ptr(
     const TensorPtr& tensor_ptr,
     std::vector<executorch::aten::SizesType> sizes = {},
     std::vector<executorch::aten::DimOrderType> dim_order = {},
-    std::vector<executorch::aten::StridesType> strides = {}) {
-  return make_tensor_ptr(
+    std::vector<executorch::aten::StridesType> strides = {},
+    runtime::etensor::DeviceType device_type =
+        runtime::etensor::DeviceType::CPU,
+    runtime::etensor::DeviceIndex device_index = 0) {
+  auto result = make_tensor_ptr( // CPU alias; its deleter pins tensor_ptr
       *tensor_ptr,
       std::move(sizes),
       std::move(dim_order),
       std::move(strides),
       [tensor_ptr](void*) {});
+  if (device_type != runtime::etensor::DeviceType::CPU) {
+    return clone_tensor_ptr_to_device(result, device_type, device_index);
+  }
+  return result; // CPU requested: return the alias, no copy is made
 }
 
 /**
@@ -479,6 +587,18 @@ runtime::Error resize_tensor_ptr(
     TensorPtr& tensor,
     const std::vector<executorch::aten::SizesType>& sizes);
 
+/**
+ * Clones a device TensorPtr to a CPU TensorPtr.
+ *
+ * Allocates host memory and copies the tensor data from device to host using
+ * the DeviceAllocator registered for the source tensor's device type. The
+ * device type is determined from the source tensor's metadata.
+ *
+ * @param device_tensor The source device tensor whose data will be copied.
+ * @return A TensorPtr backed by CPU memory containing the copied data.
+ */
+TensorPtr clone_tensor_ptr_to_cpu(const TensorPtr& device_tensor);
+
 } // namespace extension
 } // namespace executorch
 
diff --git a/extension/tensor/test/targets.bzl b/extension/tensor/test/targets.bzl
index 5bf8c7019b8..807e16ec8c1 100644
--- a/extension/tensor/test/targets.bzl
+++ b/extension/tensor/test/targets.bzl
@@ -21,3 +21,14 @@ def define_common_targets():
                 "//executorch/extension/tensor:tensor" + aten_suffix,
             ],
         )
+
+        runtime.cxx_test(
+            name = "tensor_ptr_device_test" + aten_suffix,
+            srcs = [
+                "tensor_ptr_device_test.cpp",
+            ],
+            deps = [
+                "//executorch/extension/tensor:tensor" + aten_suffix,
+                "//executorch/runtime/core:device_allocator",
+            ],
+        )
diff --git a/extension/tensor/test/tensor_ptr_device_test.cpp b/extension/tensor/test/tensor_ptr_device_test.cpp
new file mode 100644
index 00000000000..41a002b9d2b
--- /dev/null
+++ b/extension/tensor/test/tensor_ptr_device_test.cpp
@@ -0,0 +1,447 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/extension/tensor/tensor_ptr.h>
+
+#include <gtest/gtest.h>
+
+#include <cstdlib>
+#include <cstring>
+
+#include <executorch/extension/tensor/tensor_ptr_maker.h>
+#include <executorch/runtime/core/device_allocator.h>
+#include <executorch/runtime/platform/runtime.h>
+#include <executorch/test/utils/DeathTest.h>
+
+using namespace ::executorch::extension;
+using namespace ::executorch::runtime;
+using executorch::runtime::etensor::DeviceIndex;
+using executorch::runtime::etensor::DeviceType;
+
+namespace {
+
+// A fake device allocator that uses host memory (malloc/free/memcpy) to
+// simulate device memory operations, enabling end-to-end data roundtrip
+// verification without requiring actual device hardware.
+class FakeDeviceAllocator : public DeviceAllocator {
+ public:
+  explicit FakeDeviceAllocator(DeviceType type) : type_(type) {}
+  // Hands out malloc'd host memory; only successful calls are counted.
+  Result<void*> allocate(size_t nbytes, DeviceIndex /*index*/) override {
+    void* block = std::malloc(nbytes);
+    if (block == nullptr) {
+      return Error::MemoryAllocationFailed;
+    }
+    ++allocate_count_;
+    return block;
+  }
+  // Releases memory with free() and counts the call.
+  void deallocate(void* ptr, DeviceIndex /*index*/) override {
+    std::free(ptr);
+    ++deallocate_count_;
+  }
+  // "Uploads" with a plain memcpy, because the fake device memory is host memory.
+  Error copy_host_to_device(
+      void* device_dst,
+      const void* host_src,
+      size_t nbytes,
+      DeviceIndex /*index*/) override {
+    std::memcpy(device_dst, host_src, nbytes);
+    ++h2d_count_;
+    return Error::Ok;
+  }
+  // "Downloads" with a plain memcpy, mirroring copy_host_to_device.
+  Error copy_device_to_host(
+      void* host_dst,
+      const void* device_src,
+      size_t nbytes,
+      DeviceIndex /*index*/) override {
+    std::memcpy(host_dst, device_src, nbytes);
+    ++d2h_count_;
+    return Error::Ok;
+  }
+  // Reports the device type this fake was constructed with.
+  DeviceType device_type() const override {
+    return type_;
+  }
+  // Call counters; public so the tests can reset and inspect them.
+  int allocate_count_{0};
+  int deallocate_count_{0};
+  int h2d_count_{0};
+  int d2h_count_{0};
+
+ private:
+  DeviceType type_;
+};
+
+FakeDeviceAllocator g_fake_cuda_allocator(DeviceType::CUDA);
+
+struct RegisterFakeAllocator { // registers the fake for CUDA when constructed
+  RegisterFakeAllocator() {
+    register_device_allocator(DeviceType::CUDA, &g_fake_cuda_allocator);
+  }
+};
+static RegisterFakeAllocator s_register; // constructed during static init
+
+} // namespace
+
+class TensorPtrDeviceTest : public ::testing::Test {
+ protected:
+  // Initializes the runtime once for the whole suite.
+  static void SetUpTestSuite() {
+    runtime_init();
+  }
+  // Zeroes the fake allocator's call counters before every test.
+  void SetUp() override {
+    auto& fake = g_fake_cuda_allocator;
+    fake.allocate_count_ = fake.deallocate_count_ = 0;
+    fake.h2d_count_ = fake.d2h_count_ = 0;
+  }
+};
+
+TEST_F(TensorPtrDeviceTest, CpuToDeviceTensor) {
+  const auto host =
+      make_tensor_ptr({2, 3}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f});
+  const auto on_device = clone_tensor_ptr_to_device(host, DeviceType::CUDA);
+  // The clone reports the source's shape and dtype but has its own buffer.
+  EXPECT_EQ(on_device->dim(), 2);
+  EXPECT_EQ(on_device->size(0), 2);
+  EXPECT_EQ(on_device->size(1), 3);
+  EXPECT_EQ(on_device->scalar_type(), executorch::aten::ScalarType::Float);
+  EXPECT_NE(on_device->const_data_ptr(), nullptr);
+  EXPECT_NE(on_device->const_data_ptr(), host->const_data_ptr());
+  // Device metadata is only checked in the portable (non-ATen) build.
+#ifndef USE_ATEN_LIB
+  auto* impl = on_device->unsafeGetTensorImpl();
+  EXPECT_EQ(impl->device_type(), DeviceType::CUDA);
+  EXPECT_EQ(impl->device_index(), 0);
+#endif
+  // Exactly one allocation and one host-to-device copy took place.
+  EXPECT_EQ(g_fake_cuda_allocator.allocate_count_, 1);
+  EXPECT_EQ(g_fake_cuda_allocator.h2d_count_, 1);
+}
+
+TEST_F(TensorPtrDeviceTest, CpuToDeviceFromRawData) {
+  float raw[] = {10.0f, 20.0f, 30.0f, 40.0f};
+  const auto host = make_tensor_ptr({2, 2}, raw);
+  // Re-wrapping the CPU tensor with a CUDA target produces a device copy.
+  const auto on_device = make_tensor_ptr(host, {}, {}, {}, DeviceType::CUDA);
+
+  EXPECT_EQ(on_device->dim(), 2);
+  EXPECT_EQ(on_device->size(0), 2);
+  EXPECT_EQ(on_device->size(1), 2);
+  EXPECT_EQ(on_device->scalar_type(), executorch::aten::ScalarType::Float);
+  EXPECT_NE(on_device->const_data_ptr(), nullptr);
+  EXPECT_NE(on_device->const_data_ptr(), static_cast<void*>(raw));
+  // Device metadata is only checked in the portable (non-ATen) build.
+#ifndef USE_ATEN_LIB
+  auto* impl = on_device->unsafeGetTensorImpl();
+  EXPECT_EQ(impl->device_type(), DeviceType::CUDA);
+#endif
+
+  EXPECT_EQ(g_fake_cuda_allocator.allocate_count_, 1);
+  EXPECT_EQ(g_fake_cuda_allocator.h2d_count_, 1);
+}
+
+#ifndef USE_ATEN_LIB
+// clone_tensor_ptr_to_cpu relies on TensorImpl device metadata which is only
+// available in the non-ATen (ExecuTorch portable) path.
+TEST_F(TensorPtrDeviceTest, DeviceToCpuTensor) {
+  const auto source =
+      make_tensor_ptr({2, 3}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f});
+  const auto on_device = clone_tensor_ptr_to_device(source, DeviceType::CUDA);
+  const auto back_on_cpu = clone_tensor_ptr_to_cpu(on_device);
+  // Shape and dtype survive the trip back to the host.
+  EXPECT_EQ(back_on_cpu->dim(), 2);
+  EXPECT_EQ(back_on_cpu->size(0), 2);
+  EXPECT_EQ(back_on_cpu->size(1), 3);
+  EXPECT_EQ(back_on_cpu->scalar_type(), executorch::aten::ScalarType::Float);
+  // Every element matches the tensor the device copy was made from.
+  const float* actual = back_on_cpu->const_data_ptr<float>();
+  const float* expected = source->const_data_ptr<float>();
+  for (int idx = 0; idx != 6; ++idx) {
+    EXPECT_FLOAT_EQ(actual[idx], expected[idx]);
+  }
+  // The copy back used exactly one device-to-host transfer.
+  EXPECT_EQ(g_fake_cuda_allocator.d2h_count_, 1);
+}
+#endif
+
+#ifndef USE_ATEN_LIB
+TEST_F(TensorPtrDeviceTest, RoundtripCpuDeviceCpu) {
+  const std::vector<float> original = {1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f};
+  const auto host = make_tensor_ptr({2, 3}, original);
+  const auto on_device = clone_tensor_ptr_to_device(host, DeviceType::CUDA);
+  const auto roundtrip = clone_tensor_ptr_to_cpu(on_device);
+
+  // Each hop produces a distinct buffer.
+  const void* roundtrip_bytes = roundtrip->const_data_ptr();
+  EXPECT_NE(roundtrip_bytes, host->const_data_ptr());
+  EXPECT_NE(roundtrip_bytes, on_device->const_data_ptr());
+  // Element values survive the round trip.
+  const float* values = roundtrip->const_data_ptr<float>();
+  for (size_t idx = 0; idx != original.size(); ++idx) {
+    EXPECT_FLOAT_EQ(values[idx], original[idx]);
+  }
+  // So do rank, extents and dtype.
+  EXPECT_EQ(roundtrip->dim(), host->dim());
+  EXPECT_EQ(roundtrip->size(0), host->size(0));
+  EXPECT_EQ(roundtrip->size(1), host->size(1));
+  EXPECT_EQ(roundtrip->scalar_type(), host->scalar_type());
+}
+#endif
+
+#ifndef USE_ATEN_LIB
+TEST_F(TensorPtrDeviceTest, RoundtripInt32) {
+  const std::vector<int32_t> expected = {10, 20, 30, 40};
+  const auto host = make_tensor_ptr({4}, std::vector<int32_t>{10, 20, 30, 40});
+  const auto on_device = clone_tensor_ptr_to_device(host, DeviceType::CUDA);
+  const auto roundtrip = clone_tensor_ptr_to_cpu(on_device);
+
+  // The integer dtype and every element come back unchanged.
+  EXPECT_EQ(roundtrip->scalar_type(), executorch::aten::ScalarType::Int);
+  const int32_t* values = roundtrip->const_data_ptr<int32_t>();
+  for (size_t idx = 0; idx != expected.size(); ++idx) {
+    EXPECT_EQ(values[idx], expected[idx]);
+  }
+}
+#endif
+
+#ifndef USE_ATEN_LIB
+TEST_F(TensorPtrDeviceTest, DeviceIndexPropagation) {
+  const auto host = make_tensor_ptr({2}, {1.0f, 2.0f});
+  const auto on_device = clone_tensor_ptr_to_device(
+      host, DeviceType::CUDA, /*device_index=*/1);
+  // The requested index is recorded on the device tensor's TensorImpl.
+  EXPECT_EQ(on_device->unsafeGetTensorImpl()->device_index(), 1);
+  const auto roundtrip = clone_tensor_ptr_to_cpu(on_device);
+  const float* values = roundtrip->const_data_ptr<float>();
+  EXPECT_FLOAT_EQ(values[0], 1.0f);
+  EXPECT_FLOAT_EQ(values[1], 2.0f);
+}
+#endif
+
+TEST_F(TensorPtrDeviceTest, DeviceMemoryCleanup) {
+  const auto host = make_tensor_ptr({2}, {1.0f, 2.0f});
+  auto on_device = clone_tensor_ptr_to_device(host, DeviceType::CUDA);
+  // While the device tensor is alive its buffer has not been released.
+  EXPECT_EQ(g_fake_cuda_allocator.allocate_count_, 1);
+  EXPECT_EQ(g_fake_cuda_allocator.deallocate_count_, 0);
+  // Dropping the last owner returns the buffer to the allocator.
+  on_device.reset();
+  EXPECT_EQ(g_fake_cuda_allocator.deallocate_count_, 1);
+}
+
+#ifndef USE_ATEN_LIB
+TEST_F(TensorPtrDeviceTest, ScalarTensorRoundtrip) {
+  // A zero-dimensional tensor keeps rank 0 and one element on both sides.
+  auto host_src = make_tensor_ptr({}, {42.0f});
+  auto on_device = clone_tensor_ptr_to_device(host_src, DeviceType::CUDA);
+  EXPECT_EQ(on_device->dim(), 0);
+  EXPECT_EQ(on_device->numel(), 1);
+
+  auto host_copy = clone_tensor_ptr_to_cpu(on_device);
+  EXPECT_EQ(host_copy->dim(), 0);
+  EXPECT_EQ(host_copy->numel(), 1);
+  EXPECT_FLOAT_EQ(*host_copy->const_data_ptr<float>(), 42.0f);
+}
+#endif
+
+#ifndef USE_ATEN_LIB
+TEST_F(TensorPtrDeviceTest, RawDataRoundtrip) {
+  // A tensor built from a stack array is placed on CUDA through the
+  // TensorPtr overload of make_tensor_ptr and must read back intact.
+  float host_buffer[] = {100.0f, 200.0f, 300.0f};
+  auto host_view = make_tensor_ptr({3}, host_buffer);
+  auto on_device = make_tensor_ptr(host_view, {}, {}, {}, DeviceType::CUDA);
+  auto host_copy = clone_tensor_ptr_to_cpu(on_device);
+  EXPECT_EQ(host_copy->dim(), 1);
+  EXPECT_EQ(host_copy->size(0), 3);
+  const float* values = host_copy->const_data_ptr<float>();
+  EXPECT_FLOAT_EQ(values[0], 100.0f);
+  EXPECT_FLOAT_EQ(values[1], 200.0f);
+  EXPECT_FLOAT_EQ(values[2], 300.0f);
+}
+#endif
+
+TEST_F(TensorPtrDeviceTest, ErrorCpuTargetDevice) {  // CPU target must die
+  auto host_src = make_tensor_ptr({2}, {1.0f, 2.0f});
+  ET_EXPECT_DEATH(clone_tensor_ptr_to_device(host_src, DeviceType::CPU), "");
+}
+
+TEST_F(TensorPtrDeviceTest, ErrorNullRawData) {
+  // Placing a tensor built from nullptr on CUDA via make_tensor_ptr must die.
+  auto dataless = make_tensor_ptr({2, 2}, nullptr);
+  ET_EXPECT_DEATH(make_tensor_ptr(dataless, {}, {}, {}, DeviceType::CUDA), "");
+}
+
+TEST_F(TensorPtrDeviceTest, ErrorNullCpuTensorData) {
+  // Cloning a tensor built from nullptr to CUDA must die.
+  auto dataless = make_tensor_ptr({2, 2}, nullptr);
+  ET_EXPECT_DEATH(clone_tensor_ptr_to_device(dataless, DeviceType::CUDA), "");
+}
+
+#ifndef USE_ATEN_LIB
+TEST_F(TensorPtrDeviceTest, ErrorCpuTensorToCpu) {  // source already on CPU
+  auto host_src = make_tensor_ptr({2}, {1.0f, 2.0f});
+  ET_EXPECT_DEATH(clone_tensor_ptr_to_cpu(host_src), "");
+}
+#endif
+
+#ifndef USE_ATEN_LIB
+TEST_F(TensorPtrDeviceTest, MakeTensorPtrVectorToDevice) {
+  // Building straight from a vector with a CUDA device argument should bump
+  // the fake allocator's allocate and h2d counters once each.
+  auto on_device = make_tensor_ptr(
+      {2, 2},
+      std::vector<float>{1.0f, 2.0f, 3.0f, 4.0f},
+      {},
+      {},
+      executorch::aten::ScalarType::Float,
+      executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND,
+      DeviceType::CUDA);
+  EXPECT_EQ(on_device->dim(), 2);
+  EXPECT_EQ(on_device->size(0), 2);
+  EXPECT_EQ(on_device->size(1), 2);
+  EXPECT_EQ(on_device->scalar_type(), executorch::aten::ScalarType::Float);
+  EXPECT_EQ(on_device->unsafeGetTensorImpl()->device_type(), DeviceType::CUDA);
+  EXPECT_EQ(g_fake_cuda_allocator.allocate_count_, 1);
+  EXPECT_EQ(g_fake_cuda_allocator.h2d_count_, 1);
+
+  auto host_copy = clone_tensor_ptr_to_cpu(on_device);
+  const float* values = host_copy->const_data_ptr<float>();
+  EXPECT_FLOAT_EQ(values[0], 1.0f);
+  EXPECT_FLOAT_EQ(values[1], 2.0f);
+  EXPECT_FLOAT_EQ(values[2], 3.0f);
+  EXPECT_FLOAT_EQ(values[3], 4.0f);
+}
+#endif
+
+#ifndef USE_ATEN_LIB
+TEST_F(TensorPtrDeviceTest, MakeTensorPtrRawPointerToDevice) {
+  // Building from a raw pointer with a CUDA device argument must yield a
+  // tensor whose data pointer differs from the host buffer.
+  float host_buffer[] = {5.0f, 6.0f, 7.0f};
+  auto on_device = make_tensor_ptr(
+      {3},
+      host_buffer,
+      executorch::aten::ScalarType::Float,
+      executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND,
+      nullptr,
+      DeviceType::CUDA);
+  EXPECT_EQ(on_device->dim(), 1);
+  EXPECT_EQ(on_device->size(0), 3);
+  EXPECT_EQ(on_device->unsafeGetTensorImpl()->device_type(), DeviceType::CUDA);
+  EXPECT_NE(on_device->const_data_ptr(), static_cast<void*>(host_buffer));
+  EXPECT_EQ(g_fake_cuda_allocator.allocate_count_, 1);
+  EXPECT_EQ(g_fake_cuda_allocator.h2d_count_, 1);
+
+  auto host_copy = clone_tensor_ptr_to_cpu(on_device);
+  const float* values = host_copy->const_data_ptr<float>();
+  EXPECT_FLOAT_EQ(values[0], 5.0f);
+  EXPECT_FLOAT_EQ(values[1], 6.0f);
+  EXPECT_FLOAT_EQ(values[2], 7.0f);
+}
+#endif
+
+#ifndef USE_ATEN_LIB
+TEST_F(TensorPtrDeviceTest, CloneToCpuVerifiesCpuDeviceMetadata) {
+  // A tensor cloned back from CUDA must report device CPU with index 0.
+  auto host_src = make_tensor_ptr({3}, {1.0f, 2.0f, 3.0f});
+  auto on_device = clone_tensor_ptr_to_device(host_src, DeviceType::CUDA);
+  auto host_copy = clone_tensor_ptr_to_cpu(on_device);
+  EXPECT_EQ(host_copy->unsafeGetTensorImpl()->device_type(), DeviceType::CPU);
+  EXPECT_EQ(host_copy->unsafeGetTensorImpl()->device_index(), 0);
+}
+#endif
+
+TEST_F(TensorPtrDeviceTest, MultipleClonesFromSameSource) {
+  // Two clones of one source: distinct data pointers, both counters at 2.
+  auto host_src = make_tensor_ptr({3}, {1.0f, 2.0f, 3.0f});
+  auto first_clone = clone_tensor_ptr_to_device(host_src, DeviceType::CUDA);
+  auto second_clone = clone_tensor_ptr_to_device(host_src, DeviceType::CUDA);
+  EXPECT_NE(first_clone->const_data_ptr(), second_clone->const_data_ptr());
+  EXPECT_EQ(g_fake_cuda_allocator.allocate_count_, 2);
+  EXPECT_EQ(g_fake_cuda_allocator.h2d_count_, 2);
+}
+
+#ifndef USE_ATEN_LIB
+TEST_F(TensorPtrDeviceTest, HighDimensionalTensorRoundtrip) {
+  // A {2, 3, 4} tensor holding 0..23 keeps its shape and values end to end.
+  std::vector<float> ramp(2 * 3 * 4);
+  for (size_t idx = 0; idx != ramp.size(); ++idx) {
+    ramp[idx] = static_cast<float>(idx);
+  }
+  auto host_src = make_tensor_ptr({2, 3, 4}, ramp);
+  auto on_device = clone_tensor_ptr_to_device(host_src, DeviceType::CUDA);
+  EXPECT_EQ(on_device->dim(), 3);
+  EXPECT_EQ(on_device->size(0), 2);
+  EXPECT_EQ(on_device->size(1), 3);
+  EXPECT_EQ(on_device->size(2), 4);
+
+  auto host_copy = clone_tensor_ptr_to_cpu(on_device);
+  const float* values = host_copy->const_data_ptr<float>();
+  for (size_t idx = 0; idx != ramp.size(); ++idx) {
+    EXPECT_FLOAT_EQ(values[idx], static_cast<float>(idx));
+  }
+}
+#endif
+
+#ifndef USE_ATEN_LIB
+TEST_F(TensorPtrDeviceTest, RoundtripDouble) {
+  // Double values must survive a host -> CUDA -> host round trip unchanged.
+  auto host_src = make_tensor_ptr({3}, std::vector<double>{1.1, 2.2, 3.3});
+  auto on_device = clone_tensor_ptr_to_device(host_src, DeviceType::CUDA);
+  auto host_copy = clone_tensor_ptr_to_cpu(on_device);
+  EXPECT_EQ(host_copy->scalar_type(), executorch::aten::ScalarType::Double);
+  const double* values = host_copy->const_data_ptr<double>();
+  EXPECT_DOUBLE_EQ(values[0], 1.1);
+  EXPECT_DOUBLE_EQ(values[1], 2.2);
+  EXPECT_DOUBLE_EQ(values[2], 3.3);
+}
+#endif
+
+#ifndef USE_ATEN_LIB
+TEST_F(TensorPtrDeviceTest, RoundtripInt64) {
+  // Int64 values must survive a host -> CUDA -> host round trip unchanged.
+  auto host_src = make_tensor_ptr({3}, std::vector<int64_t>{100, 200, 300});
+  auto on_device = clone_tensor_ptr_to_device(host_src, DeviceType::CUDA);
+  auto host_copy = clone_tensor_ptr_to_cpu(on_device);
+  EXPECT_EQ(host_copy->scalar_type(), executorch::aten::ScalarType::Long);
+  const int64_t* values = host_copy->const_data_ptr<int64_t>();
+  EXPECT_EQ(values[0], 100);
+  EXPECT_EQ(values[1], 200);
+  EXPECT_EQ(values[2], 300);
+}
+#endif
+
+TEST_F(TensorPtrDeviceTest, CpuToCpuDefaultPreserved) {
+  // Requesting CPU explicitly must leave the fake CUDA counters at zero.
+  auto host_src = make_tensor_ptr({2}, {1.0f, 2.0f});
+  auto cpu_result = make_tensor_ptr(host_src, {}, {}, {}, DeviceType::CPU);
+  EXPECT_EQ(g_fake_cuda_allocator.allocate_count_, 0);
+  EXPECT_EQ(g_fake_cuda_allocator.h2d_count_, 0);
+}
+
+#ifndef USE_ATEN_LIB
+TEST_F(TensorPtrDeviceTest, LargeTensorRoundtrip) {
+  // 10000 floats must come back from CUDA equal to what was sent.
+  constexpr size_t kCount = 10000;
+  std::vector<float> samples(kCount);
+  for (size_t idx = 0; idx != kCount; ++idx) {
+    samples[idx] = static_cast<float>(idx) * 0.1f;
+  }
+  auto host_src = make_tensor_ptr({static_cast<int32_t>(kCount)}, samples);
+  auto on_device = clone_tensor_ptr_to_device(host_src, DeviceType::CUDA);
+  auto host_copy = clone_tensor_ptr_to_cpu(on_device);
+  const float* values = host_copy->const_data_ptr<float>();
+  for (size_t idx = 0; idx != kCount; ++idx) {
+    EXPECT_FLOAT_EQ(values[idx], samples[idx]);
+  }
+}
+#endif