diff --git a/extension/tensor/targets.bzl b/extension/tensor/targets.bzl index c8bf2847dcf..6a5c40f9857 100644 --- a/extension/tensor/targets.bzl +++ b/extension/tensor/targets.bzl @@ -24,6 +24,7 @@ def define_common_targets(): ], visibility = ["PUBLIC"], deps = [ + "//executorch/runtime/core:device_allocator", "//executorch/runtime/core/exec_aten/util:dim_order_util" + aten_suffix, "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix, ], diff --git a/extension/tensor/tensor_ptr.cpp b/extension/tensor/tensor_ptr.cpp index bb76311bd67..a070716037f 100644 --- a/extension/tensor/tensor_ptr.cpp +++ b/extension/tensor/tensor_ptr.cpp @@ -10,6 +10,7 @@ #include +#include #include namespace executorch { @@ -61,7 +62,9 @@ TensorPtr make_tensor_ptr( std::vector strides, executorch::aten::ScalarType type, executorch::aten::TensorShapeDynamism dynamism, - std::function deleter) { + std::function deleter, + runtime::etensor::DeviceType device_type, + runtime::etensor::DeviceIndex device_index) { const auto dim = sizes.size(); ET_CHECK_MSG( dim_order.empty() || dim_order.size() == dim, @@ -101,6 +104,7 @@ TensorPtr make_tensor_ptr( strides = std::move(computed_strides); + TensorPtr cpu_tensor; #ifndef USE_ATEN_LIB executorch::aten::TensorImpl tensor_impl( type, @@ -116,9 +120,9 @@ TensorPtr make_tensor_ptr( std::move(dim_order), std::move(strides), std::move(deleter)); - const auto tensor_ptr = &storage->tensor; - return std::shared_ptr( - std::move(storage), tensor_ptr); + const auto raw_tensor_ptr = &storage->tensor; + cpu_tensor = std::shared_ptr( + std::move(storage), raw_tensor_ptr); #else auto options = c10::TensorOptions() .dtype(c10::scalarTypeToTypeMeta(type)) @@ -136,8 +140,13 @@ TensorPtr make_tensor_ptr( c10::DispatchKeySet(c10::DispatchKey::CPU), options.dtype()); tensor_impl->set_sizes_and_strides(sizes, strides); - return std::make_shared(std::move(tensor_impl)); + cpu_tensor = + std::make_shared(std::move(tensor_impl)); #endif // USE_ATEN_LIB + if (device_type != runtime::etensor::DeviceType::CPU) { + return clone_tensor_ptr_to_device(cpu_tensor, device_type, device_index); + } + return cpu_tensor; } TensorPtr make_tensor_ptr( @@ -146,7 +155,9 @@ TensorPtr make_tensor_ptr( std::vector dim_order, std::vector strides, executorch::aten::ScalarType type, - executorch::aten::TensorShapeDynamism dynamism) { + executorch::aten::TensorShapeDynamism dynamism, + runtime::etensor::DeviceType device_type, + runtime::etensor::DeviceIndex device_index) { ET_CHECK_MSG( data.size() == executorch::aten::compute_numel(sizes.data(), sizes.size()) * @@ -161,7 +172,9 @@ TensorPtr make_tensor_ptr( type, dynamism, // Data is moved into the deleter and is destroyed together with Storage. - [data = std::move(data)](void*) {}); + [data = std::move(data)](void*) {}, + device_type, + device_index); } TensorPtr clone_tensor_ptr( @@ -248,5 +261,179 @@ runtime::Error resize_tensor_ptr( sizes.data(), sizes.size())); } +// ---- Device tensor helpers ---- + +namespace { + +#ifndef USE_ATEN_LIB +struct DeviceStorage final { + executorch::aten::TensorImpl tensor_impl; + executorch::aten::Tensor tensor; + std::vector sizes; + std::vector dim_order; + std::vector strides; + std::function deleter; + + DeviceStorage( + executorch::aten::TensorImpl&& tensor_impl, + std::vector&& sizes, + std::vector&& dim_order, + std::vector&& strides, + std::function&& deleter) + : tensor_impl(std::move(tensor_impl)), + tensor(&this->tensor_impl), + sizes(std::move(sizes)), + dim_order(std::move(dim_order)), + strides(std::move(strides)), + deleter(std::move(deleter)) {} + + ~DeviceStorage() { + if (deleter) { + deleter(tensor_impl.mutable_data()); + } + } +}; +#endif // USE_ATEN_LIB + +TensorPtr make_tensor_ptr_with_device( + std::vector sizes, + void* data, + executorch::aten::ScalarType type, + runtime::etensor::DeviceType device_type, + runtime::etensor::DeviceIndex device_index, + std::function deleter) { + const auto dim = sizes.size(); + std::vector dim_order(dim); + std::iota(dim_order.begin(), dim_order.end(), 0); + + std::vector strides(dim); + if (dim > 0) { + auto error = runtime::dim_order_to_stride( + sizes.data(), dim_order.data(), dim, strides.data()); + ET_CHECK_MSG(error == runtime::Error::Ok, "Failed to compute strides."); + } + +#ifndef USE_ATEN_LIB + executorch::aten::TensorImpl tensor_impl( + type, + dim, + sizes.data(), + data, + dim_order.data(), + strides.data(), + dim > 0 ? executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND + : executorch::aten::TensorShapeDynamism::STATIC, + device_type, + device_index); + auto storage = std::make_shared( + std::move(tensor_impl), + std::move(sizes), + std::move(dim_order), + std::move(strides), + std::move(deleter)); + const auto tensor_ptr = &storage->tensor; + return std::shared_ptr( + std::move(storage), tensor_ptr); +#else + (void)device_type; + (void)device_index; + return make_tensor_ptr( + std::move(sizes), + data, + type, + executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND, + std::move(deleter)); +#endif // USE_ATEN_LIB +} + +} // namespace + +TensorPtr clone_tensor_ptr_to_device( + const TensorPtr& cpu_tensor, + runtime::etensor::DeviceType device_type, + runtime::etensor::DeviceIndex device_index) { + ET_CHECK_MSG( + device_type != runtime::etensor::DeviceType::CPU, + "Target device must not be CPU; use clone_tensor_ptr for CPU-to-CPU copies."); + + auto* allocator = runtime::get_device_allocator(device_type); + ET_CHECK_MSG( + allocator != nullptr, + "No device allocator registered for device type %d", + static_cast(device_type)); + + const auto nbytes = cpu_tensor->nbytes(); + const auto* cpu_data = cpu_tensor->const_data_ptr(); + ET_CHECK_MSG(cpu_data != nullptr, "Source tensor has no data."); + + auto result = allocator->allocate(nbytes, device_index); + ET_CHECK_MSG(result.ok(), "Failed to allocate device memory."); + void* device_data = result.get(); + + auto err = allocator->copy_host_to_device( + device_data, cpu_data, nbytes, device_index); + ET_CHECK_MSG(err == runtime::Error::Ok, "Host-to-device copy failed."); + + std::vector sizes( + cpu_tensor->sizes().begin(), cpu_tensor->sizes().end()); + + return make_tensor_ptr_with_device( + std::move(sizes), + device_data, + cpu_tensor->scalar_type(), + device_type, + device_index, + [allocator, device_index](void* ptr) { + allocator->deallocate(ptr, device_index); + }); +} + +TensorPtr clone_tensor_ptr_to_cpu(const TensorPtr& device_tensor) { + const auto nbytes = device_tensor->nbytes(); + const auto* device_data = device_tensor->const_data_ptr(); + ET_CHECK_MSG(device_data != nullptr, "Source device tensor has no data."); + +#ifndef USE_ATEN_LIB + const auto device_type = device_tensor->unsafeGetTensorImpl()->device_type(); + const auto device_index = + device_tensor->unsafeGetTensorImpl()->device_index(); +#else + const auto& aten_device = device_tensor->device(); + ET_CHECK_MSG(!aten_device.is_cpu(), "Source tensor is already on CPU."); + auto device_type = runtime::etensor::DeviceType::CPU; + if (aten_device.is_cuda()) { + device_type = runtime::etensor::DeviceType::CUDA; + } + const auto device_index = + static_cast(aten_device.index()); +#endif + + ET_CHECK_MSG( + device_type != runtime::etensor::DeviceType::CPU, + "Source tensor is already on CPU."); + + auto* allocator = runtime::get_device_allocator(device_type); + ET_CHECK_MSG( + allocator != nullptr, + "No device allocator registered for device type %d", + static_cast(device_type)); + + std::vector cpu_data(nbytes); + + auto err = allocator->copy_device_to_host( + cpu_data.data(), device_data, nbytes, device_index); + ET_CHECK_MSG(err == runtime::Error::Ok, "Device-to-host copy failed."); + + std::vector sizes( + device_tensor->sizes().begin(), device_tensor->sizes().end()); + + return make_tensor_ptr( + std::move(sizes), + std::move(cpu_data), + {}, + {}, + device_tensor->scalar_type()); +} + } // namespace extension } // namespace executorch diff --git a/extension/tensor/tensor_ptr.h b/extension/tensor/tensor_ptr.h index 47124bdeca6..22f7fe7d8c1 100644 --- a/extension/tensor/tensor_ptr.h +++ b/extension/tensor/tensor_ptr.h @@ -17,6 +17,7 @@ #include #include #include +#include C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-enum") @@ -40,6 +41,8 @@ using TensorPtr = std::shared_ptr; * @param deleter A custom deleter function for managing the lifetime of the * data buffer. If provided, this deleter will be called when the managed Tensor * object is destroyed. + * @param device_type The target device type (default CPU, meaning no copy). + * @param device_index The target device index (default 0). * @return A TensorPtr that manages the newly created Tensor. */ TensorPtr make_tensor_ptr( @@ -51,7 +54,10 @@ TensorPtr make_tensor_ptr( executorch::aten::ScalarType::Float, const executorch::aten::TensorShapeDynamism dynamism = executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND, - std::function deleter = nullptr); + std::function deleter = nullptr, + runtime::etensor::DeviceType device_type = + runtime::etensor::DeviceType::CPU, + runtime::etensor::DeviceIndex device_index = 0); /** * Creates a TensorPtr that manages a Tensor with the specified properties. @@ -63,6 +69,8 @@ TensorPtr make_tensor_ptr( * @param deleter A custom deleter function for managing the lifetime of the * data buffer. If provided, this deleter will be called when the managed Tensor * object is destroyed. + * @param device_type The target device type (default CPU, meaning no copy). + * @param device_index The target device index (default 0). * @return A TensorPtr that manages the newly created Tensor. */ inline TensorPtr make_tensor_ptr( @@ -72,9 +80,20 @@ inline TensorPtr make_tensor_ptr( executorch::aten::ScalarType::Float, const executorch::aten::TensorShapeDynamism dynamism = executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND, - std::function deleter = nullptr) { + std::function deleter = nullptr, + runtime::etensor::DeviceType device_type = + runtime::etensor::DeviceType::CPU, + runtime::etensor::DeviceIndex device_index = 0) { return make_tensor_ptr( - std::move(sizes), data, {}, {}, type, dynamism, std::move(deleter)); + std::move(sizes), + data, + {}, + {}, + type, + dynamism, + std::move(deleter), + device_type, + device_index); } /** @@ -95,6 +114,8 @@ inline TensorPtr make_tensor_ptr( * @param type The scalar type of the tensor elements. If it differs from the * deduced type, the data will be cast to this type if allowed. * @param dynamism Specifies the mutability of the tensor's shape. + * @param device_type The target device type (default CPU, meaning no copy). + * @param device_index The target device index (default 0). * @return A TensorPtr that manages the newly created TensorImpl. */ template < @@ -108,7 +129,10 @@ inline TensorPtr make_tensor_ptr( std::vector strides = {}, executorch::aten::ScalarType type = deduced_type, executorch::aten::TensorShapeDynamism dynamism = - executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND) { + executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND, + runtime::etensor::DeviceType device_type = + runtime::etensor::DeviceType::CPU, + runtime::etensor::DeviceIndex device_index = 0) { ET_CHECK_MSG( data.size() == executorch::aten::compute_numel(sizes.data(), sizes.size()), @@ -144,7 +168,9 @@ inline TensorPtr make_tensor_ptr( std::move(strides), type, dynamism, - [data_ptr = std::move(data_ptr)](void*) {}); + [data_ptr = std::move(data_ptr)](void*) {}, + device_type, + device_index); } const auto raw_data_ptr = data.data(); auto data_ptr = std::make_shared>(std::move(data)); @@ -155,7 +181,9 @@ inline TensorPtr make_tensor_ptr( std::move(strides), type, dynamism, - [data_ptr = std::move(data_ptr)](void*) {}); + [data_ptr = std::move(data_ptr)](void*) {}, + device_type, + device_index); } /** @@ -173,6 +201,8 @@ inline TensorPtr make_tensor_ptr( * @param type The scalar type of the tensor elements. If it differs from the * deduced type, the data will be cast to this type if allowed. * @param dynamism Specifies the mutability of the tensor's shape. + * @param device_type The target device type (default CPU, meaning no copy). + * @param device_index The target device index (default 0). * @return A TensorPtr that manages the newly created TensorImpl. */ template < @@ -183,11 +213,21 @@ inline TensorPtr make_tensor_ptr( std::vector data, executorch::aten::ScalarType type = deduced_type, executorch::aten::TensorShapeDynamism dynamism = - executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND) { + executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND, + runtime::etensor::DeviceType device_type = + runtime::etensor::DeviceType::CPU, + runtime::etensor::DeviceIndex device_index = 0) { std::vector sizes{ executorch::aten::SizesType(data.size())}; return make_tensor_ptr( - std::move(sizes), std::move(data), {0}, {1}, type, dynamism); + std::move(sizes), + std::move(data), + {0}, + {1}, + type, + dynamism, + device_type, + device_index); } /** @@ -210,6 +250,8 @@ inline TensorPtr make_tensor_ptr( * @param type The scalar type of the tensor elements. If it differs from the * deduced type, the data will be cast to this type if allowed. * @param dynamism Specifies the mutability of the tensor's shape. + * @param device_type The target device type (default CPU, meaning no copy). + * @param device_index The target device index (default 0). * @return A TensorPtr that manages the newly created TensorImpl. */ template < @@ -223,14 +265,19 @@ inline TensorPtr make_tensor_ptr( std::vector strides = {}, executorch::aten::ScalarType type = deduced_type, executorch::aten::TensorShapeDynamism dynamism = - executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND) { + executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND, + runtime::etensor::DeviceType device_type = + runtime::etensor::DeviceType::CPU, + runtime::etensor::DeviceIndex device_index = 0) { return make_tensor_ptr( std::move(sizes), std::vector(std::move(list)), std::move(dim_order), std::move(strides), type, - dynamism); + dynamism, + device_type, + device_index); } /** @@ -250,6 +297,8 @@ inline TensorPtr make_tensor_ptr( * @param type The scalar type of the tensor elements. If it differs from the * deduced type, the data will be cast to this type if allowed. * @param dynamism Specifies the mutability of the tensor's shape. + * @param device_type The target device type (default CPU, meaning no copy). + * @param device_index The target device index (default 0). * @return A TensorPtr that manages the newly created TensorImpl. */ template < @@ -260,11 +309,21 @@ inline TensorPtr make_tensor_ptr( std::initializer_list list, executorch::aten::ScalarType type = deduced_type, executorch::aten::TensorShapeDynamism dynamism = - executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND) { + executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND, + runtime::etensor::DeviceType device_type = + runtime::etensor::DeviceType::CPU, + runtime::etensor::DeviceIndex device_index = 0) { std::vector sizes{ executorch::aten::SizesType(list.size())}; return make_tensor_ptr( - std::move(sizes), std::move(list), {0}, {1}, type, dynamism); + std::move(sizes), + std::move(list), + {0}, + {1}, + type, + dynamism, + device_type, + device_index); } /** @@ -293,6 +352,8 @@ inline TensorPtr make_tensor_ptr(T value) { * @param strides A vector specifying the strides of each dimension. * @param type The scalar type of the tensor elements. * @param dynamism Specifies the mutability of the tensor's shape. + * @param device_type The target device type (default CPU, meaning no copy). + * @param device_index The target device index (default 0). * @return A TensorPtr managing the newly created Tensor. */ TensorPtr make_tensor_ptr( @@ -302,7 +363,10 @@ TensorPtr make_tensor_ptr( std::vector strides, executorch::aten::ScalarType type = executorch::aten::ScalarType::Float, executorch::aten::TensorShapeDynamism dynamism = - executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND); + executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND, + runtime::etensor::DeviceType device_type = + runtime::etensor::DeviceType::CPU, + runtime::etensor::DeviceIndex device_index = 0); /** * Creates a TensorPtr that manages a Tensor with the specified properties. @@ -315,6 +379,8 @@ TensorPtr make_tensor_ptr( * @param data A vector containing the raw memory for the tensor's data. * @param type The scalar type of the tensor elements. * @param dynamism Specifies the mutability of the tensor's shape. + * @param device_type The target device type (default CPU, meaning no copy). + * @param device_index The target device index (default 0). * @return A TensorPtr managing the newly created Tensor. */ inline TensorPtr make_tensor_ptr( @@ -322,9 +388,19 @@ inline TensorPtr make_tensor_ptr( std::vector data, executorch::aten::ScalarType type = executorch::aten::ScalarType::Float, executorch::aten::TensorShapeDynamism dynamism = - executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND) { + executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND, + runtime::etensor::DeviceType device_type = + runtime::etensor::DeviceType::CPU, + runtime::etensor::DeviceIndex device_index = 0) { return make_tensor_ptr( - std::move(sizes), std::move(data), {}, {}, type, dynamism); + std::move(sizes), + std::move(data), + {}, + {}, + type, + dynamism, + device_type, + device_index); } /** @@ -388,27 +464,59 @@ inline TensorPtr make_tensor_ptr( std::move(deleter)); } +/** + * Clones a CPU TensorPtr to a device TensorPtr. + * + * Allocates memory on the specified device and copies the tensor data from + * host to device using the DeviceAllocator registered for the given device + * type. The returned TensorPtr owns the device memory and will free it via + * the allocator when destroyed. + * + * Forward declaration to support make_tensor_ptr below usage. + * + * @param cpu_tensor The source CPU tensor whose data will be copied. + * @param device_type The target device type (e.g., DeviceType::CUDA). + * @param device_index The target device index (default 0). + * @return A TensorPtr backed by device memory containing the copied data. + */ +TensorPtr clone_tensor_ptr_to_device( + const TensorPtr& cpu_tensor, + runtime::etensor::DeviceType device_type, + runtime::etensor::DeviceIndex device_index = 0); + /** * Convenience overload identical to make_tensor_ptr(*tensor_ptr, ...). * Keeps the original TensorPtr alive until the returned TensorPtr is destroyed. + * When device_type is not CPU, the tensor data is additionally copied to the + * specified device. * * @param tensor_ptr The source tensor pointer to alias. * @param sizes Optional sizes override. * @param dim_order Optional dimension order override. * @param strides Optional strides override. - * @return A TensorPtr aliasing the same storage with requested metadata. + * @param device_type The target device type (default CPU, meaning no copy). + * @param device_index The target device index (default 0). + * @return A TensorPtr aliasing the same storage with requested metadata, or a + * device TensorPtr if device_type is not CPU. */ inline TensorPtr make_tensor_ptr( const TensorPtr& tensor_ptr, std::vector sizes = {}, std::vector dim_order = {}, - std::vector strides = {}) { - return make_tensor_ptr( + std::vector strides = {}, + runtime::etensor::DeviceType device_type = + runtime::etensor::DeviceType::CPU, + runtime::etensor::DeviceIndex device_index = 0) { + auto result = make_tensor_ptr( *tensor_ptr, std::move(sizes), std::move(dim_order), std::move(strides), [tensor_ptr](void*) {}); + if (device_type != runtime::etensor::DeviceType::CPU) { + return clone_tensor_ptr_to_device(result, device_type, device_index); + } + return result; } /** @@ -479,6 +587,18 @@ runtime::Error resize_tensor_ptr( TensorPtr& tensor, const std::vector& sizes); +/** + * Clones a device TensorPtr to a CPU TensorPtr. + * + * Allocates host memory and copies the tensor data from device to host using + * the DeviceAllocator registered for the source tensor's device type. The + * device type is determined from the source tensor's metadata. + * + * @param device_tensor The source device tensor whose data will be copied. + * @return A TensorPtr backed by CPU memory containing the copied data. + */ +TensorPtr clone_tensor_ptr_to_cpu(const TensorPtr& device_tensor); + } // namespace extension } // namespace executorch diff --git a/extension/tensor/test/targets.bzl b/extension/tensor/test/targets.bzl index 5bf8c7019b8..807e16ec8c1 100644 --- a/extension/tensor/test/targets.bzl +++ b/extension/tensor/test/targets.bzl @@ -21,3 +21,14 @@ def define_common_targets(): "//executorch/extension/tensor:tensor" + aten_suffix, ], ) + + runtime.cxx_test( + name = "tensor_ptr_device_test" + aten_suffix, + srcs = [ + "tensor_ptr_device_test.cpp", + ], + deps = [ + "//executorch/extension/tensor:tensor" + aten_suffix, + "//executorch/runtime/core:device_allocator", + ], + ) diff --git a/extension/tensor/test/tensor_ptr_device_test.cpp b/extension/tensor/test/tensor_ptr_device_test.cpp new file mode 100644 index 00000000000..41a002b9d2b --- /dev/null +++ b/extension/tensor/test/tensor_ptr_device_test.cpp @@ -0,0 +1,447 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include +#include + +#include +#include +#include +#include + +using namespace ::executorch::extension; +using namespace ::executorch::runtime; +using executorch::runtime::etensor::DeviceIndex; +using executorch::runtime::etensor::DeviceType; + +namespace { + +// A fake device allocator that uses host memory (malloc/free/memcpy) to +// simulate device memory operations, enabling end-to-end data roundtrip +// verification without requiring actual device hardware. +class FakeDeviceAllocator : public DeviceAllocator { + public: + explicit FakeDeviceAllocator(DeviceType type) : type_(type) {} + + Result allocate(size_t nbytes, DeviceIndex /*index*/) override { + void* ptr = std::malloc(nbytes); + if (!ptr) { + return Error::MemoryAllocationFailed; + } + allocate_count_++; + return ptr; + } + + void deallocate(void* ptr, DeviceIndex /*index*/) override { + std::free(ptr); + deallocate_count_++; + } + + Error copy_host_to_device( + void* dst, + const void* src, + size_t nbytes, + DeviceIndex /*index*/) override { + std::memcpy(dst, src, nbytes); + h2d_count_++; + return Error::Ok; + } + + Error copy_device_to_host( + void* dst, + const void* src, + size_t nbytes, + DeviceIndex /*index*/) override { + std::memcpy(dst, src, nbytes); + d2h_count_++; + return Error::Ok; + } + + DeviceType device_type() const override { + return type_; + } + + int allocate_count_ = 0; + int deallocate_count_ = 0; + int h2d_count_ = 0; + int d2h_count_ = 0; + + private: + DeviceType type_; +}; + +FakeDeviceAllocator g_fake_cuda_allocator(DeviceType::CUDA); + +struct RegisterFakeAllocator { + RegisterFakeAllocator() { + register_device_allocator(DeviceType::CUDA, &g_fake_cuda_allocator); + } +}; +static RegisterFakeAllocator s_register; + +} // namespace + +class TensorPtrDeviceTest : public ::testing::Test { + protected: + static void SetUpTestSuite() { + runtime_init(); + } + + void SetUp() override { + g_fake_cuda_allocator.allocate_count_ = 0; + g_fake_cuda_allocator.deallocate_count_ = 0; + g_fake_cuda_allocator.h2d_count_ = 0; + g_fake_cuda_allocator.d2h_count_ = 0; + } +}; + +TEST_F(TensorPtrDeviceTest, CpuToDeviceTensor) { + auto cpu_tensor = + make_tensor_ptr({2, 3}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}); + auto device_tensor = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); + + EXPECT_EQ(device_tensor->dim(), 2); + EXPECT_EQ(device_tensor->size(0), 2); + EXPECT_EQ(device_tensor->size(1), 3); + EXPECT_EQ(device_tensor->scalar_type(), executorch::aten::ScalarType::Float); + EXPECT_NE(device_tensor->const_data_ptr(), nullptr); + EXPECT_NE(device_tensor->const_data_ptr(), cpu_tensor->const_data_ptr()); + +#ifndef USE_ATEN_LIB + EXPECT_EQ( + device_tensor->unsafeGetTensorImpl()->device_type(), DeviceType::CUDA); + EXPECT_EQ(device_tensor->unsafeGetTensorImpl()->device_index(), 0); +#endif + + EXPECT_EQ(g_fake_cuda_allocator.allocate_count_, 1); + EXPECT_EQ(g_fake_cuda_allocator.h2d_count_, 1); +} + +TEST_F(TensorPtrDeviceTest, CpuToDeviceFromRawData) { + float data[] = {10.0f, 20.0f, 30.0f, 40.0f}; + auto cpu_tensor = make_tensor_ptr({2, 2}, data); + auto device_tensor = + make_tensor_ptr(cpu_tensor, {}, {}, {}, DeviceType::CUDA); + + EXPECT_EQ(device_tensor->dim(), 2); + EXPECT_EQ(device_tensor->size(0), 2); + EXPECT_EQ(device_tensor->size(1), 2); + EXPECT_EQ(device_tensor->scalar_type(), executorch::aten::ScalarType::Float); + EXPECT_NE(device_tensor->const_data_ptr(), nullptr); + EXPECT_NE(device_tensor->const_data_ptr(), static_cast(data)); + +#ifndef USE_ATEN_LIB + EXPECT_EQ( + device_tensor->unsafeGetTensorImpl()->device_type(), DeviceType::CUDA); +#endif + + EXPECT_EQ(g_fake_cuda_allocator.allocate_count_, 1); + EXPECT_EQ(g_fake_cuda_allocator.h2d_count_, 1); +} + +#ifndef USE_ATEN_LIB +// clone_tensor_ptr_to_cpu relies on TensorImpl device metadata which is only +// available in the non-ATen (ExecuTorch portable) path. +TEST_F(TensorPtrDeviceTest, DeviceToCpuTensor) { + auto cpu_tensor = + make_tensor_ptr({2, 3}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}); + auto device_tensor = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); + auto result_tensor = clone_tensor_ptr_to_cpu(device_tensor); + + EXPECT_EQ(result_tensor->dim(), 2); + EXPECT_EQ(result_tensor->size(0), 2); + EXPECT_EQ(result_tensor->size(1), 3); + EXPECT_EQ(result_tensor->scalar_type(), executorch::aten::ScalarType::Float); + + auto* result_data = result_tensor->const_data_ptr(); + auto* original_data = cpu_tensor->const_data_ptr(); + for (int i = 0; i < 6; ++i) { + EXPECT_FLOAT_EQ(result_data[i], original_data[i]); + } + + EXPECT_EQ(g_fake_cuda_allocator.d2h_count_, 1); +} +#endif + +#ifndef USE_ATEN_LIB +TEST_F(TensorPtrDeviceTest, RoundtripCpuDeviceCpu) { + const std::vector original = {1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f}; + auto cpu_tensor = make_tensor_ptr({2, 3}, original); + + auto device_tensor = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); + auto roundtrip_tensor = clone_tensor_ptr_to_cpu(device_tensor); + + EXPECT_NE(roundtrip_tensor->const_data_ptr(), cpu_tensor->const_data_ptr()); + EXPECT_NE( + roundtrip_tensor->const_data_ptr(), device_tensor->const_data_ptr()); + + auto* result_data = roundtrip_tensor->const_data_ptr(); + for (size_t i = 0; i < original.size(); ++i) { + EXPECT_FLOAT_EQ(result_data[i], original[i]); + } + + EXPECT_EQ(roundtrip_tensor->dim(), cpu_tensor->dim()); + EXPECT_EQ(roundtrip_tensor->size(0), cpu_tensor->size(0)); + EXPECT_EQ(roundtrip_tensor->size(1), cpu_tensor->size(1)); + EXPECT_EQ(roundtrip_tensor->scalar_type(), cpu_tensor->scalar_type()); +} +#endif + +#ifndef USE_ATEN_LIB +TEST_F(TensorPtrDeviceTest, RoundtripInt32) { + auto cpu_tensor = make_tensor_ptr({4}, std::vector{10, 20, 30, 40}); + + auto device_tensor = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); + auto roundtrip = clone_tensor_ptr_to_cpu(device_tensor); + + EXPECT_EQ(roundtrip->scalar_type(), executorch::aten::ScalarType::Int); + const std::vector expected = {10, 20, 30, 40}; + auto* data = roundtrip->const_data_ptr(); + for (size_t i = 0; i < expected.size(); ++i) { + EXPECT_EQ(data[i], expected[i]); + } +} +#endif + +#ifndef USE_ATEN_LIB +TEST_F(TensorPtrDeviceTest, DeviceIndexPropagation) { + auto cpu_tensor = make_tensor_ptr({2}, {1.0f, 2.0f}); + auto device_tensor = clone_tensor_ptr_to_device( + cpu_tensor, DeviceType::CUDA, /*device_index=*/1); + + EXPECT_EQ(device_tensor->unsafeGetTensorImpl()->device_index(), 1); + + auto roundtrip = clone_tensor_ptr_to_cpu(device_tensor); + EXPECT_FLOAT_EQ(roundtrip->const_data_ptr()[0], 1.0f); + EXPECT_FLOAT_EQ(roundtrip->const_data_ptr()[1], 2.0f); +} +#endif + +TEST_F(TensorPtrDeviceTest, DeviceMemoryCleanup) { + { + auto cpu_tensor = make_tensor_ptr({2}, {1.0f, 2.0f}); + auto device_tensor = + clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); + EXPECT_EQ(g_fake_cuda_allocator.allocate_count_, 1); + EXPECT_EQ(g_fake_cuda_allocator.deallocate_count_, 0); + } + EXPECT_EQ(g_fake_cuda_allocator.deallocate_count_, 1); +} + +#ifndef USE_ATEN_LIB +TEST_F(TensorPtrDeviceTest, ScalarTensorRoundtrip) { + auto cpu_tensor = make_tensor_ptr({}, {42.0f}); + auto device_tensor = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); + + EXPECT_EQ(device_tensor->dim(), 0); + EXPECT_EQ(device_tensor->numel(), 1); + + auto roundtrip = clone_tensor_ptr_to_cpu(device_tensor); + EXPECT_EQ(roundtrip->dim(), 0); + EXPECT_EQ(roundtrip->numel(), 1); + EXPECT_FLOAT_EQ(roundtrip->const_data_ptr()[0], 42.0f); +} +#endif + +#ifndef USE_ATEN_LIB +TEST_F(TensorPtrDeviceTest, RawDataRoundtrip) { + float raw_data[] = {100.0f, 200.0f, 300.0f}; + auto cpu_tensor = make_tensor_ptr({3}, raw_data); + auto device_tensor = + make_tensor_ptr(cpu_tensor, {}, {}, {}, DeviceType::CUDA); + auto roundtrip = clone_tensor_ptr_to_cpu(device_tensor); + + EXPECT_EQ(roundtrip->dim(), 1); + EXPECT_EQ(roundtrip->size(0), 3); + auto* data = roundtrip->const_data_ptr(); + EXPECT_FLOAT_EQ(data[0], 100.0f); + EXPECT_FLOAT_EQ(data[1], 200.0f); + EXPECT_FLOAT_EQ(data[2], 300.0f); +} +#endif + +TEST_F(TensorPtrDeviceTest, ErrorCpuTargetDevice) { + auto cpu_tensor = make_tensor_ptr({2}, {1.0f, 2.0f}); + ET_EXPECT_DEATH(clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CPU), ""); +} + +TEST_F(TensorPtrDeviceTest, ErrorNullRawData) { + auto null_tensor = make_tensor_ptr({2, 2}, nullptr); + ET_EXPECT_DEATH( + make_tensor_ptr(null_tensor, {}, {}, {}, DeviceType::CUDA), ""); +} + +TEST_F(TensorPtrDeviceTest, ErrorNullCpuTensorData) { + auto null_tensor = make_tensor_ptr({2, 2}, nullptr); + ET_EXPECT_DEATH( + clone_tensor_ptr_to_device(null_tensor, DeviceType::CUDA), ""); +} + +#ifndef USE_ATEN_LIB +TEST_F(TensorPtrDeviceTest, ErrorCpuTensorToCpu) { + auto cpu_tensor = make_tensor_ptr({2}, {1.0f, 2.0f}); + ET_EXPECT_DEATH(clone_tensor_ptr_to_cpu(cpu_tensor), ""); +} +#endif + +#ifndef USE_ATEN_LIB +TEST_F(TensorPtrDeviceTest, MakeTensorPtrVectorToDevice) { + auto device_tensor = make_tensor_ptr( + {2, 2}, + std::vector{1.0f, 2.0f, 3.0f, 4.0f}, + {}, + {}, + executorch::aten::ScalarType::Float, + executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND, + DeviceType::CUDA); + + EXPECT_EQ(device_tensor->dim(), 2); + EXPECT_EQ(device_tensor->size(0), 2); + EXPECT_EQ(device_tensor->size(1), 2); + EXPECT_EQ(device_tensor->scalar_type(), executorch::aten::ScalarType::Float); + EXPECT_EQ( + device_tensor->unsafeGetTensorImpl()->device_type(), DeviceType::CUDA); + EXPECT_EQ(g_fake_cuda_allocator.allocate_count_, 1); + EXPECT_EQ(g_fake_cuda_allocator.h2d_count_, 1); + + auto roundtrip = clone_tensor_ptr_to_cpu(device_tensor); + auto* data = roundtrip->const_data_ptr(); + EXPECT_FLOAT_EQ(data[0], 1.0f); + EXPECT_FLOAT_EQ(data[1], 2.0f); + EXPECT_FLOAT_EQ(data[2], 3.0f); + EXPECT_FLOAT_EQ(data[3], 4.0f); +} +#endif + +#ifndef USE_ATEN_LIB +TEST_F(TensorPtrDeviceTest, MakeTensorPtrRawPointerToDevice) { + float raw[] = {5.0f, 6.0f, 7.0f}; + auto device_tensor = make_tensor_ptr( + {3}, + raw, + executorch::aten::ScalarType::Float, + executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND, + nullptr, + DeviceType::CUDA); + + EXPECT_EQ(device_tensor->dim(), 1); + EXPECT_EQ(device_tensor->size(0), 3); + EXPECT_EQ( + device_tensor->unsafeGetTensorImpl()->device_type(), DeviceType::CUDA); + EXPECT_NE(device_tensor->const_data_ptr(), static_cast(raw)); + EXPECT_EQ(g_fake_cuda_allocator.allocate_count_, 1); + EXPECT_EQ(g_fake_cuda_allocator.h2d_count_, 1); + + auto roundtrip = clone_tensor_ptr_to_cpu(device_tensor); + auto* data = roundtrip->const_data_ptr(); + EXPECT_FLOAT_EQ(data[0], 5.0f); + EXPECT_FLOAT_EQ(data[1], 6.0f); + EXPECT_FLOAT_EQ(data[2], 7.0f); +} +#endif + +#ifndef USE_ATEN_LIB +TEST_F(TensorPtrDeviceTest, CloneToCpuVerifiesCpuDeviceMetadata) { + auto cpu_tensor = make_tensor_ptr({3}, {1.0f, 2.0f, 3.0f}); + auto device_tensor = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); + auto result = clone_tensor_ptr_to_cpu(device_tensor); + + EXPECT_EQ(result->unsafeGetTensorImpl()->device_type(), DeviceType::CPU); + EXPECT_EQ(result->unsafeGetTensorImpl()->device_index(), 0); +} +#endif + +TEST_F(TensorPtrDeviceTest, MultipleClonesFromSameSource) { + auto cpu_tensor = make_tensor_ptr({3}, {1.0f, 2.0f, 3.0f}); + auto device1 = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); + auto device2 = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); + + EXPECT_NE(device1->const_data_ptr(), device2->const_data_ptr()); + EXPECT_EQ(g_fake_cuda_allocator.allocate_count_, 2); + EXPECT_EQ(g_fake_cuda_allocator.h2d_count_, 2); +} + +#ifndef USE_ATEN_LIB +TEST_F(TensorPtrDeviceTest, HighDimensionalTensorRoundtrip) { + std::vector data(24); + for (size_t i = 0; i < 24; ++i) { + data[i] = static_cast(i); + } + auto cpu_tensor = make_tensor_ptr({2, 3, 4}, data); + auto device_tensor = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); + + EXPECT_EQ(device_tensor->dim(), 3); + EXPECT_EQ(device_tensor->size(0), 2); + EXPECT_EQ(device_tensor->size(1), 3); + EXPECT_EQ(device_tensor->size(2), 4); + + auto roundtrip = clone_tensor_ptr_to_cpu(device_tensor); + auto* result = roundtrip->const_data_ptr(); + for (size_t i = 0; i < 24; ++i) { + EXPECT_FLOAT_EQ(result[i], static_cast(i)); + } +} +#endif + +#ifndef USE_ATEN_LIB +TEST_F(TensorPtrDeviceTest, RoundtripDouble) { + auto cpu_tensor = make_tensor_ptr({3}, std::vector{1.1, 2.2, 3.3}); + auto device_tensor = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); + auto roundtrip = clone_tensor_ptr_to_cpu(device_tensor); + + EXPECT_EQ(roundtrip->scalar_type(), executorch::aten::ScalarType::Double); + auto* data = roundtrip->const_data_ptr(); + EXPECT_DOUBLE_EQ(data[0], 1.1); + EXPECT_DOUBLE_EQ(data[1], 2.2); + EXPECT_DOUBLE_EQ(data[2], 3.3); +} +#endif + +#ifndef USE_ATEN_LIB +TEST_F(TensorPtrDeviceTest, RoundtripInt64) { + auto cpu_tensor = make_tensor_ptr({3}, std::vector{100, 200, 300}); + auto device_tensor = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); + auto roundtrip = clone_tensor_ptr_to_cpu(device_tensor); + + EXPECT_EQ(roundtrip->scalar_type(), executorch::aten::ScalarType::Long); + auto* data = roundtrip->const_data_ptr(); + EXPECT_EQ(data[0], 100); + EXPECT_EQ(data[1], 200); + EXPECT_EQ(data[2], 300); +} +#endif + +TEST_F(TensorPtrDeviceTest, CpuToCpuDefaultPreserved) { + auto cpu_tensor = make_tensor_ptr({2}, {1.0f, 2.0f}); + auto result = make_tensor_ptr(cpu_tensor, {}, {}, {}, DeviceType::CPU); + + EXPECT_EQ(g_fake_cuda_allocator.allocate_count_, 0); + EXPECT_EQ(g_fake_cuda_allocator.h2d_count_, 0); +} + +#ifndef USE_ATEN_LIB +TEST_F(TensorPtrDeviceTest, LargeTensorRoundtrip) { + const size_t n = 10000; + std::vector data(n); + for (size_t i = 0; i < n; ++i) { + data[i] = static_cast(i) * 0.1f; + } + auto cpu_tensor = make_tensor_ptr({static_cast(n)}, data); + auto device_tensor = clone_tensor_ptr_to_device(cpu_tensor, DeviceType::CUDA); + auto roundtrip = clone_tensor_ptr_to_cpu(device_tensor); + + auto* result = roundtrip->const_data_ptr(); + for (size_t i = 0; i < n; ++i) { + EXPECT_FLOAT_EQ(result[i], data[i]); + } +} +#endif