From 911f8985c8335c4366a0ea8a6349f56be80320d9 Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Tue, 24 Mar 2026 14:24:10 -0700
Subject: [PATCH] [ET Device Support] DeviceMemoryBuffer RAII class for device memory lifetime management

Introduces DeviceMemoryBuffer, an RAII wrapper that owns a single device
memory allocation. On destruction, it automatically calls
DeviceAllocator::deallocate() to free the memory. This mirrors the role of
std::vector<uint8_t> for CPU planned buffers, but for non-CPU device memory
(CUDA, etc.).

Key features:
- Static factory create(size, type, index) looks up DeviceAllocator from registry
- Move-only semantics (no copy) to enforce single ownership
- as_span() accessor wraps device pointer for use with HierarchicalAllocator
- Destructor is no-op for default-constructed or moved-from instances

Differential Revision: [D97850709](https://our.internmc.facebook.com/intern/diff/D97850709/)

[ghstack-poisoned]
---
 runtime/core/device_memory_buffer.cpp         |  34 ++++
 runtime/core/device_memory_buffer.h           | 126 +++++++++++++
 runtime/core/portable_type/targets.bzl        |   1 +
 runtime/core/targets.bzl                      |  27 +++
 .../core/test/device_memory_buffer_test.cpp   | 169 ++++++++++++++++++
 runtime/core/test/targets.bzl                 |   8 +
 6 files changed, 365 insertions(+)
 create mode 100644 runtime/core/device_memory_buffer.cpp
 create mode 100644 runtime/core/device_memory_buffer.h
 create mode 100644 runtime/core/test/device_memory_buffer_test.cpp

diff --git a/runtime/core/device_memory_buffer.cpp b/runtime/core/device_memory_buffer.cpp
new file mode 100644
index 00000000000..7eb3f0e3ae2
--- /dev/null
+++ b/runtime/core/device_memory_buffer.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/runtime/core/device_memory_buffer.h>
+
+namespace executorch::runtime {
+
+Result<DeviceMemoryBuffer> DeviceMemoryBuffer::create(
+    size_t size,
+    etensor::DeviceType type,
+    etensor::DeviceIndex index) {
+  DeviceAllocator* allocator = get_device_allocator(type);
+  if (allocator == nullptr) {
+    ET_LOG(
+        Error,
+        "No device allocator registered for device type %d",
+        static_cast<int>(type));
+    return Error::NotFound;
+  }
+
+  auto result = allocator->allocate(size, index);
+  if (!result.ok()) {
+    return result.error();
+  }
+
+  return DeviceMemoryBuffer(result.get(), size, allocator, index);
+}
+
+} // namespace executorch::runtime
diff --git a/runtime/core/device_memory_buffer.h b/runtime/core/device_memory_buffer.h
new file mode 100644
index 00000000000..7071f3de58d
--- /dev/null
+++ b/runtime/core/device_memory_buffer.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+#include <executorch/runtime/core/device_allocator.h>
+#include <executorch/runtime/core/result.h>
+#include <executorch/runtime/core/span.h>
+
+namespace executorch::runtime {
+
+/**
+ * RAII wrapper that owns a single device memory allocation.
+ *
+ * On destruction, calls DeviceAllocator::deallocate() to free the memory.
+ * This mirrors the role of std::vector<uint8_t> for CPU planned buffers,
+ * but for device memory (CUDA, etc.).
+ *
+ * Move-only: cannot be copied, but can be moved to transfer ownership.
+ */
+class DeviceMemoryBuffer final {
+ public:
+  /**
+   * Creates a DeviceMemoryBuffer by allocating device memory.
+   *
+   * Looks up the DeviceAllocator for the given device type via the
+   * DeviceAllocatorRegistry. If no allocator is registered for the type,
+   * returns Error::NotFound.
+   *
+   * @param size Number of bytes to allocate.
+   * @param type The device type (e.g., CUDA).
+   * @param index The device index (e.g., 0 for cuda:0).
+   * @return A Result containing the DeviceMemoryBuffer on success, or an error.
+   */
+  static Result<DeviceMemoryBuffer> create(
+      size_t size,
+      etensor::DeviceType type,
+      etensor::DeviceIndex index = 0);
+
+  DeviceMemoryBuffer() = default;
+
+  ~DeviceMemoryBuffer() {
+    if (ptr_ != nullptr && allocator_ != nullptr) {
+      allocator_->deallocate(ptr_, device_index_);
+    }
+  }
+
+  // Move constructor: transfer ownership.
+  DeviceMemoryBuffer(DeviceMemoryBuffer&& other) noexcept
+      : ptr_(other.ptr_),
+        size_(other.size_),
+        allocator_(other.allocator_),
+        device_index_(other.device_index_) {
+    other.ptr_ = nullptr;
+    other.size_ = 0;
+    other.allocator_ = nullptr;
+  }
+
+  // Move assignment: release current, take ownership.
+  DeviceMemoryBuffer& operator=(DeviceMemoryBuffer&& other) noexcept {
+    if (this != &other) {
+      if (ptr_ != nullptr && allocator_ != nullptr) {
+        allocator_->deallocate(ptr_, device_index_);
+      }
+      ptr_ = other.ptr_;
+      size_ = other.size_;
+      allocator_ = other.allocator_;
+      device_index_ = other.device_index_;
+      other.ptr_ = nullptr;
+      other.size_ = 0;
+      other.allocator_ = nullptr;
+    }
+    return *this;
+  }
+
+  // Non-copyable.
+  DeviceMemoryBuffer(const DeviceMemoryBuffer&) = delete;
+  DeviceMemoryBuffer& operator=(const DeviceMemoryBuffer&) = delete;
+
+  /// Returns the device pointer, or nullptr if empty/moved-from.
+  void* data() const {
+    return ptr_;
+  }
+
+  /// Returns the size in bytes of the allocation.
+  size_t size() const {
+    return size_;
+  }
+
+  /**
+   * Returns a Span wrapping the device pointer.
+   *
+   * This is intended for use with HierarchicalAllocator, which only performs
+   * pointer arithmetic on the span data and never dereferences it. Device
+   * pointers are valid for pointer arithmetic from the CPU side.
+   */
+  Span<uint8_t> as_span() const {
+    return {static_cast<uint8_t*>(ptr_), size_};
+  }
+
+ private:
+  DeviceMemoryBuffer(
+      void* ptr,
+      size_t size,
+      DeviceAllocator* allocator,
+      etensor::DeviceIndex device_index)
+      : ptr_(ptr),
+        size_(size),
+        allocator_(allocator),
+        device_index_(device_index) {}
+
+  void* ptr_ = nullptr;
+  size_t size_ = 0;
+  DeviceAllocator* allocator_ = nullptr;
+  etensor::DeviceIndex device_index_ = 0;
+};
+
+} // namespace executorch::runtime
diff --git a/runtime/core/portable_type/targets.bzl b/runtime/core/portable_type/targets.bzl
index 5b6e67fa213..33f18c68006 100644
--- a/runtime/core/portable_type/targets.bzl
+++ b/runtime/core/portable_type/targets.bzl
@@ -27,6 +27,7 @@ def define_common_targets():
             "//executorch/backends/...",
             "//executorch/extension/fb/dynamic_shim/...",
             "//executorch/kernels/portable/cpu/...",
+            "//executorch/runtime/core/...",
             "//executorch/runtime/core/exec_aten/...",
             "//executorch/runtime/core/portable_type/test/...",
         ],
diff --git a/runtime/core/targets.bzl b/runtime/core/targets.bzl
index 2c13cdbdae3..9b40e947626 100644
--- a/runtime/core/targets.bzl
+++ b/runtime/core/targets.bzl
@@ -141,6 +141,33 @@ def define_common_targets():
         visibility = ["//executorch/..."],
     )
 
+    runtime.cxx_library(
+        name = "device_allocator",
+        srcs = ["device_allocator.cpp"],
+        exported_headers = [
+            "device_allocator.h",
+        ],
+        exported_deps = [
+            ":core",
+            "//executorch/runtime/core/portable_type:portable_type",
+        ],
+        deps = [
+            "//executorch/runtime/platform:platform",
+        ],
+        visibility = ["PUBLIC"],
+    )
+
+    runtime.cxx_library(
+        name = "device_memory_buffer",
+        srcs = ["device_memory_buffer.cpp"],
+        exported_headers = ["device_memory_buffer.h"],
+        exported_deps = [
+            ":core",
+            ":device_allocator",
+        ],
+        visibility = ["PUBLIC"],
+    )
+
     runtime.cxx_library(
         name = "tag",
         srcs = ["tag.cpp"],
diff --git a/runtime/core/test/device_memory_buffer_test.cpp b/runtime/core/test/device_memory_buffer_test.cpp
new file mode 100644
index 00000000000..81d0a757cf4
--- /dev/null
+++ b/runtime/core/test/device_memory_buffer_test.cpp
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/runtime/core/device_memory_buffer.h>
+
+#include <gtest/gtest.h>
+
+#include <executorch/runtime/platform/runtime.h>
+
+using executorch::runtime::DeviceAllocator;
+using executorch::runtime::DeviceMemoryBuffer;
+using executorch::runtime::Error;
+using executorch::runtime::Result;
+using executorch::runtime::get_device_allocator;
+using executorch::runtime::register_device_allocator;
+using executorch::runtime::etensor::DeviceIndex;
+using executorch::runtime::etensor::DeviceType;
+
+/**
+ * A mock DeviceAllocator for testing DeviceMemoryBuffer.
+ * Returns pointers into a local buffer and tracks call counts.
+ */
+class MockAllocator : public DeviceAllocator {
+ public:
+  explicit MockAllocator(DeviceType type) : type_(type) {}
+
+  Result<void*> allocate(size_t nbytes, DeviceIndex index) override {
+    allocate_count_++;
+    last_allocate_size_ = nbytes;
+    return static_cast<void*>(buffer_);
+  }
+
+  void deallocate(void* ptr, DeviceIndex index) override {
+    deallocate_count_++;
+    last_deallocate_ptr_ = ptr;
+  }
+
+  Error copy_host_to_device(
+      void* dst,
+      const void* src,
+      size_t nbytes,
+      DeviceIndex index) override {
+    return Error::Ok;
+  }
+
+  Error copy_device_to_host(
+      void* dst,
+      const void* src,
+      size_t nbytes,
+      DeviceIndex index) override {
+    return Error::Ok;
+  }
+
+  DeviceType device_type() const override {
+    return type_;
+  }
+
+  int allocate_count_ = 0;
+  int deallocate_count_ = 0;
+  size_t last_allocate_size_ = 0;
+  void* last_deallocate_ptr_ = nullptr;
+  uint8_t buffer_[256] = {};
+
+ private:
+  DeviceType type_;
+};
+
+// Global mock registered once before all tests run.
+static MockAllocator g_mock_cuda(DeviceType::CUDA);
+
+class DeviceMemoryBufferTest : public ::testing::Test {
+ protected:
+  static void SetUpTestSuite() {
+    executorch::runtime::runtime_init();
+    register_device_allocator(DeviceType::CUDA, &g_mock_cuda);
+  }
+
+  void SetUp() override {
+    // Reset counters before each test.
+    g_mock_cuda.allocate_count_ = 0;
+    g_mock_cuda.deallocate_count_ = 0;
+    g_mock_cuda.last_allocate_size_ = 0;
+    g_mock_cuda.last_deallocate_ptr_ = nullptr;
+  }
+};
+
+TEST_F(DeviceMemoryBufferTest, DefaultConstructedIsEmpty) {
+  DeviceMemoryBuffer buf;
+  EXPECT_EQ(buf.data(), nullptr);
+  EXPECT_EQ(buf.size(), 0);
+
+  auto span = buf.as_span();
+  EXPECT_EQ(span.data(), nullptr);
+  EXPECT_EQ(span.size(), 0);
+}
+
+TEST_F(DeviceMemoryBufferTest, CreateAllocatesAndDestructorDeallocates) {
+  {
+    auto result = DeviceMemoryBuffer::create(1024, DeviceType::CUDA, 0);
+    ASSERT_TRUE(result.ok());
+
+    auto buf = std::move(result.get());
+    EXPECT_NE(buf.data(), nullptr);
+    EXPECT_EQ(buf.size(), 1024);
+    EXPECT_EQ(g_mock_cuda.allocate_count_, 1);
+    EXPECT_EQ(g_mock_cuda.last_allocate_size_, 1024);
+    EXPECT_EQ(g_mock_cuda.deallocate_count_, 0);
+  }
+  EXPECT_EQ(g_mock_cuda.deallocate_count_, 1);
+  EXPECT_EQ(g_mock_cuda.last_deallocate_ptr_, g_mock_cuda.buffer_);
+}
+
+TEST_F(DeviceMemoryBufferTest, CreateFailsWithNoRegisteredAllocator) {
+  auto result = DeviceMemoryBuffer::create(512, DeviceType::CPU, 0);
+  EXPECT_FALSE(result.ok());
+  EXPECT_EQ(result.error(), Error::NotFound);
+}
+
+TEST_F(DeviceMemoryBufferTest, MoveConstructorTransfersOwnership) {
+  auto result = DeviceMemoryBuffer::create(256, DeviceType::CUDA, 0);
+  ASSERT_TRUE(result.ok());
+  auto original = std::move(result.get());
+  void* original_ptr = original.data();
+
+  DeviceMemoryBuffer moved(std::move(original));
+
+  EXPECT_EQ(original.data(), nullptr);
+  EXPECT_EQ(original.size(), 0);
+  EXPECT_EQ(moved.data(), original_ptr);
+  EXPECT_EQ(moved.size(), 256);
+  EXPECT_EQ(g_mock_cuda.deallocate_count_, 0);
+}
+
+TEST_F(DeviceMemoryBufferTest, MoveAssignmentTransfersOwnership) {
+  auto result = DeviceMemoryBuffer::create(128, DeviceType::CUDA, 0);
+  ASSERT_TRUE(result.ok());
+  auto original = std::move(result.get());
+  void* original_ptr = original.data();
+
+  DeviceMemoryBuffer target;
+  target = std::move(original);
+
+  EXPECT_EQ(original.data(), nullptr);
+  EXPECT_EQ(target.data(), original_ptr);
+  EXPECT_EQ(target.size(), 128);
+  EXPECT_EQ(g_mock_cuda.deallocate_count_, 0);
+}
+
+TEST_F(DeviceMemoryBufferTest, DestructorNoOpForDefaultConstructed) {
+  {
+    DeviceMemoryBuffer buf;
+  }
+  EXPECT_EQ(g_mock_cuda.deallocate_count_, 0);
+}
+
+TEST_F(DeviceMemoryBufferTest, AsSpanWrapsDevicePointer) {
+  auto result = DeviceMemoryBuffer::create(2048, DeviceType::CUDA, 0);
+  ASSERT_TRUE(result.ok());
+  auto buf = std::move(result.get());
+
+  auto span = buf.as_span();
+  EXPECT_EQ(span.data(), static_cast<uint8_t*>(buf.data()));
+  EXPECT_EQ(span.size(), 2048);
+}
diff --git a/runtime/core/test/targets.bzl b/runtime/core/test/targets.bzl
index 1ad0940c62e..0436d3e10dd 100644
--- a/runtime/core/test/targets.bzl
+++ b/runtime/core/test/targets.bzl
@@ -7,6 +7,14 @@ def define_common_targets():
     TARGETS and BUCK files that call this function.
     """
 
+    runtime.cxx_test(
+        name = "device_memory_buffer_test",
+        srcs = ["device_memory_buffer_test.cpp"],
+        deps = [
+            "//executorch/runtime/core:device_memory_buffer",
+        ],
+    )
+
     runtime.cxx_test(
         name = "span_test",
         srcs = ["span_test.cpp"],