Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions runtime/core/portable_type/targets.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ def define_common_targets():
"//executorch/extension/fb/dynamic_shim/...",
"//executorch/kernels/portable/cpu/...",
"//executorch/runtime/core/...",
"//executorch/runtime/executor/...",
"//executorch/runtime/core/exec_aten/...",
"//executorch/runtime/core/portable_type/test/...",
],
Expand Down
46 changes: 46 additions & 0 deletions runtime/executor/memory_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@

#include <executorch/runtime/core/hierarchical_allocator.h>
#include <executorch/runtime/core/memory_allocator.h>
#include <executorch/runtime/core/portable_type/device.h>
#include <executorch/runtime/core/span.h>

namespace executorch {
namespace runtime {
Expand Down Expand Up @@ -61,6 +63,32 @@ class MemoryManager final {
"method allocator cannot be the same as temp allocator");
}

/**
 * Constructs a new MemoryManager with per-buffer device metadata.
 *
 * @param[in] method_allocator Same as above.
 * @param[in] planned_memory Same as above. May contain a mix of CPU and
 *     device pointers — HierarchicalAllocator only does pointer arithmetic,
 *     so device pointers are valid.
 * @param[in] temp_allocator Same as above.
 * @param[in] planned_buffer_devices One entry per planned memory buffer
 *     (same count as planned_memory buffers), indicating the device type for
 *     each buffer. For CPU-only programs, use the 3-arg constructor instead.
 *
 * NOTE(review): the entry count is not validated against planned_memory's
 * buffer count here — presumably checked by the consumer; confirm. Also note
 * that, like the 3-arg constructor, this aborts (ET_CHECK) rather than
 * returning an error when method and temp allocators alias.
 */
MemoryManager(
    MemoryAllocator* method_allocator,
    HierarchicalAllocator* planned_memory,
    MemoryAllocator* temp_allocator,
    Span<const etensor::DeviceType> planned_buffer_devices)
    : method_allocator_(method_allocator),
      planned_memory_(planned_memory),
      temp_allocator_(temp_allocator),
      planned_buffer_devices_(planned_buffer_devices) {
  // Same invariant as the other constructors: the method allocator's
  // lifetime/reset semantics differ from the temp allocator's, so sharing
  // one instance would corrupt method-owned allocations.
  ET_CHECK_MSG(
      method_allocator != temp_allocator,
      "method allocator cannot be the same as temp allocator");
}

/**
* DEPRECATED: Use the constructor without `constant_allocator` instead.
*
Expand Down Expand Up @@ -105,10 +133,28 @@ class MemoryManager final {
return temp_allocator_;
}

/**
 * Returns per-buffer device metadata. One entry per planned memory buffer,
 * same count as planned_memory buffers. Empty if no device metadata was
 * provided (i.e. a constructor without the metadata parameter was used,
 * which is the CPU-only path).
 *
 * The returned Span is a non-owning view into the span passed at
 * construction; the caller of the constructor must keep it alive for the
 * lifetime of this MemoryManager.
 */
Span<const etensor::DeviceType> planned_buffer_devices() const {
  return planned_buffer_devices_;
}

/**
* Returns true if any planned buffer is on a non-CPU device.
* When false, the memory setup is CPU-only and follows the legacy path.
*/
bool has_device_memory() const {
return planned_buffer_devices_.size() > 0;
}

private:
MemoryAllocator* method_allocator_;
HierarchicalAllocator* planned_memory_;
MemoryAllocator* temp_allocator_;
Span<const etensor::DeviceType> planned_buffer_devices_;
};

} // namespace runtime
Expand Down
1 change: 1 addition & 0 deletions runtime/executor/targets.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ def define_common_targets():
],
exported_deps = [
"//executorch/runtime/core:memory_allocator",
"//executorch/runtime/core/portable_type:portable_type",
],
visibility = ["PUBLIC"],
)
Expand Down
44 changes: 44 additions & 0 deletions runtime/executor/test/memory_manager_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ using namespace ::testing;
using executorch::runtime::HierarchicalAllocator;
using executorch::runtime::MemoryAllocator;
using executorch::runtime::MemoryManager;
using executorch::runtime::Span;
using executorch::runtime::etensor::DeviceType;

TEST(MemoryManagerTest, MinimalCtor) {
MemoryAllocator method_allocator(0, nullptr);
Expand Down Expand Up @@ -93,3 +95,45 @@ TEST(MemoryManagerTest, CtorWithSameAllocator) {
/*temp_allocator=*/&method_allocator),
"cannot be the same");
}

TEST(MemoryManagerTest, ThreeArgCtorHasNoDeviceMemory) {
  // Construct via the legacy 3-arg constructor, which carries no
  // per-buffer device metadata.
  MemoryAllocator method_alloc(0, nullptr);
  HierarchicalAllocator planned({});
  MemoryAllocator temp_alloc(0, nullptr);
  MemoryManager manager(&method_alloc, &planned, &temp_alloc);

  // Without metadata the manager must report the CPU-only (legacy) path
  // and expose an empty device span.
  EXPECT_FALSE(manager.has_device_memory());
  EXPECT_EQ(manager.planned_buffer_devices().size(), 0);
}

TEST(MemoryManagerTest, FourArgCtorWithDeviceMetadata) {
  MemoryAllocator method_alloc(0, nullptr);
  HierarchicalAllocator planned({});
  MemoryAllocator temp_alloc(0, nullptr);

  // Three planned buffers; only the middle one lives on CUDA.
  const DeviceType buffer_devices[] = {
      DeviceType::CPU, DeviceType::CUDA, DeviceType::CPU};
  const Span<const DeviceType> metadata(
      buffer_devices, sizeof(buffer_devices) / sizeof(buffer_devices[0]));

  MemoryManager manager(&method_alloc, &planned, &temp_alloc, metadata);

  // The extra metadata argument must not disturb the allocator wiring.
  EXPECT_EQ(manager.method_allocator(), &method_alloc);
  EXPECT_EQ(manager.planned_memory(), &planned);
  EXPECT_EQ(manager.temp_allocator(), &temp_alloc);

  // Metadata is exposed verbatim and flags the presence of device memory.
  EXPECT_TRUE(manager.has_device_memory());
  EXPECT_EQ(manager.planned_buffer_devices().size(), 3);
  EXPECT_EQ(manager.planned_buffer_devices()[0], DeviceType::CPU);
  EXPECT_EQ(manager.planned_buffer_devices()[1], DeviceType::CUDA);
  EXPECT_EQ(manager.planned_buffer_devices()[2], DeviceType::CPU);
}

TEST(MemoryManagerTest, MinimalCtorHasNoDeviceMemory) {
  // The single-argument constructor is the most minimal configuration:
  // no planned memory, no temp allocator, and no device metadata.
  MemoryAllocator method_alloc(0, nullptr);
  MemoryManager manager(&method_alloc);

  // It must therefore behave exactly like the CPU-only path.
  EXPECT_FALSE(manager.has_device_memory());
  EXPECT_EQ(manager.planned_buffer_devices().size(), 0);
}
3 changes: 3 additions & 0 deletions runtime/executor/test/targets.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ def define_common_targets(is_fbcode = False):
"//executorch/exir/backend/test/...",
"//executorch/runtime/backend/...",
"//executorch/extension/pybindings/...",
"//executorch/extension/module/test/...",
"//executorch/devtools/fb/runners/...",
"//executorch/test/...",
"//executorch/examples/...",
Expand Down Expand Up @@ -326,6 +327,8 @@ def define_common_targets(is_fbcode = False):
deps = [
":managed_memory_manager",
"//executorch/runtime/executor:program",
"//executorch/runtime/core:device_allocator",
"//executorch/runtime/core:device_memory_buffer",
"//executorch/extension/data_loader:file_data_loader",
"//executorch/schema:program",
],
Expand Down
169 changes: 169 additions & 0 deletions runtime/executor/test/tensor_parser_device_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,17 +17,31 @@
#include <executorch/runtime/executor/tensor_parser.h>

#include <executorch/extension/data_loader/file_data_loader.h>
#include <executorch/runtime/core/device_allocator.h>
#include <executorch/runtime/core/device_memory_buffer.h>
#include <executorch/runtime/core/exec_aten/exec_aten.h>
#include <executorch/runtime/executor/test/managed_memory_manager.h>
#include <executorch/runtime/platform/runtime.h>
#include <executorch/schema/program_generated.h>

#include <gtest/gtest.h>

using executorch::aten::Tensor;
using executorch::runtime::DeviceAllocator;
using executorch::runtime::DeviceMemoryBuffer;
using executorch::runtime::Error;
using executorch::runtime::get_device_allocator;
using executorch::runtime::HierarchicalAllocator;
using executorch::runtime::MemoryAllocator;
using executorch::runtime::MemoryManager;
using executorch::runtime::MethodMeta;
using executorch::runtime::Program;
using executorch::runtime::register_device_allocator;
using executorch::runtime::Result;
using executorch::runtime::Span;
using executorch::runtime::deserialization::parseTensor;
using executorch::runtime::etensor::DeviceIndex;
using executorch::runtime::etensor::DeviceType;
using executorch::runtime::testing::ManagedMemoryManager;
using torch::executor::util::FileDataLoader;

Expand All @@ -50,15 +64,77 @@ class ProgramTestFriend final {

using executorch::runtime::testing::ProgramTestFriend;

namespace {

/**
 * Mock CUDA allocator that uses host memory for testing.
 * Tracks every live allocation (not just the most recent one) so tests can
 * verify that a tensor's data_ptr falls within "device" memory even when
 * more than one buffer is requested.
 */
class MockCudaAllocator : public DeviceAllocator {
 public:
  /// Hands out plain host memory standing in for device memory.
  Result<void*> allocate(size_t nbytes, DeviceIndex index) override {
    allocate_count_++;
    // Keep every allocation alive and remembered. The previous version held
    // only the latest buffer, which silently freed earlier allocations and
    // made is_device_ptr() report false for still-live pointers.
    allocations_.emplace_back(std::make_unique<uint8_t[]>(nbytes), nbytes);
    return static_cast<void*>(allocations_.back().first.get());
  }

  /// Releases the allocation that matches `ptr`. Unknown pointers are
  /// ignored (but still counted) so mismatched frees surface as count
  /// discrepancies in tests.
  void deallocate(void* ptr, DeviceIndex index) override {
    deallocate_count_++;
    for (auto it = allocations_.begin(); it != allocations_.end(); ++it) {
      if (it->first.get() == ptr) {
        allocations_.erase(it);
        return;
      }
    }
  }

  /// Copies are no-ops: "device" memory is host memory in this mock.
  Error copy_host_to_device(void*, const void*, size_t, DeviceIndex) override {
    return Error::Ok;
  }

  /// Copies are no-ops: "device" memory is host memory in this mock.
  Error copy_device_to_host(void*, const void*, size_t, DeviceIndex) override {
    return Error::Ok;
  }

  DeviceType device_type() const override {
    return DeviceType::CUDA;
  }

  /// True if `ptr` lies inside any buffer this mock currently owns.
  bool is_device_ptr(const void* ptr) const {
    const auto* p = static_cast<const uint8_t*>(ptr);
    for (const auto& entry : allocations_) {
      const uint8_t* base = entry.first.get();
      if (p >= base && p < base + entry.second) {
        return true;
      }
    }
    return false;
  }

  // Exposed so tests can reset and inspect call counts.
  int allocate_count_ = 0;
  int deallocate_count_ = 0;

 private:
  // One (owning buffer, size in bytes) entry per live allocation.
  std::vector<std::pair<std::unique_ptr<uint8_t[]>, size_t>> allocations_;
};

} // namespace

static MockCudaAllocator g_mock_cuda;

class TensorParserDeviceTest : public ::testing::Test {
protected:
// Runs once for the whole suite: initialize the runtime and install the
// mock CUDA allocator. Registration appears to be process-global (the mock
// itself is a file-level static), so it is deliberately not repeated per
// test — TODO confirm register_device_allocator's re-registration behavior.
static void SetUpTestSuite() {
  executorch::runtime::runtime_init();
  register_device_allocator(DeviceType::CUDA, &g_mock_cuda);
}

// Per-test setup: open the device-annotated test program and reset the
// shared mock's counters.
void SetUp() override {
  // The .pte path arrives via an env var supplied by the test harness;
  // fail fast with a clear message when it is missing.
  const char* path = std::getenv("ET_MODULE_ADD_WITH_DEVICE_PATH");
  ASSERT_NE(path, nullptr)
      << "ET_MODULE_ADD_WITH_DEVICE_PATH env var not set";
  Result<FileDataLoader> loader = FileDataLoader::from(path);
  ASSERT_EQ(loader.error(), Error::Ok);
  loader_ = std::make_unique<FileDataLoader>(std::move(loader.get()));

  // g_mock_cuda is a process-wide static shared by all tests; zero the
  // counters so each test observes only its own allocator traffic.
  g_mock_cuda.allocate_count_ = 0;
  g_mock_cuda.deallocate_count_ = 0;
}

std::unique_ptr<FileDataLoader> loader_;
Expand Down Expand Up @@ -167,3 +243,96 @@ TEST_F(TensorParserDeviceTest, NonDelegatedTensorsDefaultToCPU) {
<< " without device annotation should have device_index=0";
}
}
// End-to-end check: load a program whose memory plan marks one buffer as
// CUDA, back that buffer with the mock device allocator, then parse every
// serialized tensor and verify that each CUDA-annotated tensor's data_ptr
// lands inside the mock's "device" memory.
TEST_F(TensorParserDeviceTest, CudaTensorDataPtrPointsToDeviceMemory) {
  Result<Program> program =
      Program::load(loader_.get(), Program::Verification::Minimal);
  ASSERT_EQ(program.error(), Error::Ok);

  Result<MethodMeta> method_meta = program->method_meta("forward");
  ASSERT_EQ(method_meta.error(), Error::Ok);

  // ModuleAddWithDevice has:
  // non_const_buffer_sizes: [0, 48] (index 0 reserved, buffer 0 = 48 bytes)
  // non_const_buffer_device: [{buffer_idx=1, device_type=CUDA}]
  const size_t num_buffers = method_meta->num_memory_planned_buffers();
  ASSERT_EQ(num_buffers, 1);

  // Set up device-aware planned memory.
  std::vector<Span<uint8_t>> planned_spans;
  std::vector<std::vector<uint8_t>> cpu_buffers;
  std::vector<DeviceMemoryBuffer> device_buffers;

  for (size_t i = 0; i < num_buffers; ++i) {
    auto size = method_meta->memory_planned_buffer_size(i);
    ASSERT_TRUE(size.ok());
    auto device = method_meta->memory_planned_buffer_device(i);
    ASSERT_TRUE(device.ok());

    if (device->is_cpu()) {
      // The span captures the inner vector's data pointer; that pointer
      // stays valid when the outer vector grows, since moving a
      // std::vector keeps its heap buffer.
      cpu_buffers.emplace_back(size.get());
      planned_spans.emplace_back(
          cpu_buffers.back().data(), cpu_buffers.back().size());
    } else {
      cpu_buffers.emplace_back(); // empty placeholder keeps indices aligned
      auto dmb = DeviceMemoryBuffer::create(
          size.get(), device->type(), device->index());
      ASSERT_TRUE(dmb.ok())
          << "DeviceMemoryBuffer::create failed for buffer " << i;
      planned_spans.emplace_back(dmb->as_span());
      device_buffers.push_back(std::move(dmb.get()));
    }
  }

  // Exactly one buffer is device-annotated, so the mock allocator must
  // have been hit exactly once.
  ASSERT_EQ(g_mock_cuda.allocate_count_, 1);

  // Build HierarchicalAllocator with mixed CPU/device spans.
  HierarchicalAllocator planned_memory(
      {planned_spans.data(), planned_spans.size()});

  constexpr size_t kMethodAllocBytes = 32 * 1024U;
  auto method_alloc_pool = std::make_unique<uint8_t[]>(kMethodAllocBytes);
  MemoryAllocator method_allocator(kMethodAllocBytes, method_alloc_pool.get());
  // NOTE(review): no per-buffer device metadata is passed to MemoryManager
  // here, so parseTensor presumably derives device placement from the
  // flatbuffer's extra_tensor_info rather than from the manager — confirm.
  MemoryManager memory_manager(&method_allocator, &planned_memory);

  // Parse tensors and verify CUDA tensors have device memory.
  const executorch_flatbuffer::Program* internal_program =
      ProgramTestFriend::GetInternalProgram(&program.get());
  auto* execution_plan =
      internal_program->execution_plan()->GetMutableObject(0);
  auto* flatbuffer_values = execution_plan->values();

  int cuda_with_device_memory = 0;

  for (size_t i = 0; i < flatbuffer_values->size(); ++i) {
    auto* serialization_value = flatbuffer_values->Get(i);
    // Only Tensor-typed values are of interest; skip scalars, lists, etc.
    if (serialization_value->val_type() !=
        executorch_flatbuffer::KernelTypes::Tensor) {
      continue;
    }

    auto* s_tensor = serialization_value->val_as_Tensor();
    // A tensor is CUDA-annotated when its optional extra_tensor_info
    // carries a CUDA device type.
    bool is_cuda = s_tensor->extra_tensor_info() != nullptr &&
        s_tensor->extra_tensor_info()->device_type() ==
            executorch_flatbuffer::DeviceType::CUDA;

    Result<Tensor> tensor =
        parseTensor(&program.get(), &memory_manager, s_tensor);
    ASSERT_TRUE(tensor.ok())
        << "parseTensor failed at index " << i << " with error 0x" << std::hex
        << static_cast<uint32_t>(tensor.error());

    Tensor t = tensor.get();

    // The parsed tensor's impl must agree with the annotation, and its
    // data must live inside the mock's "device" buffer.
    if (is_cuda && t.unsafeGetTensorImpl()->device_type() == DeviceType::CUDA) {
      EXPECT_TRUE(g_mock_cuda.is_device_ptr(t.const_data_ptr()))
          << "CUDA tensor at index " << i
          << " should have data_ptr in device memory, but got CPU memory";
      cuda_with_device_memory++;
    }
  }

  // All 3 CUDA tensors (2 inputs + 1 output of the delegate) should have
  // their data_ptr pointing to the mock device memory buffer.
  EXPECT_EQ(cuda_with_device_memory, 3)
      << "All 3 CUDA tensors should have data_ptr in device memory";
}
Loading