diff --git a/backends/aoti/slim/core/storage.h b/backends/aoti/slim/core/storage.h index 73c4d32d955..a3d17a89903 100644 --- a/backends/aoti/slim/core/storage.h +++ b/backends/aoti/slim/core/storage.h @@ -13,6 +13,7 @@ #ifdef CUDA_AVAILABLE #include #include +#include <executorch/backends/cuda/runtime/cuda_allocator.h> #endif #include @@ -107,9 +108,6 @@ struct DeviceTraits { /// @param device The target CUDA device (used to get the stream). /// @return Pointer to allocated device memory. static void* allocate(size_t nbytes, const c10::Device& device) { - // Get the current stream for this device (set by CUDAStreamGuard if any) - // This follows PyTorch's pattern where the allocator assumes the caller - // has already set the correct device via CUDAStreamGuard. auto stream_result = executorch::backends::cuda::getCurrentCUDAStream(device.index()); ET_CHECK_MSG( @@ -118,31 +116,23 @@ static_cast<int>(device.index())); cudaStream_t stream = stream_result.get(); - void* data = nullptr; - ET_CUDA_CHECK(cudaMallocAsync(&data, nbytes, stream)); - return data; + auto result = executorch::backends::cuda::CudaAllocator::allocate_async( + nbytes, device.index(), stream); + ET_CHECK_MSG( + result.ok(), + "CudaAllocator::allocate_async failed for %zu bytes on device %d", + nbytes, + static_cast<int>(device.index())); + return result.get(); } - /// Frees CUDA device memory on the current stream. - /// @param ptr Pointer to device memory to free. static void free(void* ptr) { - // Get the current stream for the current device - // Currently all cuda slimtensors should be on the same device same stream, - // so we can just use the stream on current device. - // TODO(gasoonjia): add cuda stream as a member of MaybeOwningStorage to - // support multiple devices. 
auto stream_result = executorch::backends::cuda::getCurrentCUDAStream(-1); ET_CHECK_MSG(stream_result.ok(), "Failed to get current CUDA stream"); - ET_CUDA_LOG_WARN(cudaFreeAsync(ptr, stream_result.get())); + executorch::backends::cuda::CudaAllocator::deallocate_async( + ptr, -1, stream_result.get()); } - /// Copies memory between CPU and CUDA or CUDA and CUDA asynchronously. - /// @param dst Destination pointer. - /// @param src Source pointer. - /// @param nbytes Number of bytes to copy. - /// @param dst_device Destination device. - /// @param src_device Source device. - /// @param stream CUDA stream for async copy. static void memcpy_async( void* dst, const void* src, @@ -151,7 +141,6 @@ const c10::Device& src_device, cudaStream_t stream) { cudaMemcpyKind direction = cudaMemcpyDeviceToDevice; - if (src_device.is_cpu()) { direction = cudaMemcpyHostToDevice; } else if (dst_device.is_cpu()) { @@ -164,15 +153,11 @@ static_cast<int>(dst_device.index())); } - ET_CUDA_CHECK(cudaMemcpyAsync(dst, src, nbytes, direction, stream)); + auto err = executorch::backends::cuda::CudaAllocator::memcpy_async( + dst, src, nbytes, direction, stream); + ET_CHECK_MSG(err == executorch::runtime::Error::Ok, "memcpy_async failed"); } - /// Copies memory between CPU and CUDA or CUDA and CUDA synchronously. - /// @param dst Destination pointer. - /// @param src Source pointer. - /// @param nbytes Number of bytes to copy. - /// @param dst_device Destination device. - /// @param src_device Source device. 
static void memcpy( void* dst, const void* src, @@ -180,7 +165,6 @@ struct DeviceTraits { const c10::Device& dst_device, const c10::Device& src_device) { cudaMemcpyKind direction = cudaMemcpyDeviceToDevice; - if (src_device.is_cpu()) { direction = cudaMemcpyHostToDevice; } else if (dst_device.is_cpu()) { diff --git a/backends/aoti/slim/core/targets.bzl b/backends/aoti/slim/core/targets.bzl index b9148305c91..42a7b79da6e 100644 --- a/backends/aoti/slim/core/targets.bzl +++ b/backends/aoti/slim/core/targets.bzl @@ -19,6 +19,7 @@ def define_common_targets(): "//executorch/runtime/platform:platform", "//executorch/backends/aoti/slim/c10/cuda:exception", "//executorch/backends/aoti/slim/cuda:guard", + "//executorch/backends/cuda/runtime:cuda_allocator", ], ) diff --git a/backends/cuda/runtime/TARGETS b/backends/cuda/runtime/TARGETS index 9c07b732735..8d0710aaadc 100644 --- a/backends/cuda/runtime/TARGETS +++ b/backends/cuda/runtime/TARGETS @@ -70,6 +70,33 @@ runtime.cxx_library( ], ) +runtime.cxx_library( + name = "cuda_allocator", + srcs = [ + "cuda_allocator.cpp", + ], + headers = [ + "cuda_allocator.h", + ], + # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole) + link_whole = True, + supports_python_dlopen = True, + visibility = ["PUBLIC"], + exported_deps = [ + "//executorch/runtime/core:device_allocator", + ], + deps = [ + "//executorch/runtime/platform:platform", + ], + nvcc_flags = get_nvcc_arch_args() + [ + "-_NVCC_HOST_COMPILER_FLAG_", + "gcc", + ], + external_deps = [ + ("cuda", None, "cuda-lazy"), + ], +) + runtime.cxx_library( name = "cuda_backend", srcs = [ @@ -87,6 +114,8 @@ runtime.cxx_library( visibility = ["PUBLIC"], deps = [ ":runtime_shims", + ":cuda_allocator", + ":cuda_platform", "//executorch/backends/aoti:aoti_common_slim", "//executorch/backends/aoti/slim/core:slimtensor", "//executorch/backends/aoti/slim/factory:empty", diff --git a/backends/cuda/runtime/cuda_allocator.cpp 
b/backends/cuda/runtime/cuda_allocator.cpp new file mode 100644 index 00000000000..1e0d0c29dfd --- /dev/null +++ b/backends/cuda/runtime/cuda_allocator.cpp @@ -0,0 +1,213 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include <executorch/backends/cuda/runtime/cuda_allocator.h> + +#include <cuda_runtime.h> + +#include <executorch/runtime/platform/log.h> + +namespace executorch::backends::cuda { + +using executorch::runtime::Error; +using executorch::runtime::Result; +using executorch::runtime::etensor::DeviceIndex; +using executorch::runtime::etensor::DeviceType; + +Result<void*> CudaAllocator::allocate(size_t nbytes, DeviceIndex index) { + void* ptr = nullptr; + cudaError_t prev_device_err = cudaSuccess; + int prev_device = 0; + + if (index >= 0) { + prev_device_err = cudaGetDevice(&prev_device); + if (prev_device_err == cudaSuccess) { + cudaSetDevice(index); + } + } + + cudaError_t err = cudaMalloc(&ptr, nbytes); + + if (index >= 0 && prev_device_err == cudaSuccess) { + cudaSetDevice(prev_device); + } + + if (err != cudaSuccess) { + ET_LOG( + Error, + "cudaMalloc failed: %s (requested %zu bytes on device %d)", + cudaGetErrorString(err), + nbytes, + static_cast<int>(index)); + return Error::MemoryAllocationFailed; + } + + return ptr; +} + +void CudaAllocator::deallocate(void* ptr, DeviceIndex index) { + if (ptr == nullptr) { + return; + } + + int prev_device = 0; + cudaError_t prev_device_err = cudaSuccess; + + if (index >= 0) { + prev_device_err = cudaGetDevice(&prev_device); + if (prev_device_err == cudaSuccess) { + cudaSetDevice(index); + } + } + + cudaError_t err = cudaFree(ptr); + + if (index >= 0 && prev_device_err == cudaSuccess) { + cudaSetDevice(prev_device); + } + + if (err != cudaSuccess) { + ET_LOG( + Error, + "cudaFree failed: %s (ptr=%p, device %d)", + cudaGetErrorString(err), + ptr, + static_cast<int>(index)); + } +} + +Error CudaAllocator::copy_host_to_device( + void* dst, + const
void* src, + size_t nbytes, + DeviceIndex index) { + int prev_device = 0; + cudaError_t prev_device_err = cudaSuccess; + + if (index >= 0) { + prev_device_err = cudaGetDevice(&prev_device); + if (prev_device_err == cudaSuccess) { + cudaSetDevice(index); + } + } + + cudaError_t err = cudaMemcpy(dst, src, nbytes, cudaMemcpyHostToDevice); + + if (index >= 0 && prev_device_err == cudaSuccess) { + cudaSetDevice(prev_device); + } + + if (err != cudaSuccess) { + ET_LOG( + Error, + "cudaMemcpy H2D failed: %s (%zu bytes, device %d)", + cudaGetErrorString(err), + nbytes, + static_cast<int>(index)); + return Error::Internal; + } + return Error::Ok; +} + +Error CudaAllocator::copy_device_to_host( + void* dst, + const void* src, + size_t nbytes, + DeviceIndex index) { + int prev_device = 0; + cudaError_t prev_device_err = cudaSuccess; + + if (index >= 0) { + prev_device_err = cudaGetDevice(&prev_device); + if (prev_device_err == cudaSuccess) { + cudaSetDevice(index); + } + } + + cudaError_t err = cudaMemcpy(dst, src, nbytes, cudaMemcpyDeviceToHost); + + if (index >= 0 && prev_device_err == cudaSuccess) { + cudaSetDevice(prev_device); + } + + if (err != cudaSuccess) { + ET_LOG( + Error, + "cudaMemcpy D2H failed: %s (%zu bytes, device %d)", + cudaGetErrorString(err), + nbytes, + static_cast<int>(index)); + return Error::Internal; + } + return Error::Ok; +} + +DeviceType CudaAllocator::device_type() const { + return DeviceType::CUDA; +} + +CudaAllocator& CudaAllocator::instance() { + static CudaAllocator allocator; + return allocator; +} + +Result<void*> CudaAllocator::allocate_async( + size_t nbytes, + DeviceIndex index, + cudaStream_t stream) { + void* ptr = nullptr; + cudaError_t err = cudaMallocAsync(&ptr, nbytes, stream); + if (err != cudaSuccess) { + ET_LOG( + Error, + "cudaMallocAsync failed: %s (requested %zu bytes on device %d)", + cudaGetErrorString(err), + nbytes, + static_cast<int>(index)); + return Error::MemoryAllocationFailed; + } + return ptr; +} + +void CudaAllocator::deallocate_async( 
+ void* ptr, + DeviceIndex index, + cudaStream_t stream) { + if (ptr == nullptr) { + return; + } + cudaError_t err = cudaFreeAsync(ptr, stream); + if (err != cudaSuccess) { + ET_LOG( + Error, + "cudaFreeAsync failed: %s (ptr=%p, device %d)", + cudaGetErrorString(err), + ptr, + static_cast<int>(index)); + } +} + +Error CudaAllocator::memcpy_async( + void* dst, + const void* src, + size_t nbytes, + cudaMemcpyKind direction, + cudaStream_t stream) { + cudaError_t err = cudaMemcpyAsync(dst, src, nbytes, direction, stream); + if (err != cudaSuccess) { + ET_LOG( + Error, + "cudaMemcpyAsync failed: %s (%zu bytes)", + cudaGetErrorString(err), + nbytes); + return Error::Internal; + } + return Error::Ok; +} + +} // namespace executorch::backends::cuda diff --git a/backends/cuda/runtime/cuda_allocator.h b/backends/cuda/runtime/cuda_allocator.h new file mode 100644 index 00000000000..3d2674a0e01 --- /dev/null +++ b/backends/cuda/runtime/cuda_allocator.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include <cuda_runtime.h> + +#include <executorch/runtime/core/device_allocator.h> + +namespace executorch::backends::cuda { + +/** + * CUDA implementation of DeviceAllocator. + * + * Uses cudaMalloc/cudaFree for allocation and cudaMemcpy for host-device + * transfers. This allocator is automatically registered as a singleton + * with the DeviceAllocatorRegistry when the CUDA backend library is linked. + * + * All CUDA memory operations in the CUDA backend should go through this + * allocator for consistent memory management. 
+ */ +class CudaAllocator final : public executorch::runtime::DeviceAllocator { + public: + executorch::runtime::Result<void*> allocate( + size_t nbytes, + executorch::runtime::etensor::DeviceIndex index) override; + + void deallocate( + void* ptr, + executorch::runtime::etensor::DeviceIndex index) override; + + executorch::runtime::Error copy_host_to_device( + void* dst, + const void* src, + size_t nbytes, + executorch::runtime::etensor::DeviceIndex index) override; + + executorch::runtime::Error copy_device_to_host( + void* dst, + const void* src, + size_t nbytes, + executorch::runtime::etensor::DeviceIndex index) override; + + executorch::runtime::etensor::DeviceType device_type() const override; + + /// Returns the global CudaAllocator singleton. + static CudaAllocator& instance(); + + // --- Async (stream-based) operations for SlimTensor/Storage layer --- + + /** + * Allocate device memory asynchronously on the given CUDA stream. + */ + static executorch::runtime::Result<void*> allocate_async( + size_t nbytes, + executorch::runtime::etensor::DeviceIndex index, + cudaStream_t stream); + + /** + * Deallocate device memory asynchronously on the given CUDA stream. + */ + static void deallocate_async( + void* ptr, + executorch::runtime::etensor::DeviceIndex index, + cudaStream_t stream); + + /** + * Copy memory asynchronously on the given CUDA stream. + * Supports H2D, D2H, and D2D based on src/dst device types. 
+ */ + static executorch::runtime::Error memcpy_async( + void* dst, + const void* src, + size_t nbytes, + cudaMemcpyKind direction, + cudaStream_t stream); +}; + +} // namespace executorch::backends::cuda diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index 39071f731a1..1a62c35bbb4 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -38,6 +38,7 @@ // Include our shim layer headers #include #include +#include <executorch/backends/cuda/runtime/cuda_allocator.h> #include #include #include @@ -637,5 +638,14 @@ auto cls = cuda::CudaBackend(); executorch::runtime::Backend backend{"CudaBackend", &cls}; static executorch::runtime::Error success_with_compiler = register_backend(backend); + +// Auto-register the CudaAllocator so that DeviceMemoryBuffer::create(CUDA) +// works whenever the CUDA backend library is linked. +static bool cuda_allocator_registered = [] { + executorch::runtime::register_device_allocator( + executorch::runtime::etensor::DeviceType::CUDA, + &cuda::CudaAllocator::instance()); + return true; +}(); } // namespace } // namespace executorch::backends