Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 14 additions & 30 deletions backends/aoti/slim/core/storage.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#ifdef CUDA_AVAILABLE
#include <executorch/backends/aoti/slim/c10/cuda/Exception.h>
#include <executorch/backends/aoti/slim/cuda/guard.h>
#include <executorch/backends/cuda/runtime/cuda_allocator.h>
#endif

#include <executorch/backends/aoti/slim/c10/core/Device.h>
Expand Down Expand Up @@ -107,9 +108,6 @@ struct DeviceTraits<c10::DeviceType::CUDA> {
/// @param device The target CUDA device (used to get the stream).
/// @return Pointer to allocated device memory.
static void* allocate(size_t nbytes, const c10::Device& device) {
// Get the current stream for this device (set by CUDAStreamGuard if any)
// This follows PyTorch's pattern where the allocator assumes the caller
// has already set the correct device via CUDAStreamGuard.
auto stream_result =
executorch::backends::cuda::getCurrentCUDAStream(device.index());
ET_CHECK_MSG(
Expand All @@ -118,31 +116,23 @@ struct DeviceTraits<c10::DeviceType::CUDA> {
static_cast<int>(device.index()));

cudaStream_t stream = stream_result.get();
void* data = nullptr;
ET_CUDA_CHECK(cudaMallocAsync(&data, nbytes, stream));
return data;
auto result = executorch::backends::cuda::CudaAllocator::allocate_async(
nbytes, device.index(), stream);
ET_CHECK_MSG(
result.ok(),
"CudaAllocator::allocate_async failed for %zu bytes on device %d",
nbytes,
static_cast<int>(device.index()));
return result.get();
}

/// Frees CUDA device memory on the current stream.
/// @param ptr Pointer to device memory to free.
static void free(void* ptr) {
// Get the current stream for the current device
// Currently all cuda slimtensors should be on the same device same stream,
// so we can just use the stream on current device.
// TODO(gasoonjia): add cuda stream as a member of MaybeOwningStorage to
// support multiple devices.
auto stream_result = executorch::backends::cuda::getCurrentCUDAStream(-1);
ET_CHECK_MSG(stream_result.ok(), "Failed to get current CUDA stream");
ET_CUDA_LOG_WARN(cudaFreeAsync(ptr, stream_result.get()));
executorch::backends::cuda::CudaAllocator::deallocate_async(
ptr, -1, stream_result.get());
}

/// Copies memory between CPU and CUDA or CUDA and CUDA asynchronously.
/// @param dst Destination pointer.
/// @param src Source pointer.
/// @param nbytes Number of bytes to copy.
/// @param dst_device Destination device.
/// @param src_device Source device.
/// @param stream CUDA stream for async copy.
static void memcpy_async(
void* dst,
const void* src,
Expand All @@ -151,7 +141,6 @@ struct DeviceTraits<c10::DeviceType::CUDA> {
const c10::Device& src_device,
cudaStream_t stream) {
cudaMemcpyKind direction = cudaMemcpyDeviceToDevice;

if (src_device.is_cpu()) {
direction = cudaMemcpyHostToDevice;
} else if (dst_device.is_cpu()) {
Expand All @@ -164,23 +153,18 @@ struct DeviceTraits<c10::DeviceType::CUDA> {
static_cast<int>(dst_device.index()));
}

ET_CUDA_CHECK(cudaMemcpyAsync(dst, src, nbytes, direction, stream));
auto err = executorch::backends::cuda::CudaAllocator::memcpy_async(
dst, src, nbytes, direction, stream);
ET_CHECK_MSG(err == executorch::runtime::Error::Ok, "memcpy_async failed");
}

/// Copies memory between CPU and CUDA or CUDA and CUDA synchronously.
/// @param dst Destination pointer.
/// @param src Source pointer.
/// @param nbytes Number of bytes to copy.
/// @param dst_device Destination device.
/// @param src_device Source device.
static void memcpy(
void* dst,
const void* src,
size_t nbytes,
const c10::Device& dst_device,
const c10::Device& src_device) {
cudaMemcpyKind direction = cudaMemcpyDeviceToDevice;

if (src_device.is_cpu()) {
direction = cudaMemcpyHostToDevice;
} else if (dst_device.is_cpu()) {
Expand Down
1 change: 1 addition & 0 deletions backends/aoti/slim/core/targets.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ def define_common_targets():
"//executorch/runtime/platform:platform",
"//executorch/backends/aoti/slim/c10/cuda:exception",
"//executorch/backends/aoti/slim/cuda:guard",
"//executorch/backends/cuda/runtime:cuda_allocator",
],
)

Expand Down
29 changes: 29 additions & 0 deletions backends/cuda/runtime/TARGETS
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,33 @@ runtime.cxx_library(
],
)

runtime.cxx_library(
name = "cuda_allocator",
srcs = [
"cuda_allocator.cpp",
],
headers = [
"cuda_allocator.h",
],
# @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)
link_whole = True,
supports_python_dlopen = True,
visibility = ["PUBLIC"],
exported_deps = [
"//executorch/runtime/core:device_allocator",
],
deps = [
"//executorch/runtime/platform:platform",
],
nvcc_flags = get_nvcc_arch_args() + [
"-_NVCC_HOST_COMPILER_FLAG_",
"gcc",
],
external_deps = [
("cuda", None, "cuda-lazy"),
],
)

runtime.cxx_library(
name = "cuda_backend",
srcs = [
Expand All @@ -88,6 +115,8 @@ runtime.cxx_library(
deps = [
":cuda_platform",
":runtime_shims",
":cuda_allocator",
":cuda_platform",
"//executorch/backends/aoti:aoti_common_slim",
"//executorch/backends/aoti/slim/core:slimtensor",
"//executorch/backends/aoti/slim/factory:empty",
Expand Down
213 changes: 213 additions & 0 deletions backends/cuda/runtime/cuda_allocator.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/backends/cuda/runtime/cuda_allocator.h>

#include <cuda_runtime.h>

#include <executorch/runtime/platform/log.h>

namespace executorch::backends::cuda {

using executorch::runtime::Error;
using executorch::runtime::Result;
using executorch::runtime::etensor::DeviceIndex;
using executorch::runtime::etensor::DeviceType;

/// Allocates `nbytes` of CUDA device memory with cudaMalloc.
///
/// When `index` is non-negative, the calling thread's current device is
/// temporarily switched to `index` and restored afterwards; a negative
/// index means "allocate on whatever device is current".
///
/// @param nbytes Number of bytes to allocate.
/// @param index Target CUDA device, or negative for the current device.
/// @return Pointer to the allocation, or Error::MemoryAllocationFailed.
Result<void*> CudaAllocator::allocate(size_t nbytes, DeviceIndex index) {
  void* ptr = nullptr;
  int prev_device = 0;
  cudaError_t prev_device_err = cudaSuccess;

  if (index >= 0) {
    prev_device_err = cudaGetDevice(&prev_device);
    if (prev_device_err == cudaSuccess) {
      // Fail fast if the device switch fails; silently falling through to
      // cudaMalloc would hand the caller memory on the wrong device.
      cudaError_t set_err = cudaSetDevice(index);
      if (set_err != cudaSuccess) {
        // Clear the thread-local error state so later unrelated
        // cudaGetLastError() calls do not observe this reported failure.
        (void)cudaGetLastError();
        ET_LOG(
            Error,
            "cudaSetDevice(%d) failed before cudaMalloc: %s",
            static_cast<int>(index),
            cudaGetErrorString(set_err));
        return Error::MemoryAllocationFailed;
      }
    }
  }

  cudaError_t err = cudaMalloc(&ptr, nbytes);

  // Restore the caller's device only if we actually captured it.
  if (index >= 0 && prev_device_err == cudaSuccess) {
    cudaSetDevice(prev_device);
  }

  if (err != cudaSuccess) {
    // cudaMalloc failures (e.g. OOM) remain visible through the thread's
    // last-error slot; clear it so subsequent checks start clean.
    (void)cudaGetLastError();
    ET_LOG(
        Error,
        "cudaMalloc failed: %s (requested %zu bytes on device %d)",
        cudaGetErrorString(err),
        nbytes,
        static_cast<int>(index));
    return Error::MemoryAllocationFailed;
  }

  return ptr;
}

void CudaAllocator::deallocate(void* ptr, DeviceIndex index) {
if (ptr == nullptr) {
return;
}

int prev_device = 0;
cudaError_t prev_device_err = cudaSuccess;

if (index >= 0) {
prev_device_err = cudaGetDevice(&prev_device);
if (prev_device_err == cudaSuccess) {
cudaSetDevice(index);
}
}

cudaError_t err = cudaFree(ptr);

if (index >= 0 && prev_device_err == cudaSuccess) {
cudaSetDevice(prev_device);
}

if (err != cudaSuccess) {
ET_LOG(
Error,
"cudaFree failed: %s (ptr=%p, device %d)",
cudaGetErrorString(err),
ptr,
static_cast<int>(index));
}
}

/// Synchronously copies `nbytes` from host memory to device memory.
///
/// When `index` is non-negative, the calling thread's current device is
/// temporarily switched to `index` for the copy and restored afterwards.
///
/// @param dst Destination device pointer.
/// @param src Source host pointer.
/// @param nbytes Number of bytes to copy.
/// @param index Destination device, or negative to skip the device switch.
/// @return Error::Ok on success, Error::Internal on failure.
Error CudaAllocator::copy_host_to_device(
    void* dst,
    const void* src,
    size_t nbytes,
    DeviceIndex index) {
  int prev_device = 0;
  cudaError_t prev_device_err = cudaSuccess;

  if (index >= 0) {
    prev_device_err = cudaGetDevice(&prev_device);
    if (prev_device_err == cudaSuccess) {
      // Fail fast rather than silently copying with the wrong device
      // current.
      cudaError_t set_err = cudaSetDevice(index);
      if (set_err != cudaSuccess) {
        (void)cudaGetLastError();
        ET_LOG(
            Error,
            "cudaSetDevice(%d) failed before H2D copy: %s",
            static_cast<int>(index),
            cudaGetErrorString(set_err));
        return Error::Internal;
      }
    }
  }

  cudaError_t err = cudaMemcpy(dst, src, nbytes, cudaMemcpyHostToDevice);

  if (index >= 0 && prev_device_err == cudaSuccess) {
    cudaSetDevice(prev_device);
  }

  if (err != cudaSuccess) {
    // Clear the thread-local error state so this already-reported failure
    // is not re-observed by later unrelated cudaGetLastError() checks.
    (void)cudaGetLastError();
    ET_LOG(
        Error,
        "cudaMemcpy H2D failed: %s (%zu bytes, device %d)",
        cudaGetErrorString(err),
        nbytes,
        static_cast<int>(index));
    return Error::Internal;
  }
  return Error::Ok;
}

/// Synchronously copies `nbytes` from device memory to host memory.
///
/// When `index` is non-negative, the calling thread's current device is
/// temporarily switched to `index` for the copy and restored afterwards.
///
/// @param dst Destination host pointer.
/// @param src Source device pointer.
/// @param nbytes Number of bytes to copy.
/// @param index Source device, or negative to skip the device switch.
/// @return Error::Ok on success, Error::Internal on failure.
Error CudaAllocator::copy_device_to_host(
    void* dst,
    const void* src,
    size_t nbytes,
    DeviceIndex index) {
  int prev_device = 0;
  cudaError_t prev_device_err = cudaSuccess;

  if (index >= 0) {
    prev_device_err = cudaGetDevice(&prev_device);
    if (prev_device_err == cudaSuccess) {
      // Fail fast rather than silently copying with the wrong device
      // current.
      cudaError_t set_err = cudaSetDevice(index);
      if (set_err != cudaSuccess) {
        (void)cudaGetLastError();
        ET_LOG(
            Error,
            "cudaSetDevice(%d) failed before D2H copy: %s",
            static_cast<int>(index),
            cudaGetErrorString(set_err));
        return Error::Internal;
      }
    }
  }

  cudaError_t err = cudaMemcpy(dst, src, nbytes, cudaMemcpyDeviceToHost);

  if (index >= 0 && prev_device_err == cudaSuccess) {
    cudaSetDevice(prev_device);
  }

  if (err != cudaSuccess) {
    // Clear the thread-local error state so this already-reported failure
    // is not re-observed by later unrelated cudaGetLastError() checks.
    (void)cudaGetLastError();
    ET_LOG(
        Error,
        "cudaMemcpy D2H failed: %s (%zu bytes, device %d)",
        cudaGetErrorString(err),
        nbytes,
        static_cast<int>(index));
    return Error::Internal;
  }
  return Error::Ok;
}

/// Identifies this allocator as managing CUDA device memory.
DeviceType CudaAllocator::device_type() const {
  return DeviceType::CUDA;
}

/// Returns the process-wide CudaAllocator singleton.
///
/// The function-local static is constructed on first use; since C++11 the
/// language guarantees that this initialization is thread-safe.
CudaAllocator& CudaAllocator::instance() {
  static CudaAllocator allocator;
  return allocator;
}

/// Allocates `nbytes` of device memory ordered on `stream` via
/// cudaMallocAsync (the memory becomes usable once prior work on the
/// stream completes).
///
/// @param nbytes Number of bytes to allocate.
/// @param index Device index; used only for diagnostic logging here — the
///        allocation itself follows the device associated with `stream`.
/// @param stream CUDA stream that orders the allocation.
/// @return Pointer to the allocation, or Error::MemoryAllocationFailed.
Result<void*> CudaAllocator::allocate_async(
    size_t nbytes,
    DeviceIndex index,
    cudaStream_t stream) {
  void* ptr = nullptr;
  cudaError_t err = cudaMallocAsync(&ptr, nbytes, stream);
  if (err != cudaSuccess) {
    // Clear the thread-local CUDA error state so an unrelated later
    // cudaGetLastError() does not observe this (already-reported) failure.
    (void)cudaGetLastError();
    ET_LOG(
        Error,
        "cudaMallocAsync failed: %s (requested %zu bytes on device %d)",
        cudaGetErrorString(err),
        nbytes,
        static_cast<int>(index));
    return Error::MemoryAllocationFailed;
  }
  return ptr;
}

void CudaAllocator::deallocate_async(
void* ptr,
DeviceIndex index,
cudaStream_t stream) {
if (ptr == nullptr) {
return;
}
cudaError_t err = cudaFreeAsync(ptr, stream);
if (err != cudaSuccess) {
ET_LOG(
Error,
"cudaFreeAsync failed: %s (ptr=%p, device %d)",
cudaGetErrorString(err),
ptr,
static_cast<int>(index));
}
}

/// Asynchronously copies `nbytes` between host and/or device memory on
/// `stream` via cudaMemcpyAsync. The caller is responsible for keeping
/// both buffers alive until the copy completes on the stream.
///
/// @param dst Destination pointer.
/// @param src Source pointer.
/// @param nbytes Number of bytes to copy.
/// @param direction Copy direction (H2D, D2H, or D2D).
/// @param stream CUDA stream that orders the copy.
/// @return Error::Ok on success, Error::Internal on failure.
Error CudaAllocator::memcpy_async(
    void* dst,
    const void* src,
    size_t nbytes,
    cudaMemcpyKind direction,
    cudaStream_t stream) {
  cudaError_t err = cudaMemcpyAsync(dst, src, nbytes, direction, stream);
  if (err != cudaSuccess) {
    // Clear the thread-local error state so this already-reported failure
    // is not re-observed by later unrelated cudaGetLastError() checks.
    (void)cudaGetLastError();
    ET_LOG(
        Error,
        "cudaMemcpyAsync failed: %s (%zu bytes)",
        cudaGetErrorString(err),
        nbytes);
    return Error::Internal;
  }
  return Error::Ok;
}

} // namespace executorch::backends::cuda
Loading
Loading