diff --git a/backends/aoti/slim/core/storage.h b/backends/aoti/slim/core/storage.h index 73c4d32d955..a3d17a89903 100644 --- a/backends/aoti/slim/core/storage.h +++ b/backends/aoti/slim/core/storage.h @@ -13,6 +13,7 @@ #ifdef CUDA_AVAILABLE #include #include +#include <executorch/backends/cuda/runtime/cuda_allocator.h> #endif #include @@ -107,9 +108,6 @@ struct DeviceTraits { /// @param device The target CUDA device (used to get the stream). /// @return Pointer to allocated device memory. static void* allocate(size_t nbytes, const c10::Device& device) { - // Get the current stream for this device (set by CUDAStreamGuard if any) - // This follows PyTorch's pattern where the allocator assumes the caller - // has already set the correct device via CUDAStreamGuard. auto stream_result = executorch::backends::cuda::getCurrentCUDAStream(device.index()); ET_CHECK_MSG( @@ -118,31 +116,23 @@ static_cast<int>(device.index())); cudaStream_t stream = stream_result.get(); - void* data = nullptr; - ET_CUDA_CHECK(cudaMallocAsync(&data, nbytes, stream)); - return data; + auto result = executorch::backends::cuda::CudaAllocator::allocate_async( + nbytes, device.index(), stream); + ET_CHECK_MSG( + result.ok(), + "CudaAllocator::allocate_async failed for %zu bytes on device %d", + nbytes, + static_cast<int>(device.index())); + return result.get(); } - /// Frees CUDA device memory on the current stream. - /// @param ptr Pointer to device memory to free. static void free(void* ptr) { - // Get the current stream for the current device - // Currently all cuda slimtensors should be on the same device same stream, - // so we can just use the stream on current device. - // TODO(gasoonjia): add cuda stream as a member of MaybeOwningStorage to - // support multiple devices. 
auto stream_result = executorch::backends::cuda::getCurrentCUDAStream(-1); ET_CHECK_MSG(stream_result.ok(), "Failed to get current CUDA stream"); - ET_CUDA_LOG_WARN(cudaFreeAsync(ptr, stream_result.get())); + executorch::backends::cuda::CudaAllocator::deallocate_async( + ptr, -1, stream_result.get()); } - /// Copies memory between CPU and CUDA or CUDA and CUDA asynchronously. - /// @param dst Destination pointer. - /// @param src Source pointer. - /// @param nbytes Number of bytes to copy. - /// @param dst_device Destination device. - /// @param src_device Source device. - /// @param stream CUDA stream for async copy. static void memcpy_async( void* dst, const void* src, @@ -151,7 +141,6 @@ const c10::Device& src_device, cudaStream_t stream) { cudaMemcpyKind direction = cudaMemcpyDeviceToDevice; - if (src_device.is_cpu()) { direction = cudaMemcpyHostToDevice; } else if (dst_device.is_cpu()) { @@ -164,15 +153,11 @@ static_cast<int>(dst_device.index())); } - ET_CUDA_CHECK(cudaMemcpyAsync(dst, src, nbytes, direction, stream)); + auto err = executorch::backends::cuda::CudaAllocator::memcpy_async( + dst, src, nbytes, direction, stream); + ET_CHECK_MSG(err == executorch::runtime::Error::Ok, "memcpy_async failed"); } - /// Copies memory between CPU and CUDA or CUDA and CUDA synchronously. - /// @param dst Destination pointer. - /// @param src Source pointer. - /// @param nbytes Number of bytes to copy. - /// @param dst_device Destination device. - /// @param src_device Source device. 
static void memcpy( void* dst, const void* src, @@ -180,7 +165,6 @@ struct DeviceTraits { const c10::Device& dst_device, const c10::Device& src_device) { cudaMemcpyKind direction = cudaMemcpyDeviceToDevice; - if (src_device.is_cpu()) { direction = cudaMemcpyHostToDevice; } else if (dst_device.is_cpu()) { diff --git a/backends/aoti/slim/core/targets.bzl b/backends/aoti/slim/core/targets.bzl index b9148305c91..42a7b79da6e 100644 --- a/backends/aoti/slim/core/targets.bzl +++ b/backends/aoti/slim/core/targets.bzl @@ -19,6 +19,7 @@ def define_common_targets(): "//executorch/runtime/platform:platform", "//executorch/backends/aoti/slim/c10/cuda:exception", "//executorch/backends/aoti/slim/cuda:guard", + "//executorch/backends/cuda/runtime:cuda_allocator", ], ) diff --git a/backends/cuda/runtime/TARGETS b/backends/cuda/runtime/TARGETS index 9c07b732735..8d0710aaadc 100644 --- a/backends/cuda/runtime/TARGETS +++ b/backends/cuda/runtime/TARGETS @@ -70,6 +70,33 @@ runtime.cxx_library( ], ) +runtime.cxx_library( + name = "cuda_allocator", + srcs = [ + "cuda_allocator.cpp", + ], + headers = [ + "cuda_allocator.h", + ], + # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole) + link_whole = True, + supports_python_dlopen = True, + visibility = ["PUBLIC"], + exported_deps = [ + "//executorch/runtime/core:device_allocator", + ], + deps = [ + "//executorch/runtime/platform:platform", + ], + nvcc_flags = get_nvcc_arch_args() + [ + "-_NVCC_HOST_COMPILER_FLAG_", + "gcc", + ], + external_deps = [ + ("cuda", None, "cuda-lazy"), + ], +) + runtime.cxx_library( name = "cuda_backend", srcs = [ @@ -87,6 +114,8 @@ runtime.cxx_library( visibility = ["PUBLIC"], deps = [ ":runtime_shims", + ":cuda_allocator", + ":cuda_platform", "//executorch/backends/aoti:aoti_common_slim", "//executorch/backends/aoti/slim/core:slimtensor", "//executorch/backends/aoti/slim/factory:empty", diff --git a/backends/cuda/runtime/cuda_allocator.cpp 
b/backends/cuda/runtime/cuda_allocator.cpp new file mode 100644 index 00000000000..1e0d0c29dfd --- /dev/null +++ b/backends/cuda/runtime/cuda_allocator.cpp @@ -0,0 +1,213 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include <executorch/backends/cuda/runtime/cuda_allocator.h> + +#include <cuda_runtime.h> + +#include <executorch/runtime/platform/log.h> + +namespace executorch::backends::cuda { + +using executorch::runtime::Error; +using executorch::runtime::Result; +using executorch::runtime::etensor::DeviceIndex; +using executorch::runtime::etensor::DeviceType; + +Result<void*> CudaAllocator::allocate(size_t nbytes, DeviceIndex index) { + void* ptr = nullptr; + cudaError_t prev_device_err = cudaSuccess; + int prev_device = 0; + + if (index >= 0) { + prev_device_err = cudaGetDevice(&prev_device); + if (prev_device_err == cudaSuccess) { + cudaSetDevice(index); + } + } + + cudaError_t err = cudaMalloc(&ptr, nbytes); + + if (index >= 0 && prev_device_err == cudaSuccess) { + cudaSetDevice(prev_device); + } + + if (err != cudaSuccess) { + ET_LOG( + Error, + "cudaMalloc failed: %s (requested %zu bytes on device %d)", + cudaGetErrorString(err), + nbytes, + static_cast<int>(index)); + return Error::MemoryAllocationFailed; + } + + return ptr; +} + +void CudaAllocator::deallocate(void* ptr, DeviceIndex index) { + if (ptr == nullptr) { + return; + } + + int prev_device = 0; + cudaError_t prev_device_err = cudaSuccess; + + if (index >= 0) { + prev_device_err = cudaGetDevice(&prev_device); + if (prev_device_err == cudaSuccess) { + cudaSetDevice(index); + } + } + + cudaError_t err = cudaFree(ptr); + + if (index >= 0 && prev_device_err == cudaSuccess) { + cudaSetDevice(prev_device); + } + + if (err != cudaSuccess) { + ET_LOG( + Error, + "cudaFree failed: %s (ptr=%p, device %d)", + cudaGetErrorString(err), + ptr, + static_cast<int>(index)); + } +} + +Error CudaAllocator::copy_host_to_device( + void* dst, + const
void* src, + size_t nbytes, + DeviceIndex index) { + int prev_device = 0; + cudaError_t prev_device_err = cudaSuccess; + + if (index >= 0) { + prev_device_err = cudaGetDevice(&prev_device); + if (prev_device_err == cudaSuccess) { + cudaSetDevice(index); + } + } + + cudaError_t err = cudaMemcpy(dst, src, nbytes, cudaMemcpyHostToDevice); + + if (index >= 0 && prev_device_err == cudaSuccess) { + cudaSetDevice(prev_device); + } + + if (err != cudaSuccess) { + ET_LOG( + Error, + "cudaMemcpy H2D failed: %s (%zu bytes, device %d)", + cudaGetErrorString(err), + nbytes, + static_cast<int>(index)); + return Error::Internal; + } + return Error::Ok; +} + +Error CudaAllocator::copy_device_to_host( + void* dst, + const void* src, + size_t nbytes, + DeviceIndex index) { + int prev_device = 0; + cudaError_t prev_device_err = cudaSuccess; + + if (index >= 0) { + prev_device_err = cudaGetDevice(&prev_device); + if (prev_device_err == cudaSuccess) { + cudaSetDevice(index); + } + } + + cudaError_t err = cudaMemcpy(dst, src, nbytes, cudaMemcpyDeviceToHost); + + if (index >= 0 && prev_device_err == cudaSuccess) { + cudaSetDevice(prev_device); + } + + if (err != cudaSuccess) { + ET_LOG( + Error, + "cudaMemcpy D2H failed: %s (%zu bytes, device %d)", + cudaGetErrorString(err), + nbytes, + static_cast<int>(index)); + return Error::Internal; + } + return Error::Ok; +} + +DeviceType CudaAllocator::device_type() const { + return DeviceType::CUDA; +} + +CudaAllocator& CudaAllocator::instance() { + static CudaAllocator allocator; + return allocator; +} + +Result<void*> CudaAllocator::allocate_async( + size_t nbytes, + DeviceIndex index, + cudaStream_t stream) { + void* ptr = nullptr; + cudaError_t err = cudaMallocAsync(&ptr, nbytes, stream); + if (err != cudaSuccess) { + ET_LOG( + Error, + "cudaMallocAsync failed: %s (requested %zu bytes on device %d)", + cudaGetErrorString(err), + nbytes, + static_cast<int>(index)); + return Error::MemoryAllocationFailed; + } + return ptr; +} + +void CudaAllocator::deallocate_async( 
+ void* ptr, + DeviceIndex index, + cudaStream_t stream) { + if (ptr == nullptr) { + return; + } + cudaError_t err = cudaFreeAsync(ptr, stream); + if (err != cudaSuccess) { + ET_LOG( + Error, + "cudaFreeAsync failed: %s (ptr=%p, device %d)", + cudaGetErrorString(err), + ptr, + static_cast<int>(index)); + } +} + +Error CudaAllocator::memcpy_async( + void* dst, + const void* src, + size_t nbytes, + cudaMemcpyKind direction, + cudaStream_t stream) { + cudaError_t err = cudaMemcpyAsync(dst, src, nbytes, direction, stream); + if (err != cudaSuccess) { + ET_LOG( + Error, + "cudaMemcpyAsync failed: %s (%zu bytes)", + cudaGetErrorString(err), + nbytes); + return Error::Internal; + } + return Error::Ok; +} + +} // namespace executorch::backends::cuda diff --git a/backends/cuda/runtime/cuda_allocator.h b/backends/cuda/runtime/cuda_allocator.h new file mode 100644 index 00000000000..3d2674a0e01 --- /dev/null +++ b/backends/cuda/runtime/cuda_allocator.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include <cuda_runtime.h> + +#include <executorch/runtime/core/device_allocator.h> + +namespace executorch::backends::cuda { + +/** + * CUDA implementation of DeviceAllocator. + * + * Uses cudaMalloc/cudaFree for allocation and cudaMemcpy for host-device + * transfers. This allocator is automatically registered as a singleton + * with the DeviceAllocatorRegistry when the CUDA backend library is linked. + * + * All CUDA memory operations in the CUDA backend should go through this + * allocator for consistent memory management. 
+ */ +class CudaAllocator final : public executorch::runtime::DeviceAllocator { + public: + executorch::runtime::Result<void*> allocate( + size_t nbytes, + executorch::runtime::etensor::DeviceIndex index) override; + + void deallocate( + void* ptr, + executorch::runtime::etensor::DeviceIndex index) override; + + executorch::runtime::Error copy_host_to_device( + void* dst, + const void* src, + size_t nbytes, + executorch::runtime::etensor::DeviceIndex index) override; + + executorch::runtime::Error copy_device_to_host( + void* dst, + const void* src, + size_t nbytes, + executorch::runtime::etensor::DeviceIndex index) override; + + executorch::runtime::etensor::DeviceType device_type() const override; + + /// Returns the global CudaAllocator singleton. + static CudaAllocator& instance(); + + // --- Async (stream-based) operations for SlimTensor/Storage layer --- + + /** + * Allocate device memory asynchronously on the given CUDA stream. + */ + static executorch::runtime::Result<void*> allocate_async( + size_t nbytes, + executorch::runtime::etensor::DeviceIndex index, + cudaStream_t stream); + + /** + * Deallocate device memory asynchronously on the given CUDA stream. + */ + static void deallocate_async( + void* ptr, + executorch::runtime::etensor::DeviceIndex index, + cudaStream_t stream); + + /** + * Copy memory asynchronously on the given CUDA stream. + * Supports H2D, D2H, and D2D based on src/dst device types. 
+ */ + static executorch::runtime::Error memcpy_async( + void* dst, + const void* src, + size_t nbytes, + cudaMemcpyKind direction, + cudaStream_t stream); +}; + +} // namespace executorch::backends::cuda diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index 39071f731a1..1a62c35bbb4 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -38,6 +38,7 @@ // Include our shim layer headers #include #include +#include <executorch/backends/cuda/runtime/cuda_allocator.h> #include #include #include @@ -637,5 +638,14 @@ auto cls = cuda::CudaBackend(); executorch::runtime::Backend backend{"CudaBackend", &cls}; static executorch::runtime::Error success_with_compiler = register_backend(backend); + +// Auto-register the CudaAllocator so that DeviceMemoryBuffer::create(CUDA) +// works whenever the CUDA backend library is linked. +static bool cuda_allocator_registered = [] { + executorch::runtime::register_device_allocator( + executorch::runtime::etensor::DeviceType::CUDA, + &cuda::CudaAllocator::instance()); + return true; +}(); } // namespace } // namespace executorch::backends