diff --git a/backends/aoti/slim/c10/cuda/Exception.h b/backends/aoti/slim/c10/cuda/Exception.h index 33d8414e661..4d2ba13d3bb 100644 --- a/backends/aoti/slim/c10/cuda/Exception.h +++ b/backends/aoti/slim/c10/cuda/Exception.h @@ -8,26 +8,55 @@ #pragma once -#ifdef CUDA_AVAILABLE - #include #include #include +#include #include #include /// Checks a CUDA expression and aborts on error. /// @param EXPR The CUDA expression to check. -#define ET_CUDA_CHECK(EXPR) \ - do { \ - const cudaError_t __err = EXPR; \ - ET_CHECK_MSG( \ - __err == cudaSuccess, "CUDA error: %s", cudaGetErrorString(__err)); \ +#ifndef ET_CUDA_CHECK +#define ET_CUDA_CHECK(EXPR) \ + do { \ + const cudaError_t __err = EXPR; \ + if (__err == cudaSuccess) { \ + break; \ + } \ + ET_LOG( \ + Error, \ + "%s:%d CUDA error: %s", \ + __FILE__, \ + __LINE__, \ + cudaGetErrorString(__err)); \ + ET_CHECK_MSG(false, "CUDA error: %s", cudaGetErrorString(__err)); \ } while (0) +#endif + +/// Checks a CUDA expression and returns Error::Internal on failure. +/// @param EXPR The CUDA expression to check. +#ifndef ET_CUDA_CHECK_OR_RETURN_ERROR +#define ET_CUDA_CHECK_OR_RETURN_ERROR(EXPR) \ + do { \ + const cudaError_t __err = EXPR; \ + if (__err == cudaSuccess) { \ + break; \ + } \ + ET_LOG( \ + Error, \ + "%s:%d CUDA error: %s", \ + __FILE__, \ + __LINE__, \ + cudaGetErrorString(__err)); \ + return ::executorch::runtime::Error::Internal; \ + } while (0) +#endif /// Checks a CUDA expression and logs a warning on error (non-fatal). /// @param EXPR The CUDA expression to check. +#ifndef ET_CUDA_LOG_WARN #define ET_CUDA_LOG_WARN(EXPR) \ do { \ const cudaError_t __err = EXPR; \ @@ -36,5 +65,17 @@ ET_LOG(Error, "CUDA warning: %s", cudaGetErrorString(__err)); \ } \ } while (0) +#endif + +/// Kernel launch check macro (with return) - checks cudaGetLastError after +/// kernel launch. +#ifndef ET_CUDA_KERNEL_LAUNCH_CHECK_OR_RETURN_ERROR +#define ET_CUDA_KERNEL_LAUNCH_CHECK_OR_RETURN_ERROR() \ + ET_CUDA_CHECK_OR_RETURN_ERROR(cudaGetLastError()) +#endif -#endif // CUDA_AVAILABLE +/// Kernel launch check macro (without return) - checks cudaGetLastError after +/// kernel launch. 
+#ifndef ET_CUDA_KERNEL_LAUNCH_CHECK +#define ET_CUDA_KERNEL_LAUNCH_CHECK() ET_CUDA_CHECK(cudaGetLastError()) +#endif diff --git a/backends/aoti/slim/core/Storage.h b/backends/aoti/slim/core/Storage.h index 156556aa9e1..2b48acd65f5 100644 --- a/backends/aoti/slim/core/Storage.h +++ b/backends/aoti/slim/core/Storage.h @@ -12,7 +12,7 @@ #ifdef CUDA_AVAILABLE #include -#include +#include #endif #include diff --git a/backends/aoti/slim/core/targets.bzl b/backends/aoti/slim/core/targets.bzl index 408738edd35..83b81e67131 100644 --- a/backends/aoti/slim/core/targets.bzl +++ b/backends/aoti/slim/core/targets.bzl @@ -18,7 +18,7 @@ def define_common_targets(): "//executorch/backends/aoti/slim/util:size_util", "//executorch/runtime/platform:platform", "//executorch/backends/aoti/slim/c10/cuda:exception", - "//executorch/backends/cuda/runtime:guard", + "//executorch/backends/aoti/slim/cuda:guard", ], ) diff --git a/backends/aoti/slim/cuda/TARGETS b/backends/aoti/slim/cuda/TARGETS new file mode 100644 index 00000000000..08e83a5f3c4 --- /dev/null +++ b/backends/aoti/slim/cuda/TARGETS @@ -0,0 +1,6 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load(":targets.bzl", "define_common_targets") + +oncall("executorch") + +define_common_targets() diff --git a/backends/aoti/slim/cuda/guard.cpp b/backends/aoti/slim/cuda/guard.cpp new file mode 100644 index 00000000000..461f7ea5944 --- /dev/null +++ b/backends/aoti/slim/cuda/guard.cpp @@ -0,0 +1,159 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include + +namespace executorch::backends::cuda { + +namespace { +// Thread-local stream storage (private to this file) +thread_local std::unordered_map current_streams_; +} // namespace + +Error setCurrentCUDAStream(cudaStream_t stream, DeviceIndex device_index) { + if (device_index == -1) { + // Get current device if not specified + // CUDA API returns int, explicit cast to DeviceIndex (int8_t) following + // ATen + int tmp_device = -1; + ET_CUDA_CHECK_OR_RETURN_ERROR(cudaGetDevice(&tmp_device)); + device_index = static_cast(tmp_device); + } + + current_streams_[device_index] = stream; + return Error::Ok; +} + +Result getCurrentCUDAStream(DeviceIndex device_index) { + if (device_index == -1) { + // CUDA API returns int, explicit cast to DeviceIndex (int8_t) following + // ATen + int tmp_device = -1; + ET_CUDA_CHECK_OR_RETURN_ERROR(cudaGetDevice(&tmp_device)); + device_index = static_cast(tmp_device); + } + + auto it = current_streams_.find(device_index); + if (it != current_streams_.end()) { + return it->second; + } + + cudaStream_t stream; + ET_CUDA_CHECK_OR_RETURN_ERROR(cudaStreamCreate(&stream)); + setCurrentCUDAStream(stream, device_index); + return stream; +} + +CUDAGuard::CUDAGuard(CUDAGuard&& other) noexcept + : original_device_index_(other.original_device_index_), + current_device_index_(other.current_device_index_) { + // Mark the moved-from object as "already restored" so its destructor doesn't + // try to restore the device + other.original_device_index_ = other.current_device_index_; +} + +CUDAGuard::~CUDAGuard() { + if (original_device_index_ != current_device_index_) { + // DeviceIndex (int8_t) implicitly widens to int for cudaSetDevice + cudaError_t err = cudaSetDevice(original_device_index_); + if (err != cudaSuccess) { + ET_LOG( + Error, + "~CUDAGuard: Failed 
to restore device to %d: %s", + static_cast(original_device_index_), + cudaGetErrorString(err)); + } + } +} + +Error CUDAGuard::set_index(DeviceIndex device_index) { + // CUDA API returns int, explicit cast to DeviceIndex (int8_t) following ATen + int tmp_device = -1; + ET_CUDA_CHECK_OR_RETURN_ERROR(cudaGetDevice(&tmp_device)); + + original_device_index_ = static_cast(tmp_device); + current_device_index_ = device_index; + + if (current_device_index_ != original_device_index_) { + // DeviceIndex (int8_t) implicitly widens to int for cudaSetDevice + ET_CUDA_CHECK_OR_RETURN_ERROR(cudaSetDevice(current_device_index_)); + } + + return Error::Ok; +} + +Result CUDAGuard::create(DeviceIndex device_index) { + CUDAGuard guard; // Fixed: Removed () to create a variable, not a function + ET_CHECK_OK_OR_RETURN_ERROR(guard.set_index(device_index)); + return guard; +} + +CUDAStreamGuard::CUDAStreamGuard(CUDAStreamGuard&& other) noexcept + : device_guard_(std::move(other.device_guard_)), + original_stream_(other.original_stream_), + current_stream_(other.current_stream_), + device_index_(other.device_index_) { + // Mark the moved-from object as "already restored" so its destructor doesn't + // try to restore the stream + other.original_stream_ = other.current_stream_; +} + +CUDAStreamGuard::~CUDAStreamGuard() { + // Restore the original stream unless this object was moved-from. + // After a move, original_stream_ == current_stream_, which indicates + // the moved-from object should not restore. + // Note: nullptr is a valid stream value (represents the default stream), + // so we must restore even if original_stream_ is nullptr. + if (original_stream_ != current_stream_) { + Error err = setCurrentCUDAStream(original_stream_, device_index_); + if (err != Error::Ok) { + ET_LOG( + Error, + "~CUDAStreamGuard: Failed to restore stream for device %d", + static_cast(device_index_)); + } + } +} + +Error CUDAStreamGuard::set_stream( + cudaStream_t stream, + DeviceIndex device_index) { + auto result = getCurrentCUDAStream(device_index); + if (!result.ok()) { + ET_LOG( + Error, + "Failed to get current stream for device %d", + static_cast(device_index)); + return result.error(); + } + + original_stream_ = result.get(); + current_stream_ = stream; + device_index_ = device_index; + + ET_CHECK_OK_OR_RETURN_ERROR(setCurrentCUDAStream(stream, device_index)); + + return Error::Ok; +} + +Result CUDAStreamGuard::create( + cudaStream_t stream, + DeviceIndex device_index) { + auto guard_result = CUDAGuard::create(device_index); + ET_CHECK_OK_OR_RETURN_ERROR(guard_result.error()); + + CUDAStreamGuard stream_guard(std::move(guard_result.get())); + ET_CHECK_OK_OR_RETURN_ERROR(stream_guard.set_stream(stream, device_index)); + + return stream_guard; +} + +} // namespace executorch::backends::cuda diff --git a/backends/aoti/slim/cuda/guard.h b/backends/aoti/slim/cuda/guard.h new file mode 100644 index 00000000000..57c01acf3b2 --- /dev/null +++ b/backends/aoti/slim/cuda/guard.h @@ -0,0 +1,193 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace executorch::backends::cuda { + +using executorch::runtime::Error; +using executorch::runtime::Result; + +// Use DeviceIndex (int8_t) from slim c10 to match ATen's convention. 
+// CUDA APIs use int, but implicit widening (int8_t -> int) handles outbound +// calls, and explicit static_cast handles inbound conversions from CUDA APIs. +using executorch::backends::aoti::slim::c10::DeviceIndex; + +/** + * Set the current CUDA stream for the specified device. + * + * @param stream The CUDA stream to set as current + * @param device_index The device index (-1 to use current device) + * @return Error code indicating success or failure + */ +Error setCurrentCUDAStream(cudaStream_t stream, DeviceIndex device_index = -1); + +/** + * Get the current CUDA stream for the specified device. + * If no stream has been set, creates a new stream and sets it as current. + * + * @param device_index The device index (-1 to use current device) + * @return Result containing the current stream on success, or an error code on + * failure + */ +Result getCurrentCUDAStream(DeviceIndex device_index = -1); + +/** + * RAII guard that sets the current CUDA device and restores it on destruction. + * This ensures that the device is properly restored even if an exception + * occurs. + * + */ +class CUDAGuard { + private: + /** + * Private constructor - use create() factory method instead. + */ + explicit CUDAGuard() + : original_device_index_(-1), current_device_index_(-1) {} + + public: + /** + * Factory method to create a CUDAGuard. + * + * @param device_index The device index to set as current + * @return Result containing the guard on success, or an error code on failure + */ + static Result create(DeviceIndex device_index); + + // Copy is not allowed + CUDAGuard(const CUDAGuard&) = delete; + CUDAGuard& operator=(const CUDAGuard&) = delete; + + // Move constructor and assignment + CUDAGuard(CUDAGuard&& other) noexcept; + CUDAGuard& operator=(CUDAGuard&& other) = delete; + + /** + * Destructor that restores the original device if necessary. + */ + ~CUDAGuard(); + + /** + * Sets the CUDA device to the given device index. + * + * @param device_index The device index to set as current + * @return Error code indicating success or failure + */ + Error set_index(DeviceIndex device_index); + + /** + * Get the original device index before the guard was created. + * + * @return The original device index + */ + DeviceIndex original_device() const { + return original_device_index_; + } + + /** + * Get the current device index. + * + * @return The current device index + */ + DeviceIndex current_device() const { + return current_device_index_; + } + + private: + /// The original device before this guard was created + DeviceIndex original_device_index_; + /// The current device managed by this guard + DeviceIndex current_device_index_; +}; + +/** + * RAII guard that sets the current CUDA device and stream, restoring both on + * destruction. This is useful for temporarily switching to a different device + * and stream. + * + */ +class CUDAStreamGuard { + private: + // Private constructor that takes a CUDAGuard + explicit CUDAStreamGuard(CUDAGuard&& guard) + : device_guard_(std::move(guard)), + original_stream_(nullptr), + current_stream_(nullptr), + device_index_(-1) {} + + public: + /** + * Factory method to create a CUDAStreamGuard. 
+ * + * @param stream The CUDA stream to set as current + * @param device_index The device index for the stream + * @return Result containing the guard on success, or an error code on failure + */ + static Result create( + cudaStream_t stream, + DeviceIndex device_index); + + // Copy is not allowed + CUDAStreamGuard(const CUDAStreamGuard&) = delete; + CUDAStreamGuard& operator=(const CUDAStreamGuard&) = delete; + + // Move constructor and assignment + CUDAStreamGuard(CUDAStreamGuard&& other) noexcept; + CUDAStreamGuard& operator=(CUDAStreamGuard&& other) noexcept = delete; + + /** + * Destructor that restores the original stream and device. + */ + ~CUDAStreamGuard(); + + /** + * Sets the CUDA stream to the given stream on the specified device. + * + * @param stream The CUDA stream to set as current + * @param device_index The device index for the stream + * @return Error code indicating success or failure + */ + Error set_stream(cudaStream_t stream, DeviceIndex device_index); + + /** + * Get the current guarded stream. + * + * @return The current stream + */ + cudaStream_t stream() const { + return current_stream_; + } + + /** + * Get the device index being guarded. + * + * @return The device index + */ + DeviceIndex device_index() const { + return device_index_; + } + + private: + /// The device guard that handles device switching + CUDAGuard device_guard_; + /// The original stream that was current before this guard + cudaStream_t original_stream_ = nullptr; + /// The current stream being guarded + cudaStream_t current_stream_ = nullptr; + /// The device index for this stream guard + DeviceIndex device_index_; +}; + +} // namespace executorch::backends::cuda diff --git a/backends/aoti/slim/cuda/targets.bzl b/backends/aoti/slim/cuda/targets.bzl new file mode 100644 index 00000000000..cddd69f1999 --- /dev/null +++ b/backends/aoti/slim/cuda/targets.bzl @@ -0,0 +1,27 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + """Define targets for SlimTensor CUDA guard module.""" + + runtime.cxx_library( + name = "guard", + srcs = [ + "guard.cpp", + ], + headers = [ + "guard.h", + ], + visibility = ["PUBLIC"], + deps = [ + "//executorch/runtime/platform:platform", + ], + exported_deps = [ + "//executorch/backends/aoti/slim/c10/core:device", + "//executorch/backends/aoti/slim/c10/cuda:exception", + "//executorch/runtime/core:core", + "//executorch/runtime/core/exec_aten:lib", + ], + external_deps = [ + ("cuda", None, "cuda-lazy"), + ], + ) diff --git a/backends/aoti/slim/cuda/test/TARGETS b/backends/aoti/slim/cuda/test/TARGETS new file mode 100644 index 00000000000..9ff3e83a8bd --- /dev/null +++ b/backends/aoti/slim/cuda/test/TARGETS @@ -0,0 +1,6 @@ +load("@fbcode_macros//build_defs:cpp_unittest.bzl", "cpp_unittest") +load(":targets.bzl", "define_common_targets") + +oncall("executorch") + +define_common_targets() diff --git a/backends/aoti/slim/cuda/test/targets.bzl b/backends/aoti/slim/cuda/test/targets.bzl new file mode 100644 index 00000000000..bf38b599637 --- /dev/null +++ b/backends/aoti/slim/cuda/test/targets.bzl @@ -0,0 +1,32 @@ +load("@fbcode_macros//build_defs:cpp_unittest.bzl", "cpp_unittest") +load("@fbcode_macros//build_defs/lib:re_test_utils.bzl", "re_test_utils") + +def cuda_slim_cpp_unittest(name): + cpp_unittest( + name = "test_" + name, + srcs = [ + "test_" + name + ".cpp", + ], + deps = [ + "//executorch/backends/aoti/slim/cuda:guard", + "//executorch/runtime/core:core", + "//executorch/runtime/core/exec_aten:lib", + 
"//executorch/runtime/platform:platform", + ], + external_deps = [ + ("cuda", None, "cuda-lazy"), + ], + keep_gpu_sections = True, + remote_execution = re_test_utils.remote_execution( + platform = "gpu-remote-execution", + ), + ) + +def define_common_targets(): + """Defines targets that should be shared between fbcode and xplat. + + The directory containing this targets.bzl file should also contain both + TARGETS and BUCK files that call this function. + """ + cuda_slim_cpp_unittest("cuda_guard") + cuda_slim_cpp_unittest("cuda_stream_guard") diff --git a/backends/aoti/slim/cuda/test/test_cuda_guard.cpp b/backends/aoti/slim/cuda/test/test_cuda_guard.cpp new file mode 100644 index 00000000000..c9938bf5cd8 --- /dev/null +++ b/backends/aoti/slim/cuda/test/test_cuda_guard.cpp @@ -0,0 +1,113 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include + +using namespace executorch::backends::cuda; +using namespace executorch::runtime; + +// TODO(gasoonjia): Multiple device tests were not included due to test +// environment limitations. These tests should be added in the future when +// multi-GPU test environments are available, + +class CUDAGuardTest : public ::testing::Test { + protected: + void SetUp() override { + et_pal_init(); + + int device_count = 0; + cudaError_t error = cudaGetDeviceCount(&device_count); + if (error != cudaSuccess || device_count == 0) { + GTEST_SKIP() << "CUDA not available or no CUDA devices found"; + } + device_count_ = device_count; + + ASSERT_EQ(cudaGetDevice(&original_device_), cudaSuccess); + } + + void TearDown() override { + if (device_count_ > 0) { + ASSERT_EQ(cudaSetDevice(original_device_), cudaSuccess); + } + } + + int device_count_ = 0; + int original_device_ = 0; +}; + +TEST_F(CUDAGuardTest, BasicDeviceSwitching) { + int current_device; + ASSERT_EQ(cudaGetDevice(¤t_device), cudaSuccess); + + { + auto guard_result = CUDAGuard::create(0); + ASSERT_TRUE(guard_result.ok()); + CUDAGuard guard = std::move(guard_result.get()); + + int device_after_guard; + ASSERT_EQ(cudaGetDevice(&device_after_guard), cudaSuccess); + EXPECT_EQ(device_after_guard, 0); + EXPECT_EQ(guard.current_device(), 0); + EXPECT_EQ(guard.original_device(), current_device); + } + + int device_after_destruction; + ASSERT_EQ(cudaGetDevice(&device_after_destruction), cudaSuccess); + EXPECT_EQ(device_after_destruction, current_device); +} + +TEST_F(CUDAGuardTest, SameDeviceNoSwitching) { + ASSERT_EQ(cudaSetDevice(0), cudaSuccess); + + { + auto guard_result = CUDAGuard::create(0); + ASSERT_TRUE(guard_result.ok()); + CUDAGuard guard = std::move(guard_result.get()); + + int current_device; + ASSERT_EQ(cudaGetDevice(¤t_device), cudaSuccess); + EXPECT_EQ(current_device, 0); + EXPECT_EQ(guard.current_device(), 0); + EXPECT_EQ(guard.original_device(), 0); + } + + int final_device; + ASSERT_EQ(cudaGetDevice(&final_device), cudaSuccess); + EXPECT_EQ(final_device, 0); +} + +TEST_F(CUDAGuardTest, InvalidDeviceIndex) { + auto guard_result = CUDAGuard::create(999); + EXPECT_FALSE(guard_result.ok()); +} + +TEST_F(CUDAGuardTest, NegativeDeviceIndex) { + auto guard_result = CUDAGuard::create(-2); + EXPECT_FALSE(guard_result.ok()); +} + +TEST_F(CUDAGuardTest, CopyConstructorDeleted) { + static_assert( + !std::is_copy_constructible_v, + "CUDAGuard should not be copy constructible"); +} + 
+TEST_F(CUDAGuardTest, CopyAssignmentDeleted) { + static_assert( + !std::is_copy_assignable_v, + "CUDAGuard should not be copy assignable"); +} + +TEST_F(CUDAGuardTest, MoveAssignmentDeleted) { + static_assert( + !std::is_move_assignable_v, + "CUDAGuard should not be move assignable"); +} diff --git a/backends/aoti/slim/cuda/test/test_cuda_stream_guard.cpp b/backends/aoti/slim/cuda/test/test_cuda_stream_guard.cpp new file mode 100644 index 00000000000..613bc6ffe19 --- /dev/null +++ b/backends/aoti/slim/cuda/test/test_cuda_stream_guard.cpp @@ -0,0 +1,264 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include + +using namespace executorch::backends::cuda; +using namespace executorch::runtime; + +// TODO(gasoonjia): Multiple device tests were not included due to test +// environment limitations. These tests should be added in the future when +// multi-GPU test environments are available, + +class CUDAStreamGuardTest : public ::testing::Test { + protected: + void SetUp() override { + et_pal_init(); + + int device_count = 0; + cudaError_t error = cudaGetDeviceCount(&device_count); + if (error != cudaSuccess || device_count == 0) { + GTEST_SKIP() << "CUDA not available or no CUDA devices found"; + } + device_count_ = device_count; + + ASSERT_EQ(cudaGetDevice(&original_device_), cudaSuccess); + + ASSERT_EQ(cudaStreamCreate(&test_stream1_), cudaSuccess); + ASSERT_EQ(cudaStreamCreate(&test_stream2_), cudaSuccess); + } + + void TearDown() override { + if (test_stream1_) { + ASSERT_EQ(cudaStreamDestroy(test_stream1_), cudaSuccess); + } + if (test_stream2_) { + ASSERT_EQ(cudaStreamDestroy(test_stream2_), cudaSuccess); + } + + if (device_count_ > 0) { + ASSERT_EQ(cudaSetDevice(original_device_), cudaSuccess); + } + } + + int device_count_ = 0; + int original_device_ = 0; + cudaStream_t test_stream1_ = nullptr; + cudaStream_t test_stream2_ = nullptr; +}; + +TEST_F(CUDAStreamGuardTest, BasicStreamSwitching) { + auto guard_result = CUDAStreamGuard::create(test_stream1_, 0); + ASSERT_TRUE(guard_result.ok()); + CUDAStreamGuard guard = std::move(guard_result.get()); + + EXPECT_EQ(guard.stream(), test_stream1_); + EXPECT_EQ(guard.device_index(), 0); + + auto current_stream_result = getCurrentCUDAStream(0); + ASSERT_TRUE(current_stream_result.ok()); + EXPECT_EQ(current_stream_result.get(), test_stream1_); + + int current_device; + ASSERT_EQ(cudaGetDevice(¤t_device), cudaSuccess); + EXPECT_EQ(current_device, 0); +} + +TEST_F(CUDAStreamGuardTest, StreamSwitchingOnSameDevice) { + Error err = setCurrentCUDAStream(test_stream1_, 0); + ASSERT_EQ(err, Error::Ok); + + auto current_stream_result = getCurrentCUDAStream(0); + ASSERT_TRUE(current_stream_result.ok()); + EXPECT_EQ(current_stream_result.get(), test_stream1_); + + { + auto guard_result = CUDAStreamGuard::create(test_stream2_, 0); + ASSERT_TRUE(guard_result.ok()); + CUDAStreamGuard guard = std::move(guard_result.get()); + + auto new_stream_result = getCurrentCUDAStream(0); + ASSERT_TRUE(new_stream_result.ok()); + EXPECT_EQ(new_stream_result.get(), test_stream2_); + EXPECT_EQ(guard.stream(), test_stream2_); + } + + auto restored_stream_result = getCurrentCUDAStream(0); + ASSERT_TRUE(restored_stream_result.ok()); + EXPECT_EQ(restored_stream_result.get(), test_stream1_); +} + +TEST_F(CUDAStreamGuardTest, NestedStreamGuards) { + cudaStream_t 
initial_stream; + ASSERT_EQ(cudaStreamCreate(&initial_stream), cudaSuccess); + + Error err = setCurrentCUDAStream(initial_stream, 0); + ASSERT_EQ(err, Error::Ok); + + { + auto guard1_result = CUDAStreamGuard::create(test_stream1_, 0); + ASSERT_TRUE(guard1_result.ok()); + CUDAStreamGuard guard1 = std::move(guard1_result.get()); + + auto stream_result = getCurrentCUDAStream(0); + ASSERT_TRUE(stream_result.ok()); + EXPECT_EQ(stream_result.get(), test_stream1_); + + { + auto guard2_result = CUDAStreamGuard::create(test_stream2_, 0); + ASSERT_TRUE(guard2_result.ok()); + CUDAStreamGuard guard2 = std::move(guard2_result.get()); + + auto stream_result2 = getCurrentCUDAStream(0); + ASSERT_TRUE(stream_result2.ok()); + EXPECT_EQ(stream_result2.get(), test_stream2_); + } + + auto stream_result3 = getCurrentCUDAStream(0); + ASSERT_TRUE(stream_result3.ok()); + EXPECT_EQ(stream_result3.get(), test_stream1_); + } + + auto final_stream_result = getCurrentCUDAStream(0); + ASSERT_TRUE(final_stream_result.ok()); + EXPECT_EQ(final_stream_result.get(), initial_stream); + + ASSERT_EQ(cudaStreamDestroy(initial_stream), cudaSuccess); +} + +TEST_F(CUDAStreamGuardTest, SameStreamNoChange) { + Error err = setCurrentCUDAStream(test_stream1_, 0); + ASSERT_EQ(err, Error::Ok); + + { + auto guard_result = CUDAStreamGuard::create(test_stream1_, 0); + ASSERT_TRUE(guard_result.ok()); + CUDAStreamGuard guard = std::move(guard_result.get()); + + auto stream_result = getCurrentCUDAStream(0); + ASSERT_TRUE(stream_result.ok()); + EXPECT_EQ(stream_result.get(), test_stream1_); + EXPECT_EQ(guard.stream(), test_stream1_); + } + + auto final_stream_result = getCurrentCUDAStream(0); + ASSERT_TRUE(final_stream_result.ok()); + EXPECT_EQ(final_stream_result.get(), test_stream1_); +} + +TEST_F(CUDAStreamGuardTest, StreamAccessor) { + auto guard_result = CUDAStreamGuard::create(test_stream1_, 0); + ASSERT_TRUE(guard_result.ok()); + CUDAStreamGuard guard = std::move(guard_result.get()); + + EXPECT_EQ(guard.stream(), test_stream1_); + EXPECT_EQ(guard.device_index(), 0); +} + +TEST_F(CUDAStreamGuardTest, SetStreamMethod) { + auto guard_result = CUDAStreamGuard::create(test_stream1_, 0); + ASSERT_TRUE(guard_result.ok()); + CUDAStreamGuard guard = std::move(guard_result.get()); + + EXPECT_EQ(guard.stream(), test_stream1_); + + Error err = guard.set_stream(test_stream2_, 0); + EXPECT_EQ(err, Error::Ok); + + EXPECT_EQ(guard.stream(), test_stream2_); + + auto current_stream_result = getCurrentCUDAStream(0); + ASSERT_TRUE(current_stream_result.ok()); + EXPECT_EQ(current_stream_result.get(), test_stream2_); +} + +TEST_F(CUDAStreamGuardTest, MoveConstructor) { + auto guard1_result = CUDAStreamGuard::create(test_stream1_, 0); + ASSERT_TRUE(guard1_result.ok()); + CUDAStreamGuard guard1 = std::move(guard1_result.get()); + + EXPECT_EQ(guard1.stream(), test_stream1_); + EXPECT_EQ(guard1.device_index(), 0); + + CUDAStreamGuard guard2 = std::move(guard1); + + EXPECT_EQ(guard2.stream(), test_stream1_); + EXPECT_EQ(guard2.device_index(), 0); + + auto current_stream_result = getCurrentCUDAStream(0); + ASSERT_TRUE(current_stream_result.ok()); + EXPECT_EQ(current_stream_result.get(), test_stream1_); +} + +TEST_F(CUDAStreamGuardTest, MoveConstructorRestoresOnlyOnce) { + cudaStream_t initial_stream; + ASSERT_EQ(cudaStreamCreate(&initial_stream), cudaSuccess); + + Error err = setCurrentCUDAStream(initial_stream, 0); + ASSERT_EQ(err, Error::Ok); + + { + auto guard1_result = CUDAStreamGuard::create(test_stream1_, 0); + ASSERT_TRUE(guard1_result.ok()); + 
CUDAStreamGuard guard1 = std::move(guard1_result.get()); + + { CUDAStreamGuard guard2 = std::move(guard1); } + + auto stream_result = getCurrentCUDAStream(0); + ASSERT_TRUE(stream_result.ok()); + EXPECT_EQ(stream_result.get(), initial_stream); + } + + auto final_stream_result = getCurrentCUDAStream(0); + ASSERT_TRUE(final_stream_result.ok()); + EXPECT_EQ(final_stream_result.get(), initial_stream); + + ASSERT_EQ(cudaStreamDestroy(initial_stream), cudaSuccess); +} + +TEST_F(CUDAStreamGuardTest, InvalidDeviceIndex) { + auto guard_result = CUDAStreamGuard::create(test_stream1_, 999); + EXPECT_FALSE(guard_result.ok()); +} + +TEST_F(CUDAStreamGuardTest, NegativeDeviceIndex) { + auto guard_result = CUDAStreamGuard::create(test_stream1_, -2); + EXPECT_FALSE(guard_result.ok()); +} + +TEST_F(CUDAStreamGuardTest, CopyConstructorDeleted) { + static_assert( + !std::is_copy_constructible_v, + "CUDAStreamGuard should not be copy constructible"); +} + +TEST_F(CUDAStreamGuardTest, CopyAssignmentDeleted) { + static_assert( + !std::is_copy_assignable_v, + "CUDAStreamGuard should not be copy assignable"); +} + +TEST_F(CUDAStreamGuardTest, MoveAssignmentDeleted) { + static_assert( + !std::is_move_assignable_v, + "CUDAStreamGuard should not be move assignable"); +} + +TEST_F(CUDAStreamGuardTest, NullStreamPointer) { + auto guard_result = CUDAStreamGuard::create(nullptr, 0); + ASSERT_TRUE(guard_result.ok()); + CUDAStreamGuard guard = std::move(guard_result.get()); + + EXPECT_EQ(guard.stream(), nullptr); + + auto current_stream_result = getCurrentCUDAStream(0); + ASSERT_TRUE(current_stream_result.ok()); +} diff --git a/backends/cuda/runtime/TARGETS b/backends/cuda/runtime/TARGETS index 024418d31a6..ad5baa8d83f 100644 --- a/backends/cuda/runtime/TARGETS +++ b/backends/cuda/runtime/TARGETS @@ -109,6 +109,36 @@ runtime.cxx_library( ], ) +runtime.cxx_library( + name = "runtime_shims_slim", + srcs = [ + "shims/memory_slim.cpp", + ], + headers = [ + "shims/memory_slim.h", + ], + # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole) + link_whole = True, + supports_python_dlopen = True, + visibility = ["@EXECUTORCH_CLIENTS"], + preprocessor_flags = ["-DCUDA_AVAILABLE=1"], + deps = [ + "//executorch/backends/aoti/slim/core:slimtensor", + "//executorch/backends/aoti/slim/factory:empty", + "//executorch/backends/aoti/slim/factory:from_blob", + "//executorch/backends/aoti:common_shims", + "//executorch/runtime/core:core", + "//executorch/runtime/platform:platform", + ], + nvcc_flags = get_nvcc_arch_args() + [ + "-_NVCC_HOST_COMPILER_FLAG_", + "gcc", + ], + external_deps = [ + ("cuda", None, "cuda-lazy"), + ], +) + runtime.cxx_library( name = "cuda_backend", srcs = [ diff --git a/backends/cuda/runtime/shims/memory_slim.cpp b/backends/cuda/runtime/shims/memory_slim.cpp new file mode 100644 index 00000000000..500cd41308e --- /dev/null +++ b/backends/cuda/runtime/shims/memory_slim.cpp @@ -0,0 +1,81 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include +#include +#include + +namespace executorch::backends::cuda { + +namespace c10 = executorch::backends::aoti::slim::c10; +using c10::Device; +using c10::DeviceIndex; +using c10::DeviceType; +using c10::ScalarType; +using executorch::backends::aoti::slim::empty_strided; +using executorch::backends::aoti::slim::from_blob; +using executorch::backends::aoti::slim::IntArrayRef; + +extern "C" { + +AOTITorchError aoti_torch_create_tensor_from_blob_v2( + void* data, + int64_t ndim, + const int64_t* sizes_ptr, + const int64_t* strides_ptr, + int64_t storage_offset, + int32_t dtype, + int32_t device_type, + int32_t device_index, + Tensor** ret_new_tensor, + int32_t layout, + const uint8_t* opaque_metadata, + int64_t opaque_metadata_size) { + // Unused parameters + (void)layout; + (void)opaque_metadata; + (void)opaque_metadata_size; + + ET_CHECK_OR_RETURN_ERROR( + data != nullptr, + InvalidArgument, + "aoti_torch_create_tensor_from_blob_v2: data is null"); + + ET_CHECK_OR_RETURN_ERROR( + ret_new_tensor != nullptr, + InvalidArgument, + "aoti_torch_create_tensor_from_blob_v2: ret_new_tensor is null"); + + ET_CHECK_OR_RETURN_ERROR( + !(sizes_ptr == nullptr && ndim > 0), + InvalidArgument, + "aoti_torch_create_tensor_from_blob_v2: sizes_ptr is null but ndim > 0"); + + IntArrayRef sizes(sizes_ptr, static_cast(ndim)); + IntArrayRef strides(strides_ptr, static_cast(ndim)); + + // Create the SlimTensor using from_blob (non-owning) + *ret_new_tensor = new Tensor(from_blob( + data, + sizes, + strides, + static_cast(dtype), + Device( + static_cast(device_type), + static_cast(device_index)), + storage_offset)); + + return Error::Ok; +} + +} // extern "C" + +} // namespace executorch::backends::cuda diff --git a/backends/cuda/runtime/shims/memory_slim.h b/backends/cuda/runtime/shims/memory_slim.h new file mode 100644 index 00000000000..7650c4de4b6 --- /dev/null +++ b/backends/cuda/runtime/shims/memory_slim.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include +#include +#include +#include + +namespace executorch::backends::cuda { + +using executorch::runtime::Error; +using AOTITorchError = Error; +using Tensor = executorch::backends::aoti::slim::SlimTensor; + +extern "C" { + +/** + * Creates a tensor object from an existing memory blob without copying the + * data. The tensor will wrap the provided memory and will not take ownership of + * it. When the tensor is deleted, the original memory will remain valid and + * must be freed by the caller. 
+ * + * @param data Pointer to the memory blob to wrap (must not be null) + * @param ndim Number of dimensions in the tensor + * @param sizes_ptr Pointer to array of dimension sizes + * @param strides_ptr Pointer to array of strides for each dimension + * @param storage_offset Storage offset in number of elements + * @param dtype Data type identifier (matches PyTorch scalar types) + * @param device_type Device type (CPU=0, CUDA=1) + * @param device_index Device index + * @param ret_new_tensor Output parameter for the created tensor + * @param layout Tensor layout identifier (0=strided) + * @param opaque_metadata Optional metadata pointer (can be null) + * @param opaque_metadata_size Size of opaque metadata in bytes + * @return AOTITorchError error code (Error::Ok on success) + */ +AOTI_SHIM_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob_v2( + void* data, + int64_t ndim, + const int64_t* sizes_ptr, + const int64_t* strides_ptr, + int64_t storage_offset, + int32_t dtype, + int32_t device_type, + int32_t device_index, + Tensor** ret_new_tensor, + int32_t layout, + const uint8_t* opaque_metadata, + int64_t opaque_metadata_size); + +} // extern "C" + +} // namespace executorch::backends::cuda diff --git a/backends/cuda/runtime/shims/tests/targets.bzl b/backends/cuda/runtime/shims/tests/targets.bzl index 256b375f20d..78f8dea20ce 100644 --- a/backends/cuda/runtime/shims/tests/targets.bzl +++ b/backends/cuda/runtime/shims/tests/targets.bzl @@ -25,12 +25,40 @@ def cuda_shim_cpp_unittest(name): ), ) +def cuda_shim_slim_cpp_unittest(name): + """Unittest for SlimTensor-based shim functions.""" + cpp_unittest( + name = "test_" + name + "_slim", + srcs = [ + "test_" + name + "_slim.cpp", + ], + deps = [ + "//executorch/backends/cuda/runtime:runtime_shims_slim", + "//executorch/backends/aoti:common_shims", + "//executorch/backends/aoti/slim/core:slimtensor", + "//executorch/backends/aoti/slim/factory:empty", + "//executorch/backends/aoti/slim/factory:from_blob", + "//executorch/runtime/core:core", + "//executorch/runtime/platform:platform", + ], + + external_deps = [ + ("cuda", None, "cuda-lazy"), + ], + preprocessor_flags = ["-DCUDA_AVAILABLE=1"], + keep_gpu_sections = True, + remote_execution = re_test_utils.remote_execution( + platform = "gpu-remote-execution", + ), + ) + def define_common_targets(): """Defines targets that should be shared between fbcode and xplat. The directory containing this targets.bzl file should also contain both TARGETS and BUCK files that call this function. """ + # Original ETensor-based shim tests, will be removed after migration cuda_shim_cpp_unittest("aoti_torch_empty_strided") cuda_shim_cpp_unittest("aoti_torch_delete_tensor_object") cuda_shim_cpp_unittest("aoti_torch_create_tensor_from_blob_v2") @@ -41,3 +69,6 @@ def define_common_targets(): cuda_shim_cpp_unittest("aoti_torch_new_tensor_handle") cuda_shim_cpp_unittest("aoti_torch_item_bool") cuda_shim_cpp_unittest("aoti_torch_assign_tensors_out") + + # SlimTensor-based shim tests + cuda_shim_slim_cpp_unittest("aoti_torch_create_tensor_from_blob_v2") diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2_slim.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2_slim.cpp new file mode 100644 index 00000000000..21f8c79cc46 --- /dev/null +++ b/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2_slim.cpp @@ -0,0 +1,633 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +using namespace executorch::backends::cuda; +using executorch::runtime::Error; + +namespace slim_c10 = executorch::backends::aoti::slim::c10; + +namespace { + +// Helper to check if CUDA is available +bool isCudaAvailable() { + int device_count = 0; + cudaError_t err = cudaGetDeviceCount(&device_count); + return (err == cudaSuccess && device_count > 0); +} + +// Helper to calculate contiguous strides from sizes +std::vector calculateContiguousStrides( + const std::vector& sizes) { + std::vector strides(sizes.size()); + if (sizes.empty()) { + return strides; + } + strides[sizes.size() - 1] = 1; + for (int64_t i = static_cast(sizes.size()) - 2; i >= 0; i--) { + strides[i] = strides[i + 1] * sizes[i + 1]; + } + return strides; +} + +// Helper to calculate numel from sizes +int64_t calculateNumel(const std::vector& sizes) { + int64_t numel = 1; + for (int64_t size : sizes) { + numel *= size; + } + return numel; +} + +} // namespace + +// Test fixture for SlimTensor-based aoti_torch_create_tensor_from_blob_v2 tests +class AOTITorchCreateTensorFromBlobV2SlimTest : public ::testing::Test { + protected: + void SetUp() override { + et_pal_init(); + } + + void TearDown() override { + // Clean up tensors + for (Tensor* t : tensors_) { + delete t; + } + tensors_.clear(); + + // Clean up CUDA memory + for (void* ptr : cuda_memory_) { + if (ptr != nullptr) { + cudaFree(ptr); + } + } + cuda_memory_.clear(); + + // Clean up CPU memory + for (void* ptr : cpu_memory_) { + if (ptr != nullptr) { + free(ptr); + } + } + cpu_memory_.clear(); + } + + void* allocateCudaMemory(size_t bytes) { + void* ptr = nullptr; + cudaError_t err = cudaMalloc(&ptr, bytes); + if (err == cudaSuccess && ptr != nullptr) { + cuda_memory_.push_back(ptr); + } + return ptr; + } + + void* allocateCpuMemory(size_t bytes) { + void* ptr = nullptr; + int result = posix_memalign(&ptr, 16, bytes); + if (result == 0 && ptr != nullptr) { + cpu_memory_.push_back(ptr); + } + return ptr; + } + + void trackTensor(Tensor* t) { + if (t != nullptr) { + tensors_.push_back(t); + } + } + + private: + std::vector tensors_; + std::vector cuda_memory_; + std::vector cpu_memory_; +}; + +// ============================================================================ +// Common test body - parameterized by device type +// ============================================================================ + +void runBasicFromBlobTest( + AOTITorchCreateTensorFromBlobV2SlimTest* fixture, + void* data, + int32_t device_type, + int32_t device_index) { + std::vector sizes = {2, 3}; + std::vector strides = calculateContiguousStrides(sizes); + + Tensor* tensor = nullptr; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + data, + sizes.size(), + sizes.data(), + strides.data(), + 0, // storage_offset + static_cast(slim_c10::ScalarType::Float), + device_type, + device_index, + &tensor, + 0, // layout + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(tensor, nullptr); + + // Check tensor properties + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 2); + EXPECT_EQ(tensor->size(1), 3); + EXPECT_EQ(tensor->numel(), 6); + EXPECT_EQ( + static_cast(tensor->dtype()), + static_cast(slim_c10::ScalarType::Float)); + + // Verify the tensor uses the same data pointer (non-owning) + 
EXPECT_EQ(tensor->data_ptr(), data); + + // Cleanup - tensor should NOT free the original memory + delete tensor; +} + +void runScalarFromBlobTest( + AOTITorchCreateTensorFromBlobV2SlimTest* fixture, + void* data, + int32_t device_type, + int32_t device_index) { + std::vector sizes = {}; // 0D tensor + std::vector strides = {}; + + Tensor* tensor = nullptr; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + data, + sizes.size(), + sizes.data(), + strides.data(), + 0, // storage_offset + static_cast(slim_c10::ScalarType::Float), + device_type, + device_index, + &tensor, + 0, // layout + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(tensor, nullptr); + + EXPECT_EQ(tensor->dim(), 0); + EXPECT_EQ(tensor->numel(), 1); + EXPECT_EQ(tensor->data_ptr(), data); + + delete tensor; +} + +void runMultiDimensionalFromBlobTest( + AOTITorchCreateTensorFromBlobV2SlimTest* fixture, + void* data, + int32_t device_type, + int32_t device_index) { + std::vector sizes = {2, 3, 4}; + std::vector strides = calculateContiguousStrides(sizes); + + Tensor* tensor = nullptr; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + data, + sizes.size(), + sizes.data(), + strides.data(), + 0, // storage_offset + static_cast(slim_c10::ScalarType::Float), + device_type, + device_index, + &tensor, + 0, // layout + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(tensor, nullptr); + + EXPECT_EQ(tensor->dim(), 3); + EXPECT_EQ(tensor->size(0), 2); + EXPECT_EQ(tensor->size(1), 3); + EXPECT_EQ(tensor->size(2), 4); + EXPECT_EQ(tensor->numel(), 24); + EXPECT_EQ(tensor->data_ptr(), data); + + delete tensor; +} + +void runCustomStridesFromBlobTest( + AOTITorchCreateTensorFromBlobV2SlimTest* fixture, + void* data, + int32_t device_type, + int32_t device_index) { + std::vector sizes = {3, 4}; + std::vector strides = {1, 3}; // Column-major + + Tensor* tensor = nullptr; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + data, + sizes.size(), + sizes.data(), + strides.data(), + 0, // storage_offset + static_cast(slim_c10::ScalarType::Float), + device_type, + device_index, + &tensor, + 0, // layout + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(tensor, nullptr); + + EXPECT_EQ(tensor->stride(0), 1); + EXPECT_EQ(tensor->stride(1), 3); + EXPECT_FALSE(tensor->is_contiguous()); + EXPECT_EQ(tensor->data_ptr(), data); + + delete tensor; +} + +void runStorageOffsetFromBlobTest( + AOTITorchCreateTensorFromBlobV2SlimTest* fixture, + void* data, + int32_t device_type, + int32_t device_index) { + std::vector sizes = {2, 2}; + std::vector strides = calculateContiguousStrides(sizes); + + Tensor* tensor = nullptr; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + data, + sizes.size(), + sizes.data(), + strides.data(), + 2, // storage_offset = 2 elements + static_cast(slim_c10::ScalarType::Float), + device_type, + device_index, + &tensor, + 0, // layout + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(tensor, nullptr); + + EXPECT_EQ(tensor->storage_offset(), 2); + // data_ptr should point to base + offset * itemsize + char* expected_ptr = static_cast(data) + 2 * sizeof(float); + EXPECT_EQ(tensor->data_ptr(), expected_ptr); + + delete tensor; +} + +// ============================================================================ +// CPU Tests +// 
============================================================================ + +TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, BasicFunctionality_CPU) { + size_t bytes = 6 * sizeof(float); + void* data = allocateCpuMemory(bytes); + ASSERT_NE(data, nullptr); + + runBasicFromBlobTest( + this, data, static_cast(slim_c10::DeviceType::CPU), 0); +} + +TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, ScalarTensor_CPU) { + size_t bytes = sizeof(float); + void* data = allocateCpuMemory(bytes); + ASSERT_NE(data, nullptr); + + runScalarFromBlobTest( + this, data, static_cast(slim_c10::DeviceType::CPU), 0); +} + +TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, MultiDimensional_CPU) { + size_t bytes = 24 * sizeof(float); + void* data = allocateCpuMemory(bytes); + ASSERT_NE(data, nullptr); + + runMultiDimensionalFromBlobTest( + this, data, static_cast(slim_c10::DeviceType::CPU), 0); +} + +TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, CustomStrides_CPU) { + size_t bytes = 12 * sizeof(float); + void* data = allocateCpuMemory(bytes); + ASSERT_NE(data, nullptr); + + runCustomStridesFromBlobTest( + this, data, static_cast(slim_c10::DeviceType::CPU), 0); +} + +TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, StorageOffset_CPU) { + // Allocate extra space for offset + size_t bytes = 6 * sizeof(float); // 2 for offset + 4 for tensor + void* data = allocateCpuMemory(bytes); + ASSERT_NE(data, nullptr); + + runStorageOffsetFromBlobTest( + this, data, static_cast(slim_c10::DeviceType::CPU), 0); +} + +// ============================================================================ +// CUDA Tests +// ============================================================================ + +TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, BasicFunctionality_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + + size_t bytes = 6 * sizeof(float); + void* data = allocateCudaMemory(bytes); + ASSERT_NE(data, nullptr); + + runBasicFromBlobTest( + this, data, static_cast(slim_c10::DeviceType::CUDA), 0); +} + +TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, ScalarTensor_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + + size_t bytes = sizeof(float); + void* data = allocateCudaMemory(bytes); + ASSERT_NE(data, nullptr); + + runScalarFromBlobTest( + this, data, static_cast(slim_c10::DeviceType::CUDA), 0); +} + +TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, MultiDimensional_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + + size_t bytes = 24 * sizeof(float); + void* data = allocateCudaMemory(bytes); + ASSERT_NE(data, nullptr); + + runMultiDimensionalFromBlobTest( + this, data, static_cast(slim_c10::DeviceType::CUDA), 0); +} + +TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, CustomStrides_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + + size_t bytes = 12 * sizeof(float); + void* data = allocateCudaMemory(bytes); + ASSERT_NE(data, nullptr); + + runCustomStridesFromBlobTest( + this, data, static_cast(slim_c10::DeviceType::CUDA), 0); +} + +TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, StorageOffset_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + + // Allocate extra space for offset + size_t bytes = 6 * sizeof(float); + void* data = allocateCudaMemory(bytes); + ASSERT_NE(data, nullptr); + + runStorageOffsetFromBlobTest( + this, data, static_cast(slim_c10::DeviceType::CUDA), 0); +} + +// ============================================================================ +// Verify 
Non-Owning Behavior +// ============================================================================ + +TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, NonOwningBehavior_CPU) { + size_t bytes = 6 * sizeof(float); + void* data = allocateCpuMemory(bytes); + ASSERT_NE(data, nullptr); + + // Write a pattern + float* float_data = static_cast(data); + float_data[0] = 42.0f; + + std::vector sizes = {2, 3}; + std::vector strides = calculateContiguousStrides(sizes); + + Tensor* tensor = nullptr; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + data, + sizes.size(), + sizes.data(), + strides.data(), + 0, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0, + &tensor, + 0, + nullptr, + 0); + + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(tensor, nullptr); + + // Delete tensor - memory should NOT be freed + delete tensor; + tensor = nullptr; + + // Memory should still be accessible + EXPECT_FLOAT_EQ(float_data[0], 42.0f); +} + +TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, NonOwningBehavior_CUDA) { + if (!isCudaAvailable()) { + GTEST_SKIP() << "CUDA not available"; + } + + size_t bytes = 6 * sizeof(float); + void* data = allocateCudaMemory(bytes); + ASSERT_NE(data, nullptr); + + // Write a pattern + float pattern = 42.0f; + cudaMemcpy(data, &pattern, sizeof(float), cudaMemcpyHostToDevice); + + std::vector sizes = {2, 3}; + std::vector strides = calculateContiguousStrides(sizes); + + Tensor* tensor = nullptr; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + data, + sizes.size(), + sizes.data(), + strides.data(), + 0, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CUDA), + 0, + &tensor, + 0, + nullptr, + 0); + + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(tensor, nullptr); + + // Delete tensor - memory should NOT be freed + delete tensor; + tensor = nullptr; + + // Memory should still be accessible + float readback = 0.0f; + cudaError_t cuda_err = + cudaMemcpy(&readback, data, sizeof(float), cudaMemcpyDeviceToHost); + EXPECT_EQ(cuda_err, cudaSuccess); + EXPECT_FLOAT_EQ(readback, 42.0f); +} + +// ============================================================================ +// Error Cases +// ============================================================================ + +TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, NullDataPointer) { + std::vector sizes = {2, 3}; + std::vector strides = calculateContiguousStrides(sizes); + + Tensor* tensor = nullptr; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + nullptr, // null data + sizes.size(), + sizes.data(), + strides.data(), + 0, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0, + &tensor, + 0, + nullptr, + 0); + + EXPECT_EQ(error, Error::InvalidArgument); +} + +TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, NullReturnPointer) { + size_t bytes = 6 * sizeof(float); + void* data = allocateCpuMemory(bytes); + ASSERT_NE(data, nullptr); + + std::vector sizes = {2, 3}; + std::vector strides = calculateContiguousStrides(sizes); + + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + data, + sizes.size(), + sizes.data(), + strides.data(), + 0, + static_cast(slim_c10::ScalarType::Float), + static_cast(slim_c10::DeviceType::CPU), + 0, + nullptr, // null return pointer + 0, + nullptr, + 0); + + EXPECT_EQ(error, Error::InvalidArgument); +} + +// ============================================================================ +// Verify Device Properties +// 
============================================================================
+
+TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, VerifyCPUDevice) {
+  size_t bytes = 6 * sizeof(float);
+  void* data = allocateCpuMemory(bytes);
+  ASSERT_NE(data, nullptr);
+
+  std::vector<int64_t> sizes = {2, 3};
+  std::vector<int64_t> strides = calculateContiguousStrides(sizes);
+
+  Tensor* tensor = nullptr;
+  AOTITorchError error = aoti_torch_create_tensor_from_blob_v2(
+      data,
+      sizes.size(),
+      sizes.data(),
+      strides.data(),
+      0,
+      static_cast<int32_t>(slim_c10::ScalarType::Float),
+      static_cast<int32_t>(slim_c10::DeviceType::CPU),
+      0,
+      &tensor,
+      0,
+      nullptr,
+      0);
+
+  EXPECT_EQ(error, Error::Ok);
+  ASSERT_NE(tensor, nullptr);
+
+  EXPECT_TRUE(tensor->is_cpu());
+  EXPECT_FALSE(tensor->is_cuda());
+  EXPECT_EQ(tensor->device_type(), slim_c10::DeviceType::CPU);
+
+  delete tensor;
+}
+
+TEST_F(AOTITorchCreateTensorFromBlobV2SlimTest, VerifyCUDADevice) {
+  if (!isCudaAvailable()) {
+    GTEST_SKIP() << "CUDA not available";
+  }
+
+  size_t bytes = 6 * sizeof(float);
+  void* data = allocateCudaMemory(bytes);
+  ASSERT_NE(data, nullptr);
+
+  std::vector<int64_t> sizes = {2, 3};
+  std::vector<int64_t> strides = calculateContiguousStrides(sizes);
+
+  Tensor* tensor = nullptr;
+  AOTITorchError error = aoti_torch_create_tensor_from_blob_v2(
+      data,
+      sizes.size(),
+      sizes.data(),
+      strides.data(),
+      0,
+      static_cast<int32_t>(slim_c10::ScalarType::Float),
+      static_cast<int32_t>(slim_c10::DeviceType::CUDA),
+      0,
+      &tensor,
+      0,
+      nullptr,
+      0);
+
+  EXPECT_EQ(error, Error::Ok);
+  ASSERT_NE(tensor, nullptr);
+
+  EXPECT_FALSE(tensor->is_cpu());
+  EXPECT_TRUE(tensor->is_cuda());
+  EXPECT_EQ(tensor->device_type(), slim_c10::DeviceType::CUDA);
+
+  delete tensor;
+}
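
Usage note (illustrative, not part of the patch): a minimal sketch of how calling code might combine the Result-based guard factories above with ET_CUDA_CHECK_OR_RETURN_ERROR. The helper name run_with_stream_guard and its arguments are made up for the example; include paths assume the Buck-style layout used elsewhere in ExecuTorch.

#include <cstddef>
#include <utility>

#include <cuda_runtime.h>

#include <executorch/backends/aoti/slim/c10/cuda/Exception.h>
#include <executorch/backends/aoti/slim/cuda/guard.h>

using executorch::backends::cuda::CUDAStreamGuard;
using executorch::runtime::Error;

// Hypothetical helper: run an async device-to-device copy on `stream` with
// device 0 current, restoring the previous stream and device on every exit
// path via the RAII guard.
Error run_with_stream_guard(
    cudaStream_t stream,
    const float* src,
    float* dst,
    size_t bytes) {
  auto guard_result = CUDAStreamGuard::create(stream, /*device_index=*/0);
  if (!guard_result.ok()) {
    return guard_result.error();
  }
  CUDAStreamGuard guard = std::move(guard_result.get());

  // The macro converts any cudaError_t into Error::Internal instead of
  // aborting, matching the _OR_RETURN_ERROR variants added above.
  ET_CUDA_CHECK_OR_RETURN_ERROR(
      cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToDevice, stream));
  ET_CUDA_CHECK_OR_RETURN_ERROR(cudaStreamSynchronize(stream));

  return Error::Ok;
}

Because CUDAStreamGuard owns a CUDAGuard, destruction restores the stream first and then the device, so the surrounding code sees its original context even on early returns.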
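
Usage note (illustrative, not part of the patch): a sketch of wrapping an existing device allocation with the new slim aoti_torch_create_tensor_from_blob_v2 shim. The numeric dtype and device codes are assumptions (Float = 6 following PyTorch's ScalarType numbering, device_type = 1 for CUDA per the header comment); the helper wrap_device_buffer is hypothetical.

#include <cstdint>

#include <cuda_runtime.h>

#include <executorch/backends/cuda/runtime/shims/memory_slim.h>

namespace etcuda = executorch::backends::cuda;

// Hypothetical helper: wrap an existing 2x3 float CUDA allocation as a
// non-owning SlimTensor. The caller keeps ownership of device_data.
etcuda::AOTITorchError wrap_device_buffer(
    void* device_data,
    etcuda::Tensor** out) {
  const int64_t sizes[] = {2, 3};
  const int64_t strides[] = {3, 1}; // contiguous row-major

  return etcuda::aoti_torch_create_tensor_from_blob_v2(
      device_data,
      /*ndim=*/2,
      sizes,
      strides,
      /*storage_offset=*/0,
      /*dtype=*/6, // assumed: Float in PyTorch's ScalarType numbering
      /*device_type=*/1, // CUDA, per the header comment (CPU=0, CUDA=1)
      /*device_index=*/0,
      out,
      /*layout=*/0, // strided
      /*opaque_metadata=*/nullptr,
      /*opaque_metadata_size=*/0);
}

Since the returned tensor is non-owning, the caller deletes the Tensor object when done and separately frees device_data with cudaFree.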