diff --git a/runtime/common/callbacks.h b/runtime/common/callbacks.h
index 09180aa27e..7573d6a298 100644
--- a/runtime/common/callbacks.h
+++ b/runtime/common/callbacks.h
@@ -54,6 +54,9 @@ typedef struct {
   // Copy bytes from device memory to host
   int (*copy_from_dev) (void* host_ptr, vx_buffer_h hbuffer, uint64_t src_offset, uint64_t size);
 
+  // Copy bytes from device memory to device memory
+  int (*copy_dev_to_dev) (vx_buffer_h hdest_buffer, uint64_t dest_offset, vx_buffer_h hsrc_buffer, uint64_t src_offset, uint64_t size);
+
   // Start device execution
   int (*start) (vx_device_h hdevice, vx_buffer_h hkernel, vx_buffer_h harguments);
 
diff --git a/runtime/common/callbacks.inc b/runtime/common/callbacks.inc
index 84a77718ed..bfa8f461f0 100644
--- a/runtime/common/callbacks.inc
+++ b/runtime/common/callbacks.inc
@@ -169,6 +169,29 @@ extern int vx_dev_init(callbacks_t* callbacks) {
     return device->download(host_ptr, buffer->addr + src_offset, size);
   };
 
+  // Copy `size` bytes between two device buffers. Returns -1 on a null handle,
+  // on buffers owned by different devices, or on an out-of-range span;
+  // otherwise returns the result of the device's copy().
+  callbacks->copy_dev_to_dev = [](vx_buffer_h hdest_buffer, uint64_t dest_offset, vx_buffer_h hsrc_buffer, uint64_t src_offset, uint64_t size) {
+    if (nullptr == hdest_buffer || nullptr == hsrc_buffer)
+      return -1;
+    auto dest_buffer = ((vx_buffer*)hdest_buffer);
+    auto src_buffer = ((vx_buffer*)hsrc_buffer);
+    // the copy is issued on the destination's device, so both buffers must belong to it
+    if (dest_buffer->device != src_buffer->device)
+      return -1;
+    auto device = ((vx_device*)dest_buffer->device);
+    // overflow-safe bounds checks: "offset + size" could wrap around for huge values
+    if (size > dest_buffer->size || dest_offset > (dest_buffer->size - size)
+     || size > src_buffer->size || src_offset > (src_buffer->size - size))
+      return -1;
+    DBGPRINT("COPY_DEV_TO_DEV: hdest_buffer=%p, dest_offset=%ld, hsrc_buffer=%p, src_offset=%ld, size=%ld\n",
+      hdest_buffer, dest_offset, hsrc_buffer, src_offset, size);
+    return device->copy(dest_buffer->addr + dest_offset,
+                        src_buffer->addr + src_offset,
+                        size);
+  };
+
   callbacks->start = [](vx_device_h hdevice, vx_buffer_h hkernel, vx_buffer_h harguments) {
     if (nullptr == hdevice || nullptr == hkernel || nullptr == harguments)
       return -1;
diff --git a/runtime/include/vortex.h b/runtime/include/vortex.h
index 6e3bda07d2..7f5da8576f 100644
--- a/runtime/include/vortex.h
+++ b/runtime/include/vortex.h
@@ -104,6 +104,8 @@ int vx_copy_to_dev(vx_buffer_h hbuffer, const void* host_ptr, uint64_t dst_offse
 
 // Copy bytes from device memory to host
 int vx_copy_from_dev(void* host_ptr, vx_buffer_h hbuffer, uint64_t src_offset, uint64_t size);
+// Copy bytes from device memory to device memory
+int vx_copy_dev_to_dev(vx_buffer_h hdest_buffer, uint64_t dest_offset, vx_buffer_h hsrc_buffer, uint64_t src_offset, uint64_t size);
 
 // Start device execution
 int vx_start(vx_device_h hdevice, vx_buffer_h hkernel, vx_buffer_h harguments);
diff --git a/runtime/opae/driver.cpp b/runtime/opae/driver.cpp
index 5048cf7543..33933b85a6 100644
--- a/runtime/opae/driver.cpp
+++ b/runtime/opae/driver.cpp
@@ -78,6 +78,9 @@ int drv_init(opae_drv_api_t* opae_drv_funcs) {
   SET_API (fpgaGetIOAddress);
   SET_API (fpgaWriteMMIO64);
   SET_API (fpgaReadMMIO64);
+  // NOTE(review): fpgaCopyBuffer is implemented by sim/opaesim in this change; confirm the
+  // OPAE library used on real hardware also exports it before resolving it unconditionally.
+  SET_API (fpgaCopyBuffer);
   SET_API (fpgaErrStr);
 
   return 0;
diff --git a/runtime/opae/driver.h b/runtime/opae/driver.h
index 0a45b6f678..c255bc3147 100644
--- a/runtime/opae/driver.h
+++ b/runtime/opae/driver.h
@@ -34,6 +34,7 @@ typedef fpga_result (*pfn_fpgaReleaseBuffer)(fpga_handle handle, uint64_t wsid);
 typedef fpga_result (*pfn_fpgaGetIOAddress)(fpga_handle handle, uint64_t wsid, uint64_t *ioaddr);
 typedef fpga_result (*pfn_fpgaWriteMMIO64)(fpga_handle handle, uint32_t mmio_num, uint64_t offset, uint64_t value);
 typedef fpga_result (*pfn_fpgaReadMMIO64)(fpga_handle handle, uint32_t mmio_num, uint64_t offset, uint64_t *value);
+typedef fpga_result (*pfn_fpgaCopyBuffer)(fpga_handle handle, uint64_t dest, uint64_t src, uint64_t size);
 typedef const char *(*pfn_fpgaErrStr)(fpga_result e);
 
 struct opae_drv_api_t {
@@ -52,6 +53,7 @@ struct opae_drv_api_t {
   pfn_fpgaGetIOAddress fpgaGetIOAddress;
   pfn_fpgaWriteMMIO64 fpgaWriteMMIO64;
   pfn_fpgaReadMMIO64 fpgaReadMMIO64;
+  pfn_fpgaCopyBuffer fpgaCopyBuffer;
   pfn_fpgaErrStr fpgaErrStr;
 };
 
diff --git a/runtime/opae/vortex.cpp b/runtime/opae/vortex.cpp
index 38ee514abf..f288395703 100755
--- a/runtime/opae/vortex.cpp
+++ b/runtime/opae/vortex.cpp
@@ -288,6 +288,26 @@ class vx_device {
     return 0;
   }
 
+  // Copy `size` bytes of device memory from src_addr to dest_addr through
+  // the driver's fpgaCopyBuffer entry point. Returns 0 on success, -1 on error.
+  int copy(uint64_t dest_addr, uint64_t src_addr, uint64_t size) {
+    // same address: nothing to move
+    if (dest_addr == src_addr) {
+      return 0;
+    }
+
+    // overflow-safe bounds check ("addr + size" could wrap around)
+    if (size > global_mem_size_ ||
+        dest_addr > global_mem_size_ - size ||
+        src_addr > global_mem_size_ - size)
+      return -1;
+
+    CHECK_FPGA_ERR(api_.fpgaCopyBuffer(fpga_, dest_addr, src_addr, size), {
+      return -1;
+    });
+    return 0;
+  }
+
   int upload(uint64_t dev_addr, const void *host_ptr, uint64_t size) {
     // check alignment
     if (!is_aligned(dev_addr, CACHE_BLOCK_SIZE))
diff --git a/runtime/rtlsim/vortex.cpp b/runtime/rtlsim/vortex.cpp
index ccf61e16f6..f1ccd5fb23 100644
--- a/runtime/rtlsim/vortex.cpp
+++ b/runtime/rtlsim/vortex.cpp
@@ -184,6 +184,20 @@ class vx_device {
     return 0;
   }
 
+  // Copy `size` bytes within the simulated RAM; ACL checking is switched off
+  // for the duration of the copy. Returns -1 if either range, rounded up to
+  // CACHE_BLOCK_SIZE, exceeds GLOBAL_MEM_SIZE, and 0 otherwise.
+  int copy(uint64_t dest_addr, uint64_t src_addr, uint64_t size) {
+    uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
+    if (src_addr + asize > GLOBAL_MEM_SIZE || dest_addr + asize > GLOBAL_MEM_SIZE)
+      return -1;
+
+    ram_.enable_acl(false);
+    ram_.copy(dest_addr, src_addr, size);
+    ram_.enable_acl(true);
+    return 0;
+  }
+
   int start(uint64_t krnl_addr, uint64_t args_addr) {
     // ensure prior run completed
     if (future_.valid()) {
diff --git a/runtime/simx/vortex.cpp b/runtime/simx/vortex.cpp
index 7010d4f55e..850a09b9e1 100644
--- a/runtime/simx/vortex.cpp
+++ b/runtime/simx/vortex.cpp
@@ -311,6 +311,28 @@ class vx_device {
     return 0;
   }
 
+  // Copy `size` bytes within the simulated RAM; ACL checking is switched off
+  // for the duration of the copy. Returns -1 if either range, rounded up to
+  // CACHE_BLOCK_SIZE, exceeds GLOBAL_MEM_SIZE, and 0 otherwise.
+  int copy(uint64_t dest_addr, uint64_t src_addr, uint64_t size) {
+    uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
+    if (src_addr + asize > GLOBAL_MEM_SIZE || dest_addr + asize > GLOBAL_MEM_SIZE)
+      return -1;
+#ifdef VM_ENABLE
+    // NOTE(review): only the first address of each range is translated; a copy that
+    // crosses a page boundary presumably needs per-page translation - confirm.
+    uint64_t pAddr_src = page_table_walk(src_addr);
+    uint64_t pAddr_dest = page_table_walk(dest_addr);
+    DBGPRINT(" [RT:copy] Copy data from vAddr = 0x%lx (pAddr=0x%lx) to vAddr = 0x%lx (pAddr=0x%lx)\n", src_addr, pAddr_src, dest_addr, pAddr_dest);
+    src_addr = pAddr_src;
+    dest_addr = pAddr_dest;
+#endif
+    ram_.enable_acl(false);
+    ram_.copy(dest_addr, src_addr, size);
+    ram_.enable_acl(true);
+    return 0;
+  }
+
   int start(uint64_t krnl_addr, uint64_t args_addr) {
     // ensure prior run completed
     if (future_.valid()) {
diff --git a/runtime/stub/vortex.cpp b/runtime/stub/vortex.cpp
index 70b95dcbc4..15f6ae9225 100644
--- a/runtime/stub/vortex.cpp
+++ b/runtime/stub/vortex.cpp
@@ -139,6 +139,10 @@ extern int vx_copy_from_dev(void* host_ptr, vx_buffer_h hbuffer, uint64_t src_of
   return (g_callbacks.copy_from_dev)(host_ptr, hbuffer, src_offset, size);
 }
 
+extern int vx_copy_dev_to_dev(vx_buffer_h hdest_buffer, uint64_t dest_offset, vx_buffer_h hsrc_buffer, uint64_t src_offset, uint64_t size) {
+  return (g_callbacks.copy_dev_to_dev)(hdest_buffer, dest_offset, hsrc_buffer, src_offset, size);
+}
+
 extern int vx_start(vx_device_h hdevice, vx_buffer_h hkernel, vx_buffer_h harguments) {
   int profiling_mode = get_profiling_mode();
   if (profiling_mode != 0) {
diff --git a/runtime/xrt/vortex.cpp b/runtime/xrt/vortex.cpp
index 0942c700d5..4ebfb9c1b1 100644
--- a/runtime/xrt/vortex.cpp
+++ b/runtime/xrt/vortex.cpp
@@ -444,6 +444,79 @@ class vx_device {
     return 0;
   }
 
+  // Copy `size` bytes of device memory from src_addr to dest_addr.
+  // The range is split so that no chunk crosses a CACHE_BLOCK_SIZE boundary on
+  // either side; each chunk is resolved through get_bank_info()/get_buffer() and
+  // copied with the XRT buffer-copy call. Returns 0 on success, non-zero on failure.
+  // NOTE(review): chunks are copied in ascending order, so overlapping ranges with
+  // dest_addr > src_addr are presumably not safe - confirm callers never overlap.
+  int copy(uint64_t dest_addr, uint64_t src_addr, uint64_t size) {
+    if (dest_addr == src_addr) {
+      return 0;
+    }
+
+    // overflow-safe bound checking ("addr + size" could wrap around)
+    if (size > global_mem_size_ ||
+        dest_addr > global_mem_size_ - size ||
+        src_addr > global_mem_size_ - size)
+      return -1;
+
+    uint64_t offset = 0;
+    while (offset < size) {
+      uint64_t curr_src = src_addr + offset;
+      uint64_t curr_dest = dest_addr + offset;
+
+      uint64_t src_rem = CACHE_BLOCK_SIZE - (curr_src % CACHE_BLOCK_SIZE);
+      uint64_t dest_rem = CACHE_BLOCK_SIZE - (curr_dest % CACHE_BLOCK_SIZE);
+
+      uint64_t chunk_size = (src_rem < dest_rem) ? src_rem : dest_rem;
+      if (chunk_size > size - offset) {
+        chunk_size = size - offset;
+      }
+
+      uint32_t src_bo_idx, dst_bo_idx;
+      uint64_t src_bo_off, dst_bo_off;
+      xrt_buffer_t src_buf, dst_buf;
+
+      CHECK_ERR(this->get_bank_info(curr_src, &src_bo_idx, &src_bo_off), {
+        return err;
+      });
+#ifdef BANK_INTERLEAVE
+      src_bo_off += (curr_src % CACHE_BLOCK_SIZE);
+#endif
+
+      CHECK_ERR(this->get_buffer(src_bo_idx, &src_buf), {
+        return err;
+      });
+
+      CHECK_ERR(this->get_bank_info(curr_dest, &dst_bo_idx, &dst_bo_off), {
+        return err;
+      });
+#ifdef BANK_INTERLEAVE
+      dst_bo_off += (curr_dest % CACHE_BLOCK_SIZE);
+#endif
+
+      CHECK_ERR(this->get_buffer(dst_bo_idx, &dst_buf), {
+        return err;
+      });
+
+#ifdef CPP_API
+      dst_buf.copy(src_buf, chunk_size, src_bo_off, dst_bo_off);
+#else
+      // NOTE(review): the offset order (src, then dst) matches sim/xrtsim's xrtBOCopy
+      // declaration; verify it against the real XRT C API before running on hardware.
+      CHECK_ERR(xrtBOCopy(dst_buf, src_buf, chunk_size, src_bo_off, dst_bo_off), {
+        dump_xrt_error(xrtDevice_, err);
+        return err;
+      });
+#endif
+
+      offset += chunk_size;
+    }
+
+    return 0;
+  }
+
   int upload(uint64_t dev_addr, const void *src, uint64_t size) {
     auto host_ptr = (const uint8_t *)src;
 
diff --git a/sim/common/mem.cpp b/sim/common/mem.cpp
index 96b08ff8a2..8684d493cf 100644
--- a/sim/common/mem.cpp
+++ b/sim/common/mem.cpp
@@ -499,6 +499,32 @@ void RAM::write(const void* data, uint64_t addr, uint64_t size) {
   }
 }
 
+// Copy `size` bytes inside the RAM from src_addr to dest_addr.
+// Throws BadAddress when ACL checking is enabled and the source range fails
+// the 0x1 check or the destination range fails the 0x2 check.
+// The ranges may overlap: the direction is chosen like memmove so that every
+// source byte is read before it can be overwritten.
+void RAM::copy(uint64_t dest_addr, uint64_t src_addr, uint64_t size) {
+  if (check_acl_) {
+    if (acl_mngr_.check(src_addr, size, 0x1) == false ||
+        acl_mngr_.check(dest_addr, size, 0x2) == false) {
+      throw BadAddress();
+    }
+  }
+  if (dest_addr < src_addr) {
+    // destination is below the source: ascending order never clobbers unread bytes
+    for (uint64_t i = 0; i < size; i++) {
+      *this->get(dest_addr + i) = *this->get(src_addr + i);
+    }
+  } else if (dest_addr > src_addr) {
+    // destination is above the source: walk from the last byte down
+    for (uint64_t i = size; i > 0; i--) {
+      *this->get(dest_addr + i - 1) = *this->get(src_addr + i - 1);
+    }
+  }
+  // dest_addr == src_addr: nothing to do
+}
+
 void RAM::set_acl(uint64_t addr, uint64_t size, int flags) {
   if (capacity_ != 0 && (addr + size)> capacity_) {
     throw OutOfRange();
diff --git a/sim/common/mem.h b/sim/common/mem.h
index 5587adcdb3..5117e8d7ee 100644
--- a/sim/common/mem.h
+++ b/sim/common/mem.h
@@ -370,6 +370,8 @@ class RAM : public MemDevice {
 
   void read(void* data, uint64_t addr, uint64_t size) override;
   void write(const void* data, uint64_t addr, uint64_t size) override;
+  // copy `size` bytes within the RAM from src_addr to dest_addr
+  void copy (uint64_t dest_addr, uint64_t src_addr, uint64_t size);
 
   void loadBinImage(const char* filename, uint64_t destination);
   void loadHexImage(const char* filename);
diff --git a/sim/opaesim/fpga.cpp b/sim/opaesim/fpga.cpp
index d16ef97a15..e2a0382642 100644
--- a/sim/opaesim/fpga.cpp
+++ b/sim/opaesim/fpga.cpp
@@ -152,6 +152,18 @@ extern fpga_result fpgaReadMMIO64(fpga_handle handle, uint32_t mmio_num, uint64_
   return FPGA_OK;
 }
 
+extern fpga_result fpgaCopyBuffer(fpga_handle handle, uint64_t dest, uint64_t src, uint64_t size) {
+  if (NULL == handle)
+    return FPGA_INVALID_PARAM;
+
+  // NOTE(review): the cast below has no template argument in this copy of the patch
+  // (looks stripped); restore the simulator pointer type that provides copy().
+  auto sim = reinterpret_cast(handle);
+  sim->copy(dest, src, size);
+
+  return FPGA_OK;
+}
+
 extern const char *fpgaErrStr(fpga_result e) {
   return "";
 }
diff --git a/sim/opaesim/opae_sim.cpp b/sim/opaesim/opae_sim.cpp
index fe4d61857b..5358737a87 100644
--- a/sim/opaesim/opae_sim.cpp
+++ b/sim/opaesim/opae_sim.cpp
@@ -238,6 +238,13 @@ class opae_sim::Impl {
     device_->vcp2af_sRxPort_c0_mmioWrValid = 0;
   }
 
+  void copy(uint64_t dest, uint64_t src, uint64_t size) {
+
+    std::lock_guard guard(mutex_);
+
+    ram_->copy(dest, src, size);
+  }
+
 private:
 
   void reset() {
@@ -565,6 +572,10 @@ void opae_sim::write_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t value)
   impl_->write_mmio64(mmio_num, offset, value);
 }
 
+void opae_sim::copy(uint64_t dest, uint64_t src, uint64_t size) {
+  impl_->copy(dest, src, size);
+}
+
 void opae_sim::read_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t *value) {
   impl_->read_mmio64(mmio_num, offset, value);
 }
diff --git a/sim/opaesim/opae_sim.h b/sim/opaesim/opae_sim.h
index 454cc1bf74..78458a95c1 100644
--- a/sim/opaesim/opae_sim.h
+++ b/sim/opaesim/opae_sim.h
@@ -37,6 +37,8 @@ class opae_sim {
 
   void read_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t *value);
 
+  void copy(uint64_t dest, uint64_t src, uint64_t size);
+
 private:
 
   class Impl;
diff --git a/sim/xrtsim/xrt_c.cpp b/sim/xrtsim/xrt_c.cpp
index ea457c11ee..b5410e74ce 100644
--- a/sim/xrtsim/xrt_c.cpp
+++ b/sim/xrtsim/xrt_c.cpp
@@ -109,6 +109,23 @@ extern int xrtBORead(xrtBufferHandle bhdl, void* dst, size_t size, size_t offset
   return buffer->sim->mem_read(buffer->bank, buffer->addr + offset, size, dst);
 }
 
+extern int xrtBOCopy(xrtBufferHandle dst, xrtBufferHandle src, size_t size, size_t src_offset, size_t dst_offset) {
+  if (dst == nullptr || src == nullptr)
+    return -1;
+  // NOTE(review): both casts below have no template argument in this copy of the patch
+  // (looks stripped); restore the buffer pointer type used by the other xrtBO* functions.
+  auto dst_buffer = reinterpret_cast(dst);
+  auto src_buffer = reinterpret_cast(src);
+  int err = dst_buffer->sim->mem_copy(
+    dst_buffer->bank,
+    src_buffer->bank,
+    dst_buffer->addr + dst_offset,
+    src_buffer->addr + src_offset,
+    size
+  );
+  return err;
+}
+
 extern int xrtBOSync(xrtBufferHandle bhdl, enum xclBOSyncDirection dir, size_t size, size_t offset) {
   return 0;
 }
diff --git a/sim/xrtsim/xrt_c.h b/sim/xrtsim/xrt_c.h
index 0dbd5cf42d..c439af5829 100644
--- a/sim/xrtsim/xrt_c.h
+++ b/sim/xrtsim/xrt_c.h
@@ -98,6 +98,8 @@ int xrtBOWrite(xrtBufferHandle bhdl, const void* src, size_t size, size_t offset
 
 int xrtBORead(xrtBufferHandle bhdl, void* dst, size_t size, size_t offset);
 
+int xrtBOCopy(xrtBufferHandle dst, xrtBufferHandle src, size_t size, size_t src_offset, size_t dst_offset);
+
 int xrtBOSync(xrtBufferHandle bhdl, enum xclBOSyncDirection dir, size_t size, size_t offset);
 
 int xrtKernelWriteRegister(xrtKernelHandle kernelHandle, uint32_t offset, uint32_t data);
diff --git a/sim/xrtsim/xrt_sim.cpp b/sim/xrtsim/xrt_sim.cpp
index 76f156ef7c..bb5a431be9 100644
--- a/sim/xrtsim/xrt_sim.cpp
+++ b/sim/xrtsim/xrt_sim.cpp
@@ -248,6 +248,16 @@ class xrt_sim::Impl {
     return 0;
   }
 
+  int mem_copy(uint32_t bank_id_dest , uint32_t bank_id_src, uint64_t dest_addr, uint64_t src_addr, uint64_t size) {
+    std::lock_guard guard(mutex_);
+    if( bank_id_dest >= PLATFORM_MEMORY_NUM_BANKS || bank_id_src >= PLATFORM_MEMORY_NUM_BANKS)
+      return -1;
+    uint64_t dest_base_addr = bank_id_dest * mem_bank_size_ + dest_addr;
+    uint64_t src_base_addr = bank_id_src * mem_bank_size_ + src_addr;
+    ram_->copy(dest_base_addr, src_base_addr, size);
+    return 0;
+  }
+
   int register_write(uint32_t offset, uint32_t value) {
     std::lock_guard guard(mutex_);
 
@@ -649,6 +659,10 @@ int xrt_sim::mem_read(uint32_t bank_id, uint64_t addr, uint64_t size, void* data
   return impl_->mem_read(bank_id, addr, size, data);
 }
 
+int xrt_sim::mem_copy(uint32_t bank_id_dest , uint32_t bank_id_src, uint64_t dest_addr, uint64_t src_addr, uint64_t size) {
+  return impl_->mem_copy(bank_id_dest, bank_id_src, dest_addr, src_addr, size);
+}
+
 int xrt_sim::register_write(uint32_t offset, uint32_t value) {
   return impl_->register_write(offset, value);
 }
diff --git a/sim/xrtsim/xrt_sim.h b/sim/xrtsim/xrt_sim.h
index 6a2d5d7da4..c2b9975bb1 100644
--- a/sim/xrtsim/xrt_sim.h
+++ b/sim/xrtsim/xrt_sim.h
@@ -33,6 +33,8 @@ class xrt_sim {
 
   int mem_read(uint32_t bank_id, uint64_t addr, uint64_t size, void* value);
 
+  int mem_copy(uint32_t bank_id_dest , uint32_t bank_id_src, uint64_t dest_addr, uint64_t src_addr, uint64_t size);
+
   int register_write(uint32_t offset, uint32_t value);
 
   int register_read(uint32_t offset, uint32_t* value);
diff --git a/tests/opencl/Makefile b/tests/opencl/Makefile
index e60cd6ec74..186a451022 100644
--- a/tests/opencl/Makefile
+++ b/tests/opencl/Makefile
@@ -22,6 +22,8 @@ all:
 	$(MAKE) -C kmeans
 	$(MAKE) -C blackscholes
 	$(MAKE) -C bfs
+	$(MAKE) -C copybuf
+
 
 run-simx:
 	$(MAKE) -C vecadd run-simx
@@ -43,6 +45,9 @@ run-simx:
 	$(MAKE) -C kmeans run-simx
 	$(MAKE) -C blackscholes run-simx
 	$(MAKE) -C bfs run-simx
+	$(MAKE) -C copybuf run-simx
+
+
 
 run-rtlsim:
 	$(MAKE) -C vecadd run-rtlsim
@@ -64,6 +69,7 @@ run-rtlsim:
 	$(MAKE) -C kmeans run-rtlsim
 	$(MAKE) -C blackscholes run-rtlsim
 	$(MAKE) -C bfs run-rtlsim
+	$(MAKE) -C copybuf run-rtlsim
 
 clean:
 	$(MAKE) -C vecadd clean
@@ -85,4 +91,5 @@ clean:
 	$(MAKE) -C guassian clean
 	$(MAKE) -C kmeans clean
 	$(MAKE) -C blackscholes clean
-	$(MAKE) -C bfs clean
\ No newline at end of file
+	$(MAKE) -C bfs clean
+	$(MAKE) -C copybuf clean
\ No newline at end of file
diff --git a/tests/opencl/copybuf/Makefile b/tests/opencl/copybuf/Makefile
new file mode 100644
index 0000000000..f10a02695a
--- /dev/null
+++ b/tests/opencl/copybuf/Makefile
@@ -0,0 +1,11 @@
+ROOT_DIR := $(realpath ../../..)
+include $(ROOT_DIR)/config.mk
+
+PROJECT := copybuf
+
+SRC_DIR := $(VORTEX_HOME)/tests/opencl/$(PROJECT)
+
+SRCS := $(SRC_DIR)/main.cc
+
+
+include ../common.mk
diff --git a/tests/opencl/copybuf/main.cc b/tests/opencl/copybuf/main.cc
new file mode 100644
index 0000000000..46164bea1a
--- /dev/null
+++ b/tests/opencl/copybuf/main.cc
@@ -0,0 +1,249 @@
+#define CL_TARGET_OPENCL_VERSION 120
+#include <CL/cl.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+// Report a failed OpenCL call and jump to the enclosing function's `cleanup`
+// label, so resources acquired before the failure are still released.
+#define CHECK_ERR(err, msg) \
+  do { \
+    if ((err) != CL_SUCCESS) { \
+      fprintf(stderr, "%s (Error code: %d)\n", (msg), (err)); \
+      goto cleanup; \
+    } \
+  } while (0)
+
+// Release a buffer only if it was actually created.
+static void release_buffer(cl_mem buf) {
+  if (buf != NULL)
+    clReleaseMemObject(buf);
+}
+
+// Compare `size` ints of `result` against `expected`; prints PASSED/FAILED
+// for `test_name` and returns 0 on a match, -1 at the first mismatch.
+int verify_result(const char *test_name, int *result, int *expected, int size) {
+  for (int i = 0; i < size; i++) {
+    if (result[i] != expected[i]) {
+      printf("[%s] FAILED at index %d: expected %d, got %d\n", test_name, i, expected[i], result[i]);
+      return -1;
+    }
+  }
+  printf("[%s] PASSED\n", test_name);
+  return 0;
+}
+
+// Whole-buffer copy between two distinct buffers.
+int test_basic_copy(cl_context context, cl_command_queue queue) {
+  cl_int err;
+  int rc = -1;
+  int src_data[] = {10, 20, 30, 40, 50};
+  int dst_data[5] = {0};
+  size_t size = sizeof(src_data);
+  cl_mem bufSrc = NULL;
+  cl_mem bufDst = NULL;
+
+  bufSrc = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, size, src_data, &err);
+  CHECK_ERR(err, "Basic: Failed to create src buffer");
+  bufDst = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err);
+  CHECK_ERR(err, "Basic: Failed to create dst buffer");
+
+  err = clEnqueueCopyBuffer(queue, bufSrc, bufDst, 0, 0, size, 0, NULL, NULL);
+  CHECK_ERR(err, "Basic: Failed to enqueue copy");
+
+  err = clEnqueueReadBuffer(queue, bufDst, CL_TRUE, 0, size, dst_data, 0, NULL, NULL);
+  CHECK_ERR(err, "Basic: Failed to read buffer");
+
+  rc = verify_result("Basic Copy", dst_data, src_data, 5);
+
+cleanup:
+  release_buffer(bufSrc);
+  release_buffer(bufDst);
+  return rc;
+}
+
+// Partial copy with non-zero source and destination offsets.
+int test_offset_copy(cl_context context, cl_command_queue queue) {
+  cl_int err;
+  int rc = -1;
+  int src_data[] = {1, 2, 3, 4, 5, 6, 7, 8};
+  int dst_data[8] = {0};
+  int expected[] = {0, 0, 0, 0, 3, 4, 5, 0};
+  size_t total_size = sizeof(src_data);
+  size_t copy_count = 3;
+  size_t copy_size = copy_count * sizeof(int);
+  size_t src_offset = 2 * sizeof(int);
+  size_t dst_offset = 4 * sizeof(int);
+  cl_mem bufSrc = NULL;
+  cl_mem bufDst = NULL;
+
+  bufSrc = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, total_size, src_data, &err);
+  CHECK_ERR(err, "Offset: Failed to create src buffer");
+  bufDst = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, total_size, dst_data, &err);
+  CHECK_ERR(err, "Offset: Failed to create dst buffer");
+
+  err = clEnqueueCopyBuffer(queue, bufSrc, bufDst, src_offset, dst_offset, copy_size, 0, NULL, NULL);
+  CHECK_ERR(err, "Offset: Failed to enqueue copy");
+
+  err = clEnqueueReadBuffer(queue, bufDst, CL_TRUE, 0, total_size, dst_data, 0, NULL, NULL);
+  CHECK_ERR(err, "Offset: Failed to read buffer");
+
+  rc = verify_result("Offset Copy", dst_data, expected, 8);
+
+cleanup:
+  release_buffer(bufSrc);
+  release_buffer(bufDst);
+  return rc;
+}
+
+// Copy the first half of a buffer onto its (non-overlapping) second half.
+int test_self_copy_no_overlap(cl_context context, cl_command_queue queue) {
+  cl_int err;
+  int rc = -1;
+  int data[] = {10, 20, 30, 0, 0, 0};
+  int expected[] = {10, 20, 30, 10, 20, 30};
+  size_t size = sizeof(data);
+  cl_mem buf = NULL;
+
+  buf = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, size, data, &err);
+  CHECK_ERR(err, "Self: Failed to create buffer");
+
+  err = clEnqueueCopyBuffer(queue, buf, buf, 0, 3 * sizeof(int), 3 * sizeof(int), 0, NULL, NULL);
+  CHECK_ERR(err, "Self: Failed to enqueue copy");
+
+  err = clEnqueueReadBuffer(queue, buf, CL_TRUE, 0, size, data, 0, NULL, NULL);
+  CHECK_ERR(err, "Self: Failed to read buffer");
+
+  rc = verify_result("Self Copy (No Overlap)", data, expected, 6);
+
+cleanup:
+  release_buffer(buf);
+  return rc;
+}
+
+// Copy a larger buffer (10 * 1024 ints).
+int test_large_copy(cl_context context, cl_command_queue queue) {
+  cl_int err;
+  int rc = -1;
+  const int count = 1024 * 10;
+  size_t size = count * sizeof(int);
+  int *src_data = (int *)malloc(size);
+  int *dst_data = (int *)malloc(size);
+  cl_mem bufSrc = NULL;
+  cl_mem bufDst = NULL;
+
+  if (src_data == NULL || dst_data == NULL) {
+    fprintf(stderr, "Large: Failed to allocate host memory\n");
+    goto cleanup;
+  }
+
+  for (int i = 0; i < count; ++i) {
+    src_data[i] = i;
+    dst_data[i] = 0;
+  }
+
+  bufSrc = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, size, src_data, &err);
+  CHECK_ERR(err, "Large: Failed to create src buffer");
+  bufDst = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err);
+  CHECK_ERR(err, "Large: Failed to create dst buffer");
+
+  err = clEnqueueCopyBuffer(queue, bufSrc, bufDst, 0, 0, size, 0, NULL, NULL);
+  CHECK_ERR(err, "Large: Failed to enqueue copy");
+
+  err = clEnqueueReadBuffer(queue, bufDst, CL_TRUE, 0, size, dst_data, 0, NULL, NULL);
+  CHECK_ERR(err, "Large: Failed to read buffer");
+
+  rc = verify_result("Large Copy", dst_data, src_data, count);
+
+cleanup:
+  release_buffer(bufSrc);
+  release_buffer(bufDst);
+  free(src_data);
+  free(dst_data);
+  return rc;
+}
+
+int main() {
+  cl_int err;
+  int rc = -1;
+  int failures = 0;
+  cl_uint num_platforms = 0;
+  cl_uint num_devices = 0;
+  cl_platform_id *platforms = NULL;
+  cl_device_id *devices = NULL;
+  cl_context context = NULL;
+  cl_command_queue queue = NULL;
+
+  err = clGetPlatformIDs(0, NULL, &num_platforms);
+  if (err != CL_SUCCESS || num_platforms == 0) {
+    fprintf(stderr, "No OpenCL platform\n");
+    goto cleanup;
+  }
+  platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms);
+  if (platforms == NULL) {
+    fprintf(stderr, "Failed to allocate platform list\n");
+    goto cleanup;
+  }
+  err = clGetPlatformIDs(num_platforms, platforms, NULL);
+  if (err != CL_SUCCESS) {
+    fprintf(stderr, "No OpenCL platform\n");
+    goto cleanup;
+  }
+
+  err = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
+  if (err != CL_SUCCESS || num_devices == 0) {
+    fprintf(stderr, "No OpenCL device\n");
+    goto cleanup;
+  }
+  devices = (cl_device_id *)malloc(sizeof(cl_device_id) * num_devices);
+  if (devices == NULL) {
+    fprintf(stderr, "Failed to allocate device list\n");
+    goto cleanup;
+  }
+  err = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
+  if (err != CL_SUCCESS) {
+    fprintf(stderr, "No OpenCL device\n");
+    goto cleanup;
+  }
+
+  context = clCreateContext(NULL, 1, &devices[0], NULL, NULL, &err);
+  if (err != CL_SUCCESS) {
+    fprintf(stderr, "Failed to create context\n");
+    goto cleanup;
+  }
+
+  queue = clCreateCommandQueue(context, devices[0], 0, &err);
+  if (err != CL_SUCCESS) {
+    fprintf(stderr, "Failed to create queue\n");
+    goto cleanup;
+  }
+
+  printf("Running OpenCL CopyBuffer Tests...\n");
+  printf("----------------------------------\n");
+
+  if (test_basic_copy(context, queue) != 0)
+    failures++;
+  if (test_offset_copy(context, queue) != 0)
+    failures++;
+  if (test_self_copy_no_overlap(context, queue) != 0)
+    failures++;
+  if (test_large_copy(context, queue) != 0)
+    failures++;
+
+  printf("----------------------------------\n");
+  if (failures == 0) {
+    printf("ALL TESTS PASSED\n");
+  } else {
+    printf("%d TESTS FAILED\n", failures);
+  }
+  rc = failures;
+
+cleanup:
+  // release in reverse order of acquisition; handles stay NULL if never created
+  if (queue != NULL)
+    clReleaseCommandQueue(queue);
+  if (context != NULL)
+    clReleaseContext(context);
+  free(devices);
+  free(platforms);
+  return rc;
+}