From 756d333a110db7e1c2a7b26daa6a3c25d78bb627 Mon Sep 17 00:00:00 2001
From: Michel Schanen <michel.schanen@gmail.com>
Date: Tue, 5 May 2026 11:27:09 -0500
Subject: [PATCH 01/12] Move to LTS stack

---
 Project.toml                     |  12 +-
 deps/Project.toml                |   6 +-
 deps/build_local.jl              |   5 +-
 deps/generate_interfaces.jl      |  26 ++-
 deps/src/onemkl.cpp              | 200 ++++++++++++++----
 deps/src/onemkl.h                | 152 ++++++++++----
 lib/level-zero/oneL0.jl          |   6 +-
 lib/support/liboneapi_support.jl | 348 +++++++++++++++++++++++++------
 res/Project.toml                 |   2 +-
 res/wrap.jl                      |   2 +-
 src/oneAPI.jl                    |   2 +-
 src/utils.jl                     |   2 +-
 test/Project.toml                |   4 +-
 13 files changed, 606 insertions(+), 161 deletions(-)

diff --git a/Project.toml b/Project.toml
index f84b9c27..c997a054 100644
--- a/Project.toml
+++ b/Project.toml
@@ -16,7 +16,7 @@ KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
 Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
-NEO_jll = "700fe977-ac61-5f37-bbc8-c6c4b2b6a9fd"
+NEO_LTS_jll = "a724f90f-ce79-56dd-a1bd-b9de5a61085f"
 Preferences = "21216c6a-2e73-6563-6e65-726566657250"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
@@ -26,8 +26,8 @@ SPIRV_Tools_jll = "6ac6d60f-d740-5983-97d7-a4482c0689f4"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
-oneAPI_Level_Zero_Headers_jll = "f4bc562b-d309-54f8-9efb-476e56f0410d"
-oneAPI_Level_Zero_Loader_jll = "13eca655-d68d-5b81-8367-6d99d727ab01"
+oneAPI_Level_Zero_Headers_LTS_jll = "d79c0b2e-896c-561b-aab9-323701ec0314"
+oneAPI_Level_Zero_Loader_LTS_jll = "f6e5cbb4-ba2a-56dc-92a2-9d66f5656ccd"
 oneAPI_Support_jll = "b049733a-a71d-5ed3-8eba-7d323ac00b36"
 
 [compat]
@@ -41,7 +41,7 @@ GPUCompiler = "1.6"
 GPUToolbox = "0.1, 0.2, 0.3, 1"
 KernelAbstractions = "0.9.39"
 LLVM = "6, 7, 8, 9"
-NEO_jll = "=25.44.36015"
+NEO_LTS_jll = "=25.18.33578"
 Preferences = "1"
 SPIRVIntrinsics = "0.5"
 SPIRV_LLVM_Translator_jll = "21"
@@ -49,8 +49,8 @@ SPIRV_Tools_jll = "2025.4.0"
 SpecialFunctions = "1.3, 2"
 StaticArrays = "1"
 julia = "1.10"
-oneAPI_Level_Zero_Loader_jll = "1.25"
+oneAPI_Level_Zero_Loader_LTS_jll = "=1.24"
 oneAPI_Support_jll = "0.9.2"
 
 [extras]
-libigc_jll = "94295238-5935-5bd7-bb0f-b00942e9bdd5"
+libigc_LTS_jll = "9a8258a1-e827-5686-bee9-144461246960"
diff --git a/deps/Project.toml b/deps/Project.toml
index b8446dbe..8833df53 100644
--- a/deps/Project.toml
+++ b/deps/Project.toml
@@ -8,8 +8,8 @@ Ninja_jll = "76642167-d241-5cee-8c94-7a494e8cb7b7"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 Preferences = "21216c6a-2e73-6563-6e65-726566657250"
 Scratch = "6c6a2e73-6563-6170-7368-637461726353"
-oneAPI_Level_Zero_Headers_jll = "f4bc562b-d309-54f8-9efb-476e56f0410d"
-oneAPI_Support_Headers_jll = "24f86df5-245d-5634-a4cc-32433d9800b3"
+oneAPI_Level_Zero_Headers_LTS_jll = "d79c0b2e-896c-561b-aab9-323701ec0314"
+oneAPI_Support_Headers_LTS_jll = "0e9de0da-c0b6-5d6c-9871-5c996d414ca7"
 
 [compat]
-oneAPI_Support_Headers_jll = "=2025.2.0"
+oneAPI_Support_Headers_LTS_jll = "=2025.3.1"
diff --git a/deps/build_local.jl b/deps/build_local.jl
index f2d620e0..4d613732 100644
--- a/deps/build_local.jl
+++ b/deps/build_local.jl
@@ -20,7 +20,8 @@ if haskey(ENV, "BUILDKITE")
     run(`buildkite-agent annotate 'Using a locally-built support library; A bump of oneAPI_Support_jll is required before releasing this packages.' --style 'warning' --context 'ctx-deps'`)
 end
 
-using Scratch, Preferences, CMake_jll, Ninja_jll, oneAPI_Level_Zero_Headers_jll
+using Scratch, Preferences, CMake_jll, Ninja_jll
+import oneAPI_Level_Zero_Headers_LTS_jll as oneAPI_Level_Zero_Headers_jll
 
 oneAPI = Base.UUID("8f75cd03-7ff8-4ecb-9b8f-daf728133b1b")
 
@@ -62,7 +63,7 @@ if !isfile(joinpath(conda_dir, "condarc-julia.yml"))
     touch(joinpath(conda_dir, "conda-meta", "history"))
 end
 Conda.add_channel("https://software.repos.intel.com/python/conda/", conda_dir)
-Conda.add(["dpcpp_linux-64=2025.2.0", "mkl-devel-dpcpp=2025.2.0"], conda_dir)
+Conda.add(["dpcpp_linux-64=2025.3.1", "mkl-devel-dpcpp=2025.3.1"], conda_dir)
 
 Conda.list(conda_dir)
 
diff --git a/deps/generate_interfaces.jl b/deps/generate_interfaces.jl
index 108518db..8c62b757 100644
--- a/deps/generate_interfaces.jl
+++ b/deps/generate_interfaces.jl
@@ -1,4 +1,4 @@
-using oneAPI_Support_Headers_jll
+import oneAPI_Support_Headers_LTS_jll as oneAPI_Support_Headers_jll
 
 include("generate_helpers.jl")
 
@@ -337,12 +337,34 @@ function generate_headers(library::String, filename::Vector{String}, output::Str
     end
   end
 
+  # Dedup: when two signatures map to the same C function name (because MKL
+  # added an overload), keep the one with more parameters — typically the
+  # newer signature (e.g. set_csr_data gained an `nnz` arg in MKL 2025.3.1).
+  # Without this the generated onemkl.cpp has duplicate function definitions
+  # and won't compile.
+  _fn_name(h) = (pos = findfirst('(', h); strip(split(strip(h[1:pos-1]))[end]))
+  _param_cnt(h) = (pos = findfirst('(', h); ep = findnext(')', h, pos); count(==(','), h[pos+1:ep-1]) + 1)
+  keep_idx = Dict{String,Int}()
+  keep_pc  = Dict{String,Int}()
+  for (i, sig) in enumerate(signatures)
+    (sig[2] in blacklist) && continue
+    fn = _fn_name(sig[1])
+    pc = _param_cnt(sig[1])
+    if !haskey(keep_idx, fn) || pc > keep_pc[fn]
+      keep_idx[fn] = i
+      keep_pc[fn]  = pc
+    end
+  end
+  keep_set = Set(values(keep_idx))
+
   path_oneapi_headers = joinpath(@__DIR__, output)
   oneapi_headers = open(path_oneapi_headers, "w")
 
-  for (header, name_routine, version, type_routine, template) in signatures
+  for (i, (header, name_routine, version, type_routine, template)) in enumerate(signatures)
     # Blacklist
     (name_routine in blacklist) && continue
+    # Dedup
+    (i in keep_set) || continue
 
     # Pass scalars (e.g. alpha/beta inputs) as references instead of values
     for type in ("short", "float", "double", "float _Complex", "double _Complex")
diff --git a/deps/src/onemkl.cpp b/deps/src/onemkl.cpp
index 2b880327..f9479c2f 100644
--- a/deps/src/onemkl.cpp
+++ b/deps/src/onemkl.cpp
@@ -5471,142 +5471,270 @@ extern "C" int64_t onemklZunmqr_batch_scratchpad_size(syclQueue_t device_queue,
 }
 
 // SPARSE
-extern "C" int onemklXsparse_init_matrix_handle(matrix_handle_t *p_spMat) {
-   oneapi::mkl::sparse::init_matrix_handle((oneapi::mkl::sparse::matrix_handle_t*) p_spMat);
+extern "C" int onemklXsparse_init_matrix_handle(matrix_handle_t *p_spmat) {
+   oneapi::mkl::sparse::init_matrix_handle((oneapi::mkl::sparse::matrix_handle_t*) p_spmat);
    return 0;
 }
 
-extern "C" int onemklXsparse_release_matrix_handle(syclQueue_t device_queue, matrix_handle_t *p_spMat) {
+extern "C" int onemklXsparse_release_matrix_handle(syclQueue_t device_queue, matrix_handle_t *p_spmat) {
    try {
-      auto status = oneapi::mkl::sparse::release_matrix_handle(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t*) p_spMat, {});
+      auto status = oneapi::mkl::sparse::release_matrix_handle(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t*) p_spmat, {});
       device_queue->val.wait_and_throw();
    } catch (const sycl::exception& e) { return -1; }
    return 0;
 }
 
-extern "C" int onemklSsparse_set_csr_data(syclQueue_t device_queue, matrix_handle_t spMat, int32_t nrows, int32_t ncols, onemklIndex index, int32_t *row_ptr, int32_t *col_ind, float *values) {
+extern "C" int onemklSsparse_set_csr_data(syclQueue_t device_queue, matrix_handle_t spmat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int32_t *row_ptr, int32_t *col_ind, float *values) {
    try {
-      auto status = oneapi::mkl::sparse::set_csr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, convert(index), row_ptr, col_ind, values, {});
+      auto status = oneapi::mkl::sparse::set_csr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, nrows, ncols, nnz, convert(index), row_ptr, col_ind, values, {});
       device_queue->val.wait_and_throw();
    } catch (const sycl::exception& e) { return -1; }
    return 0;
 }
 
-extern "C" int onemklSsparse_set_csr_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t nrows, int64_t ncols, onemklIndex index, int64_t *row_ptr, int64_t *col_ind, float *values) {
+extern "C" int onemklSsparse_set_csr_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t *row_ptr, int64_t *col_ind, float *values) {
    try {
-      auto status = oneapi::mkl::sparse::set_csr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, convert(index), row_ptr, col_ind, values, {});
+      auto status = oneapi::mkl::sparse::set_csr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, nrows, ncols, nnz, convert(index), row_ptr, col_ind, values, {});
       device_queue->val.wait_and_throw();
    } catch (const sycl::exception& e) { return -1; }
    return 0;
 }
 
-extern "C" int onemklDsparse_set_csr_data(syclQueue_t device_queue, matrix_handle_t spMat, int32_t nrows, int32_t ncols, onemklIndex index, int32_t *row_ptr, int32_t *col_ind, double *values) {
+extern "C" int onemklDsparse_set_csr_data(syclQueue_t device_queue, matrix_handle_t spmat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int32_t *row_ptr, int32_t *col_ind, double *values) {
    try {
-      auto status = oneapi::mkl::sparse::set_csr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, convert(index), row_ptr, col_ind, values, {});
+      auto status = oneapi::mkl::sparse::set_csr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, nrows, ncols, nnz, convert(index), row_ptr, col_ind, values, {});
       device_queue->val.wait_and_throw();
    } catch (const sycl::exception& e) { return -1; }
    return 0;
 }
 
-extern "C" int onemklDsparse_set_csr_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t nrows, int64_t ncols, onemklIndex index, int64_t *row_ptr, int64_t *col_ind, double *values) {
+extern "C" int onemklDsparse_set_csr_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t *row_ptr, int64_t *col_ind, double *values) {
    try {
-      auto status = oneapi::mkl::sparse::set_csr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, convert(index), row_ptr, col_ind, values, {});
+      auto status = oneapi::mkl::sparse::set_csr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, nrows, ncols, nnz, convert(index), row_ptr, col_ind, values, {});
       device_queue->val.wait_and_throw();
    } catch (const sycl::exception& e) { return -1; }
    return 0;
 }
 
-extern "C" int onemklCsparse_set_csr_data(syclQueue_t device_queue, matrix_handle_t spMat, int32_t nrows, int32_t ncols, onemklIndex index, int32_t *row_ptr, int32_t *col_ind, float _Complex *values) {
+extern "C" int onemklCsparse_set_csr_data(syclQueue_t device_queue, matrix_handle_t spmat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int32_t *row_ptr, int32_t *col_ind, float _Complex *values) {
    try {
-      auto status = oneapi::mkl::sparse::set_csr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, convert(index), row_ptr, col_ind, reinterpret_cast<std::complex<float>*>(values), {});
+      auto status = oneapi::mkl::sparse::set_csr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, nrows, ncols, nnz, convert(index), row_ptr, col_ind, reinterpret_cast<std::complex<float>*>(values), {});
       device_queue->val.wait_and_throw();
    } catch (const sycl::exception& e) { return -1; }
    return 0;
 }
 
-extern "C" int onemklCsparse_set_csr_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t nrows, int64_t ncols, onemklIndex index, int64_t *row_ptr, int64_t *col_ind, float _Complex *values) {
+extern "C" int onemklCsparse_set_csr_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t *row_ptr, int64_t *col_ind, float _Complex *values) {
    try {
-      auto status = oneapi::mkl::sparse::set_csr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, convert(index), row_ptr, col_ind, reinterpret_cast<std::complex<float>*>(values), {});
+      auto status = oneapi::mkl::sparse::set_csr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, nrows, ncols, nnz, convert(index), row_ptr, col_ind, reinterpret_cast<std::complex<float>*>(values), {});
       device_queue->val.wait_and_throw();
    } catch (const sycl::exception& e) { return -1; }
    return 0;
 }
 
-extern "C" int onemklZsparse_set_csr_data(syclQueue_t device_queue, matrix_handle_t spMat, int32_t nrows, int32_t ncols, onemklIndex index, int32_t *row_ptr, int32_t *col_ind, double _Complex *values) {
+extern "C" int onemklZsparse_set_csr_data(syclQueue_t device_queue, matrix_handle_t spmat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int32_t *row_ptr, int32_t *col_ind, double _Complex *values) {
    try {
-      auto status = oneapi::mkl::sparse::set_csr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, convert(index), row_ptr, col_ind, reinterpret_cast<std::complex<double>*>(values), {});
+      auto status = oneapi::mkl::sparse::set_csr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, nrows, ncols, nnz, convert(index), row_ptr, col_ind, reinterpret_cast<std::complex<double>*>(values), {});
       device_queue->val.wait_and_throw();
    } catch (const sycl::exception& e) { return -1; }
    return 0;
 }
 
-extern "C" int onemklZsparse_set_csr_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t nrows, int64_t ncols, onemklIndex index, int64_t *row_ptr, int64_t *col_ind, double _Complex *values) {
+extern "C" int onemklZsparse_set_csr_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t *row_ptr, int64_t *col_ind, double _Complex *values) {
    try {
-      auto status = oneapi::mkl::sparse::set_csr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, convert(index), row_ptr, col_ind, reinterpret_cast<std::complex<double>*>(values), {});
+      auto status = oneapi::mkl::sparse::set_csr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, nrows, ncols, nnz, convert(index), row_ptr, col_ind, reinterpret_cast<std::complex<double>*>(values), {});
       device_queue->val.wait_and_throw();
    } catch (const sycl::exception& e) { return -1; }
    return 0;
 }
 
-extern "C" int onemklSsparse_set_coo_data(syclQueue_t device_queue, matrix_handle_t spMat, int32_t nrows, int32_t ncols, int32_t nnz, onemklIndex index, int32_t *row_ind, int32_t *col_ind, float *values) {
+extern "C" int onemklSsparse_set_csc_data(syclQueue_t device_queue, matrix_handle_t spMat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int32_t *col_ptr, int32_t *row_ind, float *values) {
    try {
-      auto status = oneapi::mkl::sparse::set_coo_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, nnz, convert(index), row_ind, col_ind, values, {});
+      auto status = oneapi::mkl::sparse::set_csc_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, nnz, convert(index), col_ptr, row_ind, values, {});
       device_queue->val.wait_and_throw();
    } catch (const sycl::exception& e) { return -1; }
    return 0;
 }
 
-extern "C" int onemklSsparse_set_coo_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t *row_ind, int64_t *col_ind, float *values) {
+extern "C" int onemklSsparse_set_csc_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t *col_ptr, int64_t *row_ind, float *values) {
    try {
-      auto status = oneapi::mkl::sparse::set_coo_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, nnz, convert(index), row_ind, col_ind, values, {});
+      auto status = oneapi::mkl::sparse::set_csc_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, nnz, convert(index), col_ptr, row_ind, values, {});
       device_queue->val.wait_and_throw();
    } catch (const sycl::exception& e) { return -1; }
    return 0;
 }
 
-extern "C" int onemklDsparse_set_coo_data(syclQueue_t device_queue, matrix_handle_t spMat, int32_t nrows, int32_t ncols, int32_t nnz, onemklIndex index, int32_t *row_ind, int32_t *col_ind, double *values) {
+extern "C" int onemklDsparse_set_csc_data(syclQueue_t device_queue, matrix_handle_t spMat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int32_t *col_ptr, int32_t *row_ind, double *values) {
    try {
-      auto status = oneapi::mkl::sparse::set_coo_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, nnz, convert(index), row_ind, col_ind, values, {});
+      auto status = oneapi::mkl::sparse::set_csc_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, nnz, convert(index), col_ptr, row_ind, values, {});
       device_queue->val.wait_and_throw();
    } catch (const sycl::exception& e) { return -1; }
    return 0;
 }
 
-extern "C" int onemklDsparse_set_coo_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t *row_ind, int64_t *col_ind, double *values) {
+extern "C" int onemklDsparse_set_csc_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t *col_ptr, int64_t *row_ind, double *values) {
    try {
-      auto status = oneapi::mkl::sparse::set_coo_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, nnz, convert(index), row_ind, col_ind, values, {});
+      auto status = oneapi::mkl::sparse::set_csc_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, nnz, convert(index), col_ptr, row_ind, values, {});
       device_queue->val.wait_and_throw();
    } catch (const sycl::exception& e) { return -1; }
    return 0;
 }
 
-extern "C" int onemklCsparse_set_coo_data(syclQueue_t device_queue, matrix_handle_t spMat, int32_t nrows, int32_t ncols, int32_t nnz, onemklIndex index, int32_t *row_ind, int32_t *col_ind, float _Complex *values) {
+extern "C" int onemklCsparse_set_csc_data(syclQueue_t device_queue, matrix_handle_t spMat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int32_t *col_ptr, int32_t *row_ind, float _Complex *values) {
    try {
-      auto status = oneapi::mkl::sparse::set_coo_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, nnz, convert(index), row_ind, col_ind, reinterpret_cast<std::complex<float>*>(values), {});
+      auto status = oneapi::mkl::sparse::set_csc_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, nnz, convert(index), col_ptr, row_ind, reinterpret_cast<std::complex<float>*>(values), {});
       device_queue->val.wait_and_throw();
    } catch (const sycl::exception& e) { return -1; }
    return 0;
 }
 
-extern "C" int onemklCsparse_set_coo_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t *row_ind, int64_t *col_ind, float _Complex *values) {
+extern "C" int onemklCsparse_set_csc_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t *col_ptr, int64_t *row_ind, float _Complex *values) {
    try {
-      auto status = oneapi::mkl::sparse::set_coo_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, nnz, convert(index), row_ind, col_ind, reinterpret_cast<std::complex<float>*>(values), {});
+      auto status = oneapi::mkl::sparse::set_csc_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, nnz, convert(index), col_ptr, row_ind, reinterpret_cast<std::complex<float>*>(values), {});
       device_queue->val.wait_and_throw();
    } catch (const sycl::exception& e) { return -1; }
    return 0;
 }
 
-extern "C" int onemklZsparse_set_coo_data(syclQueue_t device_queue, matrix_handle_t spMat, int32_t nrows, int32_t ncols, int32_t nnz, onemklIndex index, int32_t *row_ind, int32_t *col_ind, double _Complex *values) {
+extern "C" int onemklZsparse_set_csc_data(syclQueue_t device_queue, matrix_handle_t spMat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int32_t *col_ptr, int32_t *row_ind, double _Complex *values) {
    try {
-      auto status = oneapi::mkl::sparse::set_coo_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, nnz, convert(index), row_ind, col_ind, reinterpret_cast<std::complex<double>*>(values), {});
+      auto status = oneapi::mkl::sparse::set_csc_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, nnz, convert(index), col_ptr, row_ind, reinterpret_cast<std::complex<double>*>(values), {});
       device_queue->val.wait_and_throw();
    } catch (const sycl::exception& e) { return -1; }
    return 0;
 }
 
-extern "C" int onemklZsparse_set_coo_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t *row_ind, int64_t *col_ind, double _Complex *values) {
+extern "C" int onemklZsparse_set_csc_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t *col_ptr, int64_t *row_ind, double _Complex *values) {
    try {
-      auto status = oneapi::mkl::sparse::set_coo_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, nnz, convert(index), row_ind, col_ind, reinterpret_cast<std::complex<double>*>(values), {});
+      auto status = oneapi::mkl::sparse::set_csc_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, nnz, convert(index), col_ptr, row_ind, reinterpret_cast<std::complex<double>*>(values), {});
+      device_queue->val.wait_and_throw();
+   } catch (const sycl::exception& e) { return -1; }
+   return 0;
+}
+
+extern "C" int onemklSsparse_set_coo_data(syclQueue_t device_queue, matrix_handle_t spmat, int32_t nrows, int32_t ncols, int32_t nnz, onemklIndex index, int32_t *row_ind, int32_t *col_ind, float *values) {
+   try {
+      auto status = oneapi::mkl::sparse::set_coo_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, nrows, ncols, nnz, convert(index), row_ind, col_ind, values, {});
+      device_queue->val.wait_and_throw();
+   } catch (const sycl::exception& e) { return -1; }
+   return 0;
+}
+
+extern "C" int onemklSsparse_set_coo_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t *row_ind, int64_t *col_ind, float *values) {
+   try {
+      auto status = oneapi::mkl::sparse::set_coo_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, nrows, ncols, nnz, convert(index), row_ind, col_ind, values, {});
+      device_queue->val.wait_and_throw();
+   } catch (const sycl::exception& e) { return -1; }
+   return 0;
+}
+
+extern "C" int onemklDsparse_set_coo_data(syclQueue_t device_queue, matrix_handle_t spmat, int32_t nrows, int32_t ncols, int32_t nnz, onemklIndex index, int32_t *row_ind, int32_t *col_ind, double *values) {
+   try {
+      auto status = oneapi::mkl::sparse::set_coo_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, nrows, ncols, nnz, convert(index), row_ind, col_ind, values, {});
+      device_queue->val.wait_and_throw();
+   } catch (const sycl::exception& e) { return -1; }
+   return 0;
+}
+
+extern "C" int onemklDsparse_set_coo_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t *row_ind, int64_t *col_ind, double *values) {
+   try {
+      auto status = oneapi::mkl::sparse::set_coo_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, nrows, ncols, nnz, convert(index), row_ind, col_ind, values, {});
+      device_queue->val.wait_and_throw();
+   } catch (const sycl::exception& e) { return -1; }
+   return 0;
+}
+
+extern "C" int onemklCsparse_set_coo_data(syclQueue_t device_queue, matrix_handle_t spmat, int32_t nrows, int32_t ncols, int32_t nnz, onemklIndex index, int32_t *row_ind, int32_t *col_ind, float _Complex *values) {
+   try {
+      auto status = oneapi::mkl::sparse::set_coo_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, nrows, ncols, nnz, convert(index), row_ind, col_ind, reinterpret_cast<std::complex<float>*>(values), {});
+      device_queue->val.wait_and_throw();
+   } catch (const sycl::exception& e) { return -1; }
+   return 0;
+}
+
+extern "C" int onemklCsparse_set_coo_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t *row_ind, int64_t *col_ind, float _Complex *values) {
+   try {
+      auto status = oneapi::mkl::sparse::set_coo_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, nrows, ncols, nnz, convert(index), row_ind, col_ind, reinterpret_cast<std::complex<float>*>(values), {});
+      device_queue->val.wait_and_throw();
+   } catch (const sycl::exception& e) { return -1; }
+   return 0;
+}
+
+extern "C" int onemklZsparse_set_coo_data(syclQueue_t device_queue, matrix_handle_t spmat, int32_t nrows, int32_t ncols, int32_t nnz, onemklIndex index, int32_t *row_ind, int32_t *col_ind, double _Complex *values) {
+   try {
+      auto status = oneapi::mkl::sparse::set_coo_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, nrows, ncols, nnz, convert(index), row_ind, col_ind, reinterpret_cast<std::complex<double>*>(values), {});
+      device_queue->val.wait_and_throw();
+   } catch (const sycl::exception& e) { return -1; }
+   return 0;
+}
+
+extern "C" int onemklZsparse_set_coo_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t *row_ind, int64_t *col_ind, double _Complex *values) {
+   try {
+      auto status = oneapi::mkl::sparse::set_coo_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, nrows, ncols, nnz, convert(index), row_ind, col_ind, reinterpret_cast<std::complex<double>*>(values), {});
+      device_queue->val.wait_and_throw();
+   } catch (const sycl::exception& e) { return -1; }
+   return 0;
+}
+
+extern "C" int onemklSsparse_set_bsr_data(syclQueue_t device_queue, matrix_handle_t spmat, int64_t blk_nrows, int64_t blk_ncols, int64_t blk_nnz, int64_t row_blk_size, int64_t col_blk_size, onemklLayout blk_layout, onemklIndex index, int32_t *bsr_row_ptr, int32_t *bsr_col_ind, float *bsr_values) {
+   try {
+      auto status = oneapi::mkl::sparse::set_bsr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, blk_nrows, blk_ncols, blk_nnz, row_blk_size, col_blk_size, convert(blk_layout), convert(index), bsr_row_ptr, bsr_col_ind, bsr_values, {});
+      device_queue->val.wait_and_throw();
+   } catch (const sycl::exception& e) { return -1; }
+   return 0;
+}
+
+extern "C" int onemklSsparse_set_bsr_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t blk_nrows, int64_t blk_ncols, int64_t blk_nnz, int64_t row_blk_size, int64_t col_blk_size, onemklLayout blk_layout, onemklIndex index, int64_t *bsr_row_ptr, int64_t *bsr_col_ind, float *bsr_values) {
+   try {
+      auto status = oneapi::mkl::sparse::set_bsr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, blk_nrows, blk_ncols, blk_nnz, row_blk_size, col_blk_size, convert(blk_layout), convert(index), bsr_row_ptr, bsr_col_ind, bsr_values, {});
+      device_queue->val.wait_and_throw();
+   } catch (const sycl::exception& e) { return -1; }
+   return 0;
+}
+
+extern "C" int onemklDsparse_set_bsr_data(syclQueue_t device_queue, matrix_handle_t spmat, int64_t blk_nrows, int64_t blk_ncols, int64_t blk_nnz, int64_t row_blk_size, int64_t col_blk_size, onemklLayout blk_layout, onemklIndex index, int32_t *bsr_row_ptr, int32_t *bsr_col_ind, double *bsr_values) {
+   try {
+      auto status = oneapi::mkl::sparse::set_bsr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, blk_nrows, blk_ncols, blk_nnz, row_blk_size, col_blk_size, convert(blk_layout), convert(index), bsr_row_ptr, bsr_col_ind, bsr_values, {});
+      device_queue->val.wait_and_throw();
+   } catch (const sycl::exception& e) { return -1; }
+   return 0;
+}
+
+extern "C" int onemklDsparse_set_bsr_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t blk_nrows, int64_t blk_ncols, int64_t blk_nnz, int64_t row_blk_size, int64_t col_blk_size, onemklLayout blk_layout, onemklIndex index, int64_t *bsr_row_ptr, int64_t *bsr_col_ind, double *bsr_values) {
+   try {
+      auto status = oneapi::mkl::sparse::set_bsr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, blk_nrows, blk_ncols, blk_nnz, row_blk_size, col_blk_size, convert(blk_layout), convert(index), bsr_row_ptr, bsr_col_ind, bsr_values, {});
+      device_queue->val.wait_and_throw();
+   } catch (const sycl::exception& e) { return -1; }
+   return 0;
+}
+
+extern "C" int onemklCsparse_set_bsr_data(syclQueue_t device_queue, matrix_handle_t spmat, int64_t blk_nrows, int64_t blk_ncols, int64_t blk_nnz, int64_t row_blk_size, int64_t col_blk_size, onemklLayout blk_layout, onemklIndex index, int32_t *bsr_row_ptr, int32_t *bsr_col_ind, float _Complex *bsr_values) {
+   try {
+      auto status = oneapi::mkl::sparse::set_bsr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, blk_nrows, blk_ncols, blk_nnz, row_blk_size, col_blk_size, convert(blk_layout), convert(index), bsr_row_ptr, bsr_col_ind, reinterpret_cast<std::complex<float>*>(bsr_values), {});
+      device_queue->val.wait_and_throw();
+   } catch (const sycl::exception& e) { return -1; }
+   return 0;
+}
+
+extern "C" int onemklCsparse_set_bsr_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t blk_nrows, int64_t blk_ncols, int64_t blk_nnz, int64_t row_blk_size, int64_t col_blk_size, onemklLayout blk_layout, onemklIndex index, int64_t *bsr_row_ptr, int64_t *bsr_col_ind, float _Complex *bsr_values) {
+   try {
+      auto status = oneapi::mkl::sparse::set_bsr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, blk_nrows, blk_ncols, blk_nnz, row_blk_size, col_blk_size, convert(blk_layout), convert(index), bsr_row_ptr, bsr_col_ind, reinterpret_cast<std::complex<float>*>(bsr_values), {});
+      device_queue->val.wait_and_throw();
+   } catch (const sycl::exception& e) { return -1; }
+   return 0;
+}
+
+extern "C" int onemklZsparse_set_bsr_data(syclQueue_t device_queue, matrix_handle_t spmat, int64_t blk_nrows, int64_t blk_ncols, int64_t blk_nnz, int64_t row_blk_size, int64_t col_blk_size, onemklLayout blk_layout, onemklIndex index, int32_t *bsr_row_ptr, int32_t *bsr_col_ind, double _Complex *bsr_values) {
+   try {
+      auto status = oneapi::mkl::sparse::set_bsr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, blk_nrows, blk_ncols, blk_nnz, row_blk_size, col_blk_size, convert(blk_layout), convert(index), bsr_row_ptr, bsr_col_ind, reinterpret_cast<std::complex<double>*>(bsr_values), {});
+      device_queue->val.wait_and_throw();
+   } catch (const sycl::exception& e) { return -1; }
+   return 0;
+}
+
+extern "C" int onemklZsparse_set_bsr_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t blk_nrows, int64_t blk_ncols, int64_t blk_nnz, int64_t row_blk_size, int64_t col_blk_size, onemklLayout blk_layout, onemklIndex index, int64_t *bsr_row_ptr, int64_t *bsr_col_ind, double _Complex *bsr_values) {
+   try {
+      auto status = oneapi::mkl::sparse::set_bsr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, blk_nrows, blk_ncols, blk_nnz, row_blk_size, col_blk_size, convert(blk_layout), convert(index), bsr_row_ptr, bsr_col_ind, reinterpret_cast<std::complex<double>*>(bsr_values), {});
       device_queue->val.wait_and_throw();
    } catch (const sycl::exception& e) { return -1; }
    return 0;
diff --git a/deps/src/onemkl.h b/deps/src/onemkl.h
index 0029fb2e..4d1caa08 100644
--- a/deps/src/onemkl.h
+++ b/deps/src/onemkl.h
@@ -141,8 +141,8 @@ struct omatadd_descr;
 typedef struct omatadd_descr *omatadd_descr_t;
 
 const int64_t ONEMKL_VERSION_MAJOR = 2025;
-const int64_t ONEMKL_VERSION_MINOR = 2;
-const int64_t ONEMKL_VERSION_PATCH = 0;
+const int64_t ONEMKL_VERSION_MINOR = 3;
+const int64_t ONEMKL_VERSION_PATCH = 1;
 void onemkl_version(int64_t *major, int64_t *minor, int64_t *patch);
 
 int onemklHgemm_batch(syclQueue_t device_queue, onemklTranspose transa,
@@ -2737,74 +2737,154 @@ int64_t onemklZunmqr_batch_scratchpad_size(syclQueue_t device_queue, onemklSide*
                                            group_count, int64_t* group_sizes);
 
 // SPARSE
-int onemklXsparse_init_matrix_handle(matrix_handle_t *p_spMat);
+int onemklXsparse_init_matrix_handle(matrix_handle_t *p_spmat);
 
-int onemklXsparse_release_matrix_handle(syclQueue_t device_queue, matrix_handle_t *p_spMat);
+int onemklXsparse_release_matrix_handle(syclQueue_t device_queue, matrix_handle_t *p_spmat);
 
-int onemklSsparse_set_csr_data(syclQueue_t device_queue, matrix_handle_t spMat, int32_t nrows,
-                               int32_t ncols, onemklIndex index, int32_t *row_ptr, int32_t
-                               *col_ind, float *values);
+int onemklSsparse_set_csr_data(syclQueue_t device_queue, matrix_handle_t spmat, int64_t nrows,
+                               int64_t ncols, int64_t nnz, onemklIndex index, int32_t *row_ptr,
+                               int32_t *col_ind, float *values);
 
-int onemklSsparse_set_csr_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t
-                                  nrows, int64_t ncols, onemklIndex index, int64_t *row_ptr,
-                                  int64_t *col_ind, float *values);
+int onemklSsparse_set_csr_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t
+                                  nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t
+                                  *row_ptr, int64_t *col_ind, float *values);
 
-int onemklDsparse_set_csr_data(syclQueue_t device_queue, matrix_handle_t spMat, int32_t nrows,
-                               int32_t ncols, onemklIndex index, int32_t *row_ptr, int32_t
-                               *col_ind, double *values);
+int onemklDsparse_set_csr_data(syclQueue_t device_queue, matrix_handle_t spmat, int64_t nrows,
+                               int64_t ncols, int64_t nnz, onemklIndex index, int32_t *row_ptr,
+                               int32_t *col_ind, double *values);
 
-int onemklDsparse_set_csr_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t
-                                  nrows, int64_t ncols, onemklIndex index, int64_t *row_ptr,
-                                  int64_t *col_ind, double *values);
+int onemklDsparse_set_csr_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t
+                                  nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t
+                                  *row_ptr, int64_t *col_ind, double *values);
 
-int onemklCsparse_set_csr_data(syclQueue_t device_queue, matrix_handle_t spMat, int32_t nrows,
-                               int32_t ncols, onemklIndex index, int32_t *row_ptr, int32_t
-                               *col_ind, float _Complex *values);
+int onemklCsparse_set_csr_data(syclQueue_t device_queue, matrix_handle_t spmat, int64_t nrows,
+                               int64_t ncols, int64_t nnz, onemklIndex index, int32_t *row_ptr,
+                               int32_t *col_ind, float _Complex *values);
 
-int onemklCsparse_set_csr_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t
-                                  nrows, int64_t ncols, onemklIndex index, int64_t *row_ptr,
-                                  int64_t *col_ind, float _Complex *values);
+int onemklCsparse_set_csr_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t
+                                  nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t
+                                  *row_ptr, int64_t *col_ind, float _Complex *values);
 
-int onemklZsparse_set_csr_data(syclQueue_t device_queue, matrix_handle_t spMat, int32_t nrows,
-                               int32_t ncols, onemklIndex index, int32_t *row_ptr, int32_t
-                               *col_ind, double _Complex *values);
+int onemklZsparse_set_csr_data(syclQueue_t device_queue, matrix_handle_t spmat, int64_t nrows,
+                               int64_t ncols, int64_t nnz, onemklIndex index, int32_t *row_ptr,
+                               int32_t *col_ind, double _Complex *values);
+
+int onemklZsparse_set_csr_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t
+                                  nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t
+                                  *row_ptr, int64_t *col_ind, double _Complex *values);
 
-int onemklZsparse_set_csr_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t
-                                  nrows, int64_t ncols, onemklIndex index, int64_t *row_ptr,
-                                  int64_t *col_ind, double _Complex *values);
+int onemklSsparse_set_csc_data(syclQueue_t device_queue, matrix_handle_t spMat, int64_t nrows,
+                               int64_t ncols, int64_t nnz, onemklIndex index, int32_t *col_ptr,
+                               int32_t *row_ind, float *values);
+
+int onemklSsparse_set_csc_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t
+                                  nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t
+                                  *col_ptr, int64_t *row_ind, float *values);
 
-int onemklSsparse_set_coo_data(syclQueue_t device_queue, matrix_handle_t spMat, int32_t nrows,
+int onemklDsparse_set_csc_data(syclQueue_t device_queue, matrix_handle_t spMat, int64_t nrows,
+                               int64_t ncols, int64_t nnz, onemklIndex index, int32_t *col_ptr,
+                               int32_t *row_ind, double *values);
+
+int onemklDsparse_set_csc_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t
+                                  nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t
+                                  *col_ptr, int64_t *row_ind, double *values);
+
+int onemklCsparse_set_csc_data(syclQueue_t device_queue, matrix_handle_t spMat, int64_t nrows,
+                               int64_t ncols, int64_t nnz, onemklIndex index, int32_t *col_ptr,
+                               int32_t *row_ind, float _Complex *values);
+
+int onemklCsparse_set_csc_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t
+                                  nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t
+                                  *col_ptr, int64_t *row_ind, float _Complex *values);
+
+int onemklZsparse_set_csc_data(syclQueue_t device_queue, matrix_handle_t spMat, int64_t nrows,
+                               int64_t ncols, int64_t nnz, onemklIndex index, int32_t *col_ptr,
+                               int32_t *row_ind, double _Complex *values);
+
+int onemklZsparse_set_csc_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t
+                                  nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t
+                                  *col_ptr, int64_t *row_ind, double _Complex *values);
+
+int onemklSsparse_set_coo_data(syclQueue_t device_queue, matrix_handle_t spmat, int32_t nrows,
                                int32_t ncols, int32_t nnz, onemklIndex index, int32_t *row_ind,
                                int32_t *col_ind, float *values);
 
-int onemklSsparse_set_coo_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t
+int onemklSsparse_set_coo_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t
                                   nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t
                                   *row_ind, int64_t *col_ind, float *values);
 
-int onemklDsparse_set_coo_data(syclQueue_t device_queue, matrix_handle_t spMat, int32_t nrows,
+int onemklDsparse_set_coo_data(syclQueue_t device_queue, matrix_handle_t spmat, int32_t nrows,
                                int32_t ncols, int32_t nnz, onemklIndex index, int32_t *row_ind,
                                int32_t *col_ind, double *values);
 
-int onemklDsparse_set_coo_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t
+int onemklDsparse_set_coo_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t
                                   nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t
                                   *row_ind, int64_t *col_ind, double *values);
 
-int onemklCsparse_set_coo_data(syclQueue_t device_queue, matrix_handle_t spMat, int32_t nrows,
+int onemklCsparse_set_coo_data(syclQueue_t device_queue, matrix_handle_t spmat, int32_t nrows,
                                int32_t ncols, int32_t nnz, onemklIndex index, int32_t *row_ind,
                                int32_t *col_ind, float _Complex *values);
 
-int onemklCsparse_set_coo_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t
+int onemklCsparse_set_coo_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t
                                   nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t
                                   *row_ind, int64_t *col_ind, float _Complex *values);
 
-int onemklZsparse_set_coo_data(syclQueue_t device_queue, matrix_handle_t spMat, int32_t nrows,
+int onemklZsparse_set_coo_data(syclQueue_t device_queue, matrix_handle_t spmat, int32_t nrows,
                                int32_t ncols, int32_t nnz, onemklIndex index, int32_t *row_ind,
                                int32_t *col_ind, double _Complex *values);
 
-int onemklZsparse_set_coo_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t
+int onemklZsparse_set_coo_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t
                                   nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t
                                   *row_ind, int64_t *col_ind, double _Complex *values);
 
+int onemklSsparse_set_bsr_data(syclQueue_t device_queue, matrix_handle_t spmat, int64_t
+                               blk_nrows, int64_t blk_ncols, int64_t blk_nnz, int64_t
+                               row_blk_size, int64_t col_blk_size, onemklLayout blk_layout,
+                               onemklIndex index, int32_t *bsr_row_ptr, int32_t *bsr_col_ind,
+                               float *bsr_values);
+
+int onemklSsparse_set_bsr_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t
+                                  blk_nrows, int64_t blk_ncols, int64_t blk_nnz, int64_t
+                                  row_blk_size, int64_t col_blk_size, onemklLayout blk_layout,
+                                  onemklIndex index, int64_t *bsr_row_ptr, int64_t
+                                  *bsr_col_ind, float *bsr_values);
+
+int onemklDsparse_set_bsr_data(syclQueue_t device_queue, matrix_handle_t spmat, int64_t
+                               blk_nrows, int64_t blk_ncols, int64_t blk_nnz, int64_t
+                               row_blk_size, int64_t col_blk_size, onemklLayout blk_layout,
+                               onemklIndex index, int32_t *bsr_row_ptr, int32_t *bsr_col_ind,
+                               double *bsr_values);
+
+int onemklDsparse_set_bsr_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t
+                                  blk_nrows, int64_t blk_ncols, int64_t blk_nnz, int64_t
+                                  row_blk_size, int64_t col_blk_size, onemklLayout blk_layout,
+                                  onemklIndex index, int64_t *bsr_row_ptr, int64_t
+                                  *bsr_col_ind, double *bsr_values);
+
+int onemklCsparse_set_bsr_data(syclQueue_t device_queue, matrix_handle_t spmat, int64_t
+                               blk_nrows, int64_t blk_ncols, int64_t blk_nnz, int64_t
+                               row_blk_size, int64_t col_blk_size, onemklLayout blk_layout,
+                               onemklIndex index, int32_t *bsr_row_ptr, int32_t *bsr_col_ind,
+                               float _Complex *bsr_values);
+
+int onemklCsparse_set_bsr_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t
+                                  blk_nrows, int64_t blk_ncols, int64_t blk_nnz, int64_t
+                                  row_blk_size, int64_t col_blk_size, onemklLayout blk_layout,
+                                  onemklIndex index, int64_t *bsr_row_ptr, int64_t
+                                  *bsr_col_ind, float _Complex *bsr_values);
+
+int onemklZsparse_set_bsr_data(syclQueue_t device_queue, matrix_handle_t spmat, int64_t
+                               blk_nrows, int64_t blk_ncols, int64_t blk_nnz, int64_t
+                               row_blk_size, int64_t col_blk_size, onemklLayout blk_layout,
+                               onemklIndex index, int32_t *bsr_row_ptr, int32_t *bsr_col_ind,
+                               double _Complex *bsr_values);
+
+int onemklZsparse_set_bsr_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t
+                                  blk_nrows, int64_t blk_ncols, int64_t blk_nnz, int64_t
+                                  row_blk_size, int64_t col_blk_size, onemklLayout blk_layout,
+                                  onemklIndex index, int64_t *bsr_row_ptr, int64_t
+                                  *bsr_col_ind, double _Complex *bsr_values);
+
 int onemklXsparse_init_matmat_descr(matmat_descr_t *p_desc);
 
 int onemklXsparse_release_matmat_descr(matmat_descr_t *p_desc);
diff --git a/lib/level-zero/oneL0.jl b/lib/level-zero/oneL0.jl
index f9697029..6bcb17fb 100644
--- a/lib/level-zero/oneL0.jl
+++ b/lib/level-zero/oneL0.jl
@@ -11,8 +11,10 @@ using Libdl
 if Sys.iswindows()
     const libze_loader = "ze_loader"
 else
-    using NEO_jll
-    using oneAPI_Level_Zero_Loader_jll
+    using NEO_LTS_jll
+    using oneAPI_Level_Zero_Loader_LTS_jll
+    const NEO_jll = NEO_LTS_jll
+    const oneAPI_Level_Zero_Loader_jll = oneAPI_Level_Zero_Loader_LTS_jll
 end
 
 include("utils.jl")
diff --git a/lib/support/liboneapi_support.jl b/lib/support/liboneapi_support.jl
index 91146c99..5f30a9f1 100644
--- a/lib/support/liboneapi_support.jl
+++ b/lib/support/liboneapi_support.jl
@@ -6428,107 +6428,195 @@ function onemklZunmqr_batch_scratchpad_size(device_queue, side, trans, m, n, k,
                                                                 group_sizes::Ptr{Int64})::Int64
 end
 
-function onemklXsparse_init_matrix_handle(p_spMat)
-    @ccall liboneapi_support.onemklXsparse_init_matrix_handle(p_spMat::Ptr{matrix_handle_t})::Cint
+function onemklXsparse_init_matrix_handle(p_spmat)
+    @ccall liboneapi_support.onemklXsparse_init_matrix_handle(p_spmat::Ptr{matrix_handle_t})::Cint
 end
 
-function onemklXsparse_release_matrix_handle(device_queue, p_spMat)
+function onemklXsparse_release_matrix_handle(device_queue, p_spmat)
     @ccall liboneapi_support.onemklXsparse_release_matrix_handle(device_queue::syclQueue_t,
-                                                                 p_spMat::Ptr{matrix_handle_t})::Cint
+                                                                 p_spmat::Ptr{matrix_handle_t})::Cint
 end
 
-function onemklSsparse_set_csr_data(device_queue, spMat, nrows, ncols, index, row_ptr,
+function onemklSsparse_set_csr_data(device_queue, spmat, nrows, ncols, nnz, index, row_ptr,
                                     col_ind, values)
     @ccall liboneapi_support.onemklSsparse_set_csr_data(device_queue::syclQueue_t,
-                                                        spMat::matrix_handle_t,
-                                                        nrows::Int32, ncols::Int32,
-                                                        index::onemklIndex,
+                                                        spmat::matrix_handle_t,
+                                                        nrows::Int64, ncols::Int64,
+                                                        nnz::Int64, index::ZePtr{Int32},
                                                         row_ptr::ZePtr{Int32},
-                                                        col_ind::ZePtr{Int32},
-                                                        values::ZePtr{Cfloat})::Cint
+                                                        col_ind::ZePtr{Cfloat},
+                                                        values::Ptr{Cfloat})::Cint
 end
 
-function onemklSsparse_set_csr_data_64(device_queue, spMat, nrows, ncols, index, row_ptr,
-                                       col_ind, values)
+function onemklSsparse_set_csr_data_64(device_queue, spmat, nrows, ncols, nnz, index,
+                                       row_ptr, col_ind, values)
     @ccall liboneapi_support.onemklSsparse_set_csr_data_64(device_queue::syclQueue_t,
-                                                           spMat::matrix_handle_t,
+                                                           spmat::matrix_handle_t,
                                                            nrows::Int64, ncols::Int64,
-                                                           index::onemklIndex,
+                                                           nnz::Int64, index::ZePtr{Int64},
                                                            row_ptr::ZePtr{Int64},
-                                                           col_ind::ZePtr{Int64},
-                                                           values::ZePtr{Cfloat})::Cint
+                                                           col_ind::ZePtr{Cfloat},
+                                                           values::Ptr{Cfloat})::Cint
 end
 
-function onemklDsparse_set_csr_data(device_queue, spMat, nrows, ncols, index, row_ptr,
+function onemklDsparse_set_csr_data(device_queue, spmat, nrows, ncols, nnz, index, row_ptr,
                                     col_ind, values)
     @ccall liboneapi_support.onemklDsparse_set_csr_data(device_queue::syclQueue_t,
-                                                        spMat::matrix_handle_t,
-                                                        nrows::Int32, ncols::Int32,
-                                                        index::onemklIndex,
+                                                        spmat::matrix_handle_t,
+                                                        nrows::Int64, ncols::Int64,
+                                                        nnz::Int64, index::ZePtr{Int32},
                                                         row_ptr::ZePtr{Int32},
-                                                        col_ind::ZePtr{Int32},
-                                                        values::ZePtr{Cdouble})::Cint
+                                                        col_ind::ZePtr{Cdouble},
+                                                        values::Ptr{Cdouble})::Cint
 end
 
-function onemklDsparse_set_csr_data_64(device_queue, spMat, nrows, ncols, index, row_ptr,
-                                       col_ind, values)
+function onemklDsparse_set_csr_data_64(device_queue, spmat, nrows, ncols, nnz, index,
+                                       row_ptr, col_ind, values)
     @ccall liboneapi_support.onemklDsparse_set_csr_data_64(device_queue::syclQueue_t,
-                                                           spMat::matrix_handle_t,
+                                                           spmat::matrix_handle_t,
                                                            nrows::Int64, ncols::Int64,
-                                                           index::onemklIndex,
+                                                           nnz::Int64, index::ZePtr{Int64},
                                                            row_ptr::ZePtr{Int64},
-                                                           col_ind::ZePtr{Int64},
-                                                           values::ZePtr{Cdouble})::Cint
+                                                           col_ind::ZePtr{Cdouble},
+                                                           values::Ptr{Cdouble})::Cint
 end
 
-function onemklCsparse_set_csr_data(device_queue, spMat, nrows, ncols, index, row_ptr,
+function onemklCsparse_set_csr_data(device_queue, spmat, nrows, ncols, nnz, index, row_ptr,
                                     col_ind, values)
     @ccall liboneapi_support.onemklCsparse_set_csr_data(device_queue::syclQueue_t,
-                                                        spMat::matrix_handle_t,
-                                                        nrows::Int32, ncols::Int32,
-                                                        index::onemklIndex,
+                                                        spmat::matrix_handle_t,
+                                                        nrows::Int64, ncols::Int64,
+                                                        nnz::Int64, index::ZePtr{Int32},
                                                         row_ptr::ZePtr{Int32},
-                                                        col_ind::ZePtr{Int32},
-                                                        values::ZePtr{ComplexF32})::Cint
+                                                        col_ind::ZePtr{ComplexF32},
+                                                        values::Ptr{ComplexF32})::Cint
 end
 
-function onemklCsparse_set_csr_data_64(device_queue, spMat, nrows, ncols, index, row_ptr,
-                                       col_ind, values)
+function onemklCsparse_set_csr_data_64(device_queue, spmat, nrows, ncols, nnz, index,
+                                       row_ptr, col_ind, values)
     @ccall liboneapi_support.onemklCsparse_set_csr_data_64(device_queue::syclQueue_t,
-                                                           spMat::matrix_handle_t,
+                                                           spmat::matrix_handle_t,
                                                            nrows::Int64, ncols::Int64,
-                                                           index::onemklIndex,
+                                                           nnz::Int64, index::ZePtr{Int64},
                                                            row_ptr::ZePtr{Int64},
-                                                           col_ind::ZePtr{Int64},
-                                                           values::ZePtr{ComplexF32})::Cint
+                                                           col_ind::ZePtr{ComplexF32},
+                                                           values::Ptr{ComplexF32})::Cint
 end
 
-function onemklZsparse_set_csr_data(device_queue, spMat, nrows, ncols, index, row_ptr,
+function onemklZsparse_set_csr_data(device_queue, spmat, nrows, ncols, nnz, index, row_ptr,
                                     col_ind, values)
     @ccall liboneapi_support.onemklZsparse_set_csr_data(device_queue::syclQueue_t,
-                                                        spMat::matrix_handle_t,
-                                                        nrows::Int32, ncols::Int32,
-                                                        index::onemklIndex,
+                                                        spmat::matrix_handle_t,
+                                                        nrows::Int64, ncols::Int64,
+                                                        nnz::Int64, index::ZePtr{Int32},
                                                         row_ptr::ZePtr{Int32},
-                                                        col_ind::ZePtr{Int32},
-                                                        values::ZePtr{ComplexF64})::Cint
+                                                        col_ind::ZePtr{ComplexF64},
+                                                        values::Ptr{ComplexF32})::Cint
 end
 
-function onemklZsparse_set_csr_data_64(device_queue, spMat, nrows, ncols, index, row_ptr,
-                                       col_ind, values)
+function onemklZsparse_set_csr_data_64(device_queue, spmat, nrows, ncols, nnz, index,
+                                       row_ptr, col_ind, values)
     @ccall liboneapi_support.onemklZsparse_set_csr_data_64(device_queue::syclQueue_t,
-                                                           spMat::matrix_handle_t,
+                                                           spmat::matrix_handle_t,
                                                            nrows::Int64, ncols::Int64,
-                                                           index::onemklIndex,
+                                                           nnz::Int64, index::ZePtr{Int64},
                                                            row_ptr::ZePtr{Int64},
-                                                           col_ind::ZePtr{Int64},
-                                                           values::ZePtr{ComplexF64})::Cint
+                                                           col_ind::ZePtr{ComplexF64},
+                                                           values::Ptr{ComplexF32})::Cint
+end
+
+function onemklSsparse_set_csc_data(device_queue, spMat, nrows, ncols, nnz, index, col_ptr,
+                                    row_ind, values)
+    @ccall liboneapi_support.onemklSsparse_set_csc_data(device_queue::syclQueue_t,
+                                                        spMat::matrix_handle_t,
+                                                        nrows::Int64, ncols::Int64,
+                                                        nnz::Int64, index::onemklIndex,
+                                                        col_ptr::Ptr{Int32},
+                                                        row_ind::Ptr{Int32},
+                                                        values::Ptr{Cfloat})::Cint
+end
+
+function onemklSsparse_set_csc_data_64(device_queue, spMat, nrows, ncols, nnz, index,
+                                       col_ptr, row_ind, values)
+    @ccall liboneapi_support.onemklSsparse_set_csc_data_64(device_queue::syclQueue_t,
+                                                           spMat::matrix_handle_t,
+                                                           nrows::Int64, ncols::Int64,
+                                                           nnz::Int64, index::onemklIndex,
+                                                           col_ptr::Ptr{Int64},
+                                                           row_ind::Ptr{Int64},
+                                                           values::Ptr{Cfloat})::Cint
+end
+
+function onemklDsparse_set_csc_data(device_queue, spMat, nrows, ncols, nnz, index, col_ptr,
+                                    row_ind, values)
+    @ccall liboneapi_support.onemklDsparse_set_csc_data(device_queue::syclQueue_t,
+                                                        spMat::matrix_handle_t,
+                                                        nrows::Int64, ncols::Int64,
+                                                        nnz::Int64, index::onemklIndex,
+                                                        col_ptr::Ptr{Int32},
+                                                        row_ind::Ptr{Int32},
+                                                        values::Ptr{Cdouble})::Cint
+end
+
+function onemklDsparse_set_csc_data_64(device_queue, spMat, nrows, ncols, nnz, index,
+                                       col_ptr, row_ind, values)
+    @ccall liboneapi_support.onemklDsparse_set_csc_data_64(device_queue::syclQueue_t,
+                                                           spMat::matrix_handle_t,
+                                                           nrows::Int64, ncols::Int64,
+                                                           nnz::Int64, index::onemklIndex,
+                                                           col_ptr::Ptr{Int64},
+                                                           row_ind::Ptr{Int64},
+                                                           values::Ptr{Cdouble})::Cint
+end
+
+function onemklCsparse_set_csc_data(device_queue, spMat, nrows, ncols, nnz, index, col_ptr,
+                                    row_ind, values)
+    @ccall liboneapi_support.onemklCsparse_set_csc_data(device_queue::syclQueue_t,
+                                                        spMat::matrix_handle_t,
+                                                        nrows::Int64, ncols::Int64,
+                                                        nnz::Int64, index::onemklIndex,
+                                                        col_ptr::Ptr{Int32},
+                                                        row_ind::Ptr{Int32},
+                                                        values::Ptr{ComplexF32})::Cint
+end
+
+function onemklCsparse_set_csc_data_64(device_queue, spMat, nrows, ncols, nnz, index,
+                                       col_ptr, row_ind, values)
+    @ccall liboneapi_support.onemklCsparse_set_csc_data_64(device_queue::syclQueue_t,
+                                                           spMat::matrix_handle_t,
+                                                           nrows::Int64, ncols::Int64,
+                                                           nnz::Int64, index::onemklIndex,
+                                                           col_ptr::Ptr{Int64},
+                                                           row_ind::Ptr{Int64},
+                                                           values::Ptr{ComplexF32})::Cint
 end
 
-function onemklSsparse_set_coo_data(device_queue, spMat, nrows, ncols, nnz, index, row_ind,
+function onemklZsparse_set_csc_data(device_queue, spMat, nrows, ncols, nnz, index, col_ptr,
+                                    row_ind, values)
+    @ccall liboneapi_support.onemklZsparse_set_csc_data(device_queue::syclQueue_t,
+                                                        spMat::matrix_handle_t,
+                                                        nrows::Int64, ncols::Int64,
+                                                        nnz::Int64, index::onemklIndex,
+                                                        col_ptr::Ptr{Int32},
+                                                        row_ind::Ptr{Int32},
+                                                        values::Ptr{ComplexF32})::Cint
+end
+
+function onemklZsparse_set_csc_data_64(device_queue, spMat, nrows, ncols, nnz, index,
+                                       col_ptr, row_ind, values)
+    @ccall liboneapi_support.onemklZsparse_set_csc_data_64(device_queue::syclQueue_t,
+                                                           spMat::matrix_handle_t,
+                                                           nrows::Int64, ncols::Int64,
+                                                           nnz::Int64, index::onemklIndex,
+                                                           col_ptr::Ptr{Int64},
+                                                           row_ind::Ptr{Int64},
+                                                           values::Ptr{ComplexF32})::Cint
+end
+
+function onemklSsparse_set_coo_data(device_queue, spmat, nrows, ncols, nnz, index, row_ind,
                                     col_ind, values)
     @ccall liboneapi_support.onemklSsparse_set_coo_data(device_queue::syclQueue_t,
-                                                        spMat::matrix_handle_t,
+                                                        spmat::matrix_handle_t,
                                                         nrows::Int32, ncols::Int32,
                                                         nnz::Int32, index::onemklIndex,
                                                         row_ind::ZePtr{Int32},
@@ -6536,10 +6624,10 @@ function onemklSsparse_set_coo_data(device_queue, spMat, nrows, ncols, nnz, inde
                                                         values::ZePtr{Cfloat})::Cint
 end
 
-function onemklSsparse_set_coo_data_64(device_queue, spMat, nrows, ncols, nnz, index,
+function onemklSsparse_set_coo_data_64(device_queue, spmat, nrows, ncols, nnz, index,
                                        row_ind, col_ind, values)
     @ccall liboneapi_support.onemklSsparse_set_coo_data_64(device_queue::syclQueue_t,
-                                                           spMat::matrix_handle_t,
+                                                           spmat::matrix_handle_t,
                                                            nrows::Int64, ncols::Int64,
                                                            nnz::Int64, index::onemklIndex,
                                                            row_ind::ZePtr{Int64},
@@ -6547,10 +6635,10 @@ function onemklSsparse_set_coo_data_64(device_queue, spMat, nrows, ncols, nnz, i
                                                            values::ZePtr{Cfloat})::Cint
 end
 
-function onemklDsparse_set_coo_data(device_queue, spMat, nrows, ncols, nnz, index, row_ind,
+function onemklDsparse_set_coo_data(device_queue, spmat, nrows, ncols, nnz, index, row_ind,
                                     col_ind, values)
     @ccall liboneapi_support.onemklDsparse_set_coo_data(device_queue::syclQueue_t,
-                                                        spMat::matrix_handle_t,
+                                                        spmat::matrix_handle_t,
                                                         nrows::Int32, ncols::Int32,
                                                         nnz::Int32, index::onemklIndex,
                                                         row_ind::ZePtr{Int32},
@@ -6558,10 +6646,10 @@ function onemklDsparse_set_coo_data(device_queue, spMat, nrows, ncols, nnz, inde
                                                         values::ZePtr{Cdouble})::Cint
 end
 
-function onemklDsparse_set_coo_data_64(device_queue, spMat, nrows, ncols, nnz, index,
+function onemklDsparse_set_coo_data_64(device_queue, spmat, nrows, ncols, nnz, index,
                                        row_ind, col_ind, values)
     @ccall liboneapi_support.onemklDsparse_set_coo_data_64(device_queue::syclQueue_t,
-                                                           spMat::matrix_handle_t,
+                                                           spmat::matrix_handle_t,
                                                            nrows::Int64, ncols::Int64,
                                                            nnz::Int64, index::onemklIndex,
                                                            row_ind::ZePtr{Int64},
@@ -6569,10 +6657,10 @@ function onemklDsparse_set_coo_data_64(device_queue, spMat, nrows, ncols, nnz, i
                                                            values::ZePtr{Cdouble})::Cint
 end
 
-function onemklCsparse_set_coo_data(device_queue, spMat, nrows, ncols, nnz, index, row_ind,
+function onemklCsparse_set_coo_data(device_queue, spmat, nrows, ncols, nnz, index, row_ind,
                                     col_ind, values)
     @ccall liboneapi_support.onemklCsparse_set_coo_data(device_queue::syclQueue_t,
-                                                        spMat::matrix_handle_t,
+                                                        spmat::matrix_handle_t,
                                                         nrows::Int32, ncols::Int32,
                                                         nnz::Int32, index::onemklIndex,
                                                         row_ind::ZePtr{Int32},
@@ -6580,10 +6668,10 @@ function onemklCsparse_set_coo_data(device_queue, spMat, nrows, ncols, nnz, inde
                                                         values::ZePtr{ComplexF32})::Cint
 end
 
-function onemklCsparse_set_coo_data_64(device_queue, spMat, nrows, ncols, nnz, index,
+function onemklCsparse_set_coo_data_64(device_queue, spmat, nrows, ncols, nnz, index,
                                        row_ind, col_ind, values)
     @ccall liboneapi_support.onemklCsparse_set_coo_data_64(device_queue::syclQueue_t,
-                                                           spMat::matrix_handle_t,
+                                                           spmat::matrix_handle_t,
                                                            nrows::Int64, ncols::Int64,
                                                            nnz::Int64, index::onemklIndex,
                                                            row_ind::ZePtr{Int64},
@@ -6591,10 +6679,10 @@ function onemklCsparse_set_coo_data_64(device_queue, spMat, nrows, ncols, nnz, i
                                                            values::ZePtr{ComplexF32})::Cint
 end
 
-function onemklZsparse_set_coo_data(device_queue, spMat, nrows, ncols, nnz, index, row_ind,
+function onemklZsparse_set_coo_data(device_queue, spmat, nrows, ncols, nnz, index, row_ind,
                                     col_ind, values)
     @ccall liboneapi_support.onemklZsparse_set_coo_data(device_queue::syclQueue_t,
-                                                        spMat::matrix_handle_t,
+                                                        spmat::matrix_handle_t,
                                                         nrows::Int32, ncols::Int32,
                                                         nnz::Int32, index::onemklIndex,
                                                         row_ind::ZePtr{Int32},
@@ -6602,10 +6690,10 @@ function onemklZsparse_set_coo_data(device_queue, spMat, nrows, ncols, nnz, inde
                                                         values::ZePtr{ComplexF64})::Cint
 end
 
-function onemklZsparse_set_coo_data_64(device_queue, spMat, nrows, ncols, nnz, index,
+function onemklZsparse_set_coo_data_64(device_queue, spmat, nrows, ncols, nnz, index,
                                        row_ind, col_ind, values)
     @ccall liboneapi_support.onemklZsparse_set_coo_data_64(device_queue::syclQueue_t,
-                                                           spMat::matrix_handle_t,
+                                                           spmat::matrix_handle_t,
                                                            nrows::Int64, ncols::Int64,
                                                            nnz::Int64, index::onemklIndex,
                                                            row_ind::ZePtr{Int64},
@@ -6613,6 +6701,130 @@ function onemklZsparse_set_coo_data_64(device_queue, spMat, nrows, ncols, nnz, i
                                                            values::ZePtr{ComplexF64})::Cint
 end
 
+function onemklSsparse_set_bsr_data(device_queue, spmat, blk_nrows, blk_ncols, blk_nnz,
+                                    row_blk_size, col_blk_size, blk_layout, index,
+                                    bsr_row_ptr, bsr_col_ind, bsr_values)
+    @ccall liboneapi_support.onemklSsparse_set_bsr_data(device_queue::syclQueue_t,
+                                                        spmat::matrix_handle_t,
+                                                        blk_nrows::Int64, blk_ncols::Int64,
+                                                        blk_nnz::Int64, row_blk_size::Int64,
+                                                        col_blk_size::Int64,
+                                                        blk_layout::onemklLayout,
+                                                        index::onemklIndex,
+                                                        bsr_row_ptr::Ptr{Int32},
+                                                        bsr_col_ind::Ptr{Int32},
+                                                        bsr_values::Ptr{Cfloat})::Cint
+end
+
+function onemklSsparse_set_bsr_data_64(device_queue, spmat, blk_nrows, blk_ncols, blk_nnz,
+                                       row_blk_size, col_blk_size, blk_layout, index,
+                                       bsr_row_ptr, bsr_col_ind, bsr_values)
+    @ccall liboneapi_support.onemklSsparse_set_bsr_data_64(device_queue::syclQueue_t,
+                                                           spmat::matrix_handle_t,
+                                                           blk_nrows::Int64,
+                                                           blk_ncols::Int64, blk_nnz::Int64,
+                                                           row_blk_size::Int64,
+                                                           col_blk_size::Int64,
+                                                           blk_layout::onemklLayout,
+                                                           index::onemklIndex,
+                                                           bsr_row_ptr::Ptr{Int64},
+                                                           bsr_col_ind::Ptr{Int64},
+                                                           bsr_values::Ptr{Cfloat})::Cint
+end
+
+function onemklDsparse_set_bsr_data(device_queue, spmat, blk_nrows, blk_ncols, blk_nnz,
+                                    row_blk_size, col_blk_size, blk_layout, index,
+                                    bsr_row_ptr, bsr_col_ind, bsr_values)
+    @ccall liboneapi_support.onemklDsparse_set_bsr_data(device_queue::syclQueue_t,
+                                                        spmat::matrix_handle_t,
+                                                        blk_nrows::Int64, blk_ncols::Int64,
+                                                        blk_nnz::Int64, row_blk_size::Int64,
+                                                        col_blk_size::Int64,
+                                                        blk_layout::onemklLayout,
+                                                        index::onemklIndex,
+                                                        bsr_row_ptr::Ptr{Int32},
+                                                        bsr_col_ind::Ptr{Int32},
+                                                        bsr_values::Ptr{Cdouble})::Cint
+end
+
+function onemklDsparse_set_bsr_data_64(device_queue, spmat, blk_nrows, blk_ncols, blk_nnz,
+                                       row_blk_size, col_blk_size, blk_layout, index,
+                                       bsr_row_ptr, bsr_col_ind, bsr_values)
+    @ccall liboneapi_support.onemklDsparse_set_bsr_data_64(device_queue::syclQueue_t,
+                                                           spmat::matrix_handle_t,
+                                                           blk_nrows::Int64,
+                                                           blk_ncols::Int64, blk_nnz::Int64,
+                                                           row_blk_size::Int64,
+                                                           col_blk_size::Int64,
+                                                           blk_layout::onemklLayout,
+                                                           index::onemklIndex,
+                                                           bsr_row_ptr::Ptr{Int64},
+                                                           bsr_col_ind::Ptr{Int64},
+                                                           bsr_values::Ptr{Cdouble})::Cint
+end
+
+function onemklCsparse_set_bsr_data(device_queue, spmat, blk_nrows, blk_ncols, blk_nnz,
+                                    row_blk_size, col_blk_size, blk_layout, index,
+                                    bsr_row_ptr, bsr_col_ind, bsr_values)
+    @ccall liboneapi_support.onemklCsparse_set_bsr_data(device_queue::syclQueue_t,
+                                                        spmat::matrix_handle_t,
+                                                        blk_nrows::Int64, blk_ncols::Int64,
+                                                        blk_nnz::Int64, row_blk_size::Int64,
+                                                        col_blk_size::Int64,
+                                                        blk_layout::onemklLayout,
+                                                        index::onemklIndex,
+                                                        bsr_row_ptr::Ptr{Int32},
+                                                        bsr_col_ind::Ptr{Int32},
+                                                        bsr_values::Ptr{ComplexF32})::Cint
+end
+
+function onemklCsparse_set_bsr_data_64(device_queue, spmat, blk_nrows, blk_ncols, blk_nnz,
+                                       row_blk_size, col_blk_size, blk_layout, index,
+                                       bsr_row_ptr, bsr_col_ind, bsr_values)
+    @ccall liboneapi_support.onemklCsparse_set_bsr_data_64(device_queue::syclQueue_t,
+                                                           spmat::matrix_handle_t,
+                                                           blk_nrows::Int64,
+                                                           blk_ncols::Int64, blk_nnz::Int64,
+                                                           row_blk_size::Int64,
+                                                           col_blk_size::Int64,
+                                                           blk_layout::onemklLayout,
+                                                           index::onemklIndex,
+                                                           bsr_row_ptr::Ptr{Int64},
+                                                           bsr_col_ind::Ptr{Int64},
+                                                           bsr_values::Ptr{ComplexF32})::Cint
+end
+
+function onemklZsparse_set_bsr_data(device_queue, spmat, blk_nrows, blk_ncols, blk_nnz,
+                                    row_blk_size, col_blk_size, blk_layout, index,
+                                    bsr_row_ptr, bsr_col_ind, bsr_values)
+    @ccall liboneapi_support.onemklZsparse_set_bsr_data(device_queue::syclQueue_t,
+                                                        spmat::matrix_handle_t,
+                                                        blk_nrows::Int64, blk_ncols::Int64,
+                                                        blk_nnz::Int64, row_blk_size::Int64,
+                                                        col_blk_size::Int64,
+                                                        blk_layout::onemklLayout,
+                                                        index::onemklIndex,
+                                                        bsr_row_ptr::Ptr{Int32},
+                                                        bsr_col_ind::Ptr{Int32},
+                                                        bsr_values::Ptr{ComplexF32})::Cint
+end
+
+function onemklZsparse_set_bsr_data_64(device_queue, spmat, blk_nrows, blk_ncols, blk_nnz,
+                                       row_blk_size, col_blk_size, blk_layout, index,
+                                       bsr_row_ptr, bsr_col_ind, bsr_values)
+    @ccall liboneapi_support.onemklZsparse_set_bsr_data_64(device_queue::syclQueue_t,
+                                                           spmat::matrix_handle_t,
+                                                           blk_nrows::Int64,
+                                                           blk_ncols::Int64, blk_nnz::Int64,
+                                                           row_blk_size::Int64,
+                                                           col_blk_size::Int64,
+                                                           blk_layout::onemklLayout,
+                                                           index::onemklIndex,
+                                                           bsr_row_ptr::Ptr{Int64},
+                                                           bsr_col_ind::Ptr{Int64},
+                                                           bsr_values::Ptr{ComplexF32})::Cint
+end
+
 function onemklXsparse_init_matmat_descr(p_desc)
     @ccall liboneapi_support.onemklXsparse_init_matmat_descr(p_desc::Ptr{matmat_descr_t})::Cint
 end
diff --git a/res/Project.toml b/res/Project.toml
index f7b4a7fe..20a2ab05 100644
--- a/res/Project.toml
+++ b/res/Project.toml
@@ -1,4 +1,4 @@
 [deps]
 Clang = "40e3b903-d033-50b4-a0cc-940c62c95e31"
 JuliaFormatter = "98e50ef6-434e-11e9-1051-2b60c6c9e899"
-oneAPI_Level_Zero_Headers_jll = "f4bc562b-d309-54f8-9efb-476e56f0410d"
+oneAPI_Level_Zero_Headers_LTS_jll = "d79c0b2e-896c-561b-aab9-323701ec0314"
diff --git a/res/wrap.jl b/res/wrap.jl
index 1d48315e..a86428d2 100644
--- a/res/wrap.jl
+++ b/res/wrap.jl
@@ -108,7 +108,7 @@ end
 # Main application
 #
 
-using oneAPI_Level_Zero_Headers_jll
+import oneAPI_Level_Zero_Headers_LTS_jll as oneAPI_Level_Zero_Headers_jll
 
 function main()
     wrap("ze", oneAPI_Level_Zero_Headers_jll.ze_api)
diff --git a/src/oneAPI.jl b/src/oneAPI.jl
index 33afe596..6b65d744 100644
--- a/src/oneAPI.jl
+++ b/src/oneAPI.jl
@@ -148,7 +148,7 @@ function __init__()
 end
 
 function set_debug!(debug::Bool)
-    for jll in [oneL0.NEO_jll, oneL0.NEO_jll.libigc_jll]
+    for jll in [oneL0.NEO_jll, oneL0.NEO_jll.libigc_LTS_jll]
         Preferences.set_preferences!(jll, "debug" => string(debug); force=true)
     end
     @info "oneAPI debug mode $(debug ? "enabled" : "disabled"); please re-start Julia."
diff --git a/src/utils.jl b/src/utils.jl
index 0516de03..e7d232ec 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -2,7 +2,7 @@
 function versioninfo(io::IO=stdout)
     if Sys.islinux()
         println(io, "Binary dependencies:")
-        for jll in [oneL0.NEO_jll, oneL0.NEO_jll.libigc_jll, oneL0.NEO_jll.gmmlib_jll,
+        for jll in [oneL0.NEO_jll, oneL0.NEO_jll.libigc_LTS_jll, oneL0.NEO_jll.gmmlib_jll,
                     SPIRV_LLVM_Translator_jll, SPIRV_Tools_jll, oneAPI_Support_jll]
             name = string(jll)
             print(io, "- $(name[1:end-4]): $(Base.pkgversion(jll))")
diff --git a/test/Project.toml b/test/Project.toml
index 1e877a7e..fa5557e9 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -9,7 +9,7 @@ InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
-NEO_jll = "700fe977-ac61-5f37-bbc8-c6c4b2b6a9fd"
+NEO_LTS_jll = "a724f90f-ce79-56dd-a1bd-b9de5a61085f"
 ParallelTestRunner = "d3525ed8-44d0-4b2c-a655-542cee43accc"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 REPL = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
@@ -19,7 +19,7 @@ SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
-libigc_jll = "94295238-5935-5bd7-bb0f-b00942e9bdd5"
+libigc_LTS_jll = "9a8258a1-e827-5686-bee9-144461246960"
 oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
 oneAPI_Support_jll = "b049733a-a71d-5ed3-8eba-7d323ac00b36"
 

From d427139d6a9dcd9c97339e61b44c37caacd4da9b Mon Sep 17 00:00:00 2001
From: Michel Schanen <michel.schanen@gmail.com>
Date: Mon, 11 May 2026 11:56:39 -0500
Subject: [PATCH 02/12] NEO link fix

---
 src/oneAPI.jl | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/oneAPI.jl b/src/oneAPI.jl
index 6b65d744..5e6b349f 100644
--- a/src/oneAPI.jl
+++ b/src/oneAPI.jl
@@ -138,6 +138,15 @@ function __init__()
         else
             # ensure that the OpenCL loader finds the ICD files from our artifacts
             ENV["OCL_ICD_FILENAMES"] = oneL0.NEO_jll.libigdrcl
+
+            # ensure that libsycl's bundled ze_lib finds NEO's libze_intel_gpu via
+            # path-based driver discovery (it does not reuse the JLL-loaded module).
+            # Required when no system NEO is installed.
+            neo_libdir = dirname(oneL0.NEO_jll.libze_intel_gpu)
+            ld = get(ENV, "LD_LIBRARY_PATH", "")
+            if !occursin(neo_libdir, ld)
+                ENV["LD_LIBRARY_PATH"] = isempty(ld) ? neo_libdir : "$neo_libdir:$ld"
+            end
         end
 
         # XXX: work around an issue with SYCL/Level Zero interoperability

From 41bd558aab18ea78c6914330a9295e0236e4815c Mon Sep 17 00:00:00 2001
From: Michel Schanen <mschanen@anl.gov>
Date: Fri, 5 Jun 2026 17:17:40 +0000
Subject: [PATCH 03/12] Fix oneMKL sparse CSR bindings for the LTS stack

---
 lib/mkl/wrappers_sparse.jl       |  4 +--
 lib/support/liboneapi_support.jl | 48 ++++++++++++++++----------------
 res/support.toml                 | 10 ++++---
 3 files changed, 32 insertions(+), 30 deletions(-)

diff --git a/lib/mkl/wrappers_sparse.jl b/lib/mkl/wrappers_sparse.jl
index 8e58956b..92cd0d43 100644
--- a/lib/mkl/wrappers_sparse.jl
+++ b/lib/mkl/wrappers_sparse.jl
@@ -60,7 +60,7 @@ for (fname, elty, intty) in ((:onemklSsparse_set_csr_data   , :Float32   , :Int3
             queue = global_queue(context(nzVal), device(nzVal))
             # Don't update handle if matrix is empty
             if m != 0 && n != 0
-                $fname(sycl_queue(queue), handle_ptr[], m, n, 'O', rowPtr, colVal, nzVal)
+                $fname(sycl_queue(queue), handle_ptr[], m, n, nnzA, 'O', rowPtr, colVal, nzVal)
                 dA = oneSparseMatrixCSR{$elty, $intty}(handle_ptr[], rowPtr, colVal, nzVal, (m, n), nnzA)
                 finalizer(sparse_release_matrix_handle, dA)
             else
@@ -81,7 +81,7 @@ for (fname, elty, intty) in ((:onemklSsparse_set_csr_data   , :Float32   , :Int3
             nnzA = length(nzVal)
             # Don't update handle if matrix is empty
             if m != 0 && n != 0
-                $fname(sycl_queue(queue), handle_ptr[], n, m, 'O', colPtr, rowVal, nzVal)  # CSC of A is CSR of Aᵀ
+                $fname(sycl_queue(queue), handle_ptr[], n, m, nnzA, 'O', colPtr, rowVal, nzVal)  # CSC of A is CSR of Aᵀ
                 dA = oneSparseMatrixCSC{$elty, $intty}(handle_ptr[], colPtr, rowVal, nzVal, (m, n), nnzA)
                 finalizer(sparse_release_matrix_handle, dA)
             else
diff --git a/lib/support/liboneapi_support.jl b/lib/support/liboneapi_support.jl
index 5f30a9f1..e1b7327f 100644
--- a/lib/support/liboneapi_support.jl
+++ b/lib/support/liboneapi_support.jl
@@ -6442,10 +6442,10 @@ function onemklSsparse_set_csr_data(device_queue, spmat, nrows, ncols, nnz, inde
     @ccall liboneapi_support.onemklSsparse_set_csr_data(device_queue::syclQueue_t,
                                                         spmat::matrix_handle_t,
                                                         nrows::Int64, ncols::Int64,
-                                                        nnz::Int64, index::ZePtr{Int32},
+                                                        nnz::Int64, index::onemklIndex,
                                                         row_ptr::ZePtr{Int32},
-                                                        col_ind::ZePtr{Cfloat},
-                                                        values::Ptr{Cfloat})::Cint
+                                                        col_ind::ZePtr{Int32},
+                                                        values::ZePtr{Cfloat})::Cint
 end
 
 function onemklSsparse_set_csr_data_64(device_queue, spmat, nrows, ncols, nnz, index,
@@ -6453,10 +6453,10 @@ function onemklSsparse_set_csr_data_64(device_queue, spmat, nrows, ncols, nnz, i
     @ccall liboneapi_support.onemklSsparse_set_csr_data_64(device_queue::syclQueue_t,
                                                            spmat::matrix_handle_t,
                                                            nrows::Int64, ncols::Int64,
-                                                           nnz::Int64, index::ZePtr{Int64},
+                                                           nnz::Int64, index::onemklIndex,
                                                            row_ptr::ZePtr{Int64},
-                                                           col_ind::ZePtr{Cfloat},
-                                                           values::Ptr{Cfloat})::Cint
+                                                           col_ind::ZePtr{Int64},
+                                                           values::ZePtr{Cfloat})::Cint
 end
 
 function onemklDsparse_set_csr_data(device_queue, spmat, nrows, ncols, nnz, index, row_ptr,
@@ -6464,10 +6464,10 @@ function onemklDsparse_set_csr_data(device_queue, spmat, nrows, ncols, nnz, inde
     @ccall liboneapi_support.onemklDsparse_set_csr_data(device_queue::syclQueue_t,
                                                         spmat::matrix_handle_t,
                                                         nrows::Int64, ncols::Int64,
-                                                        nnz::Int64, index::ZePtr{Int32},
+                                                        nnz::Int64, index::onemklIndex,
                                                         row_ptr::ZePtr{Int32},
-                                                        col_ind::ZePtr{Cdouble},
-                                                        values::Ptr{Cdouble})::Cint
+                                                        col_ind::ZePtr{Int32},
+                                                        values::ZePtr{Cdouble})::Cint
 end
 
 function onemklDsparse_set_csr_data_64(device_queue, spmat, nrows, ncols, nnz, index,
@@ -6475,10 +6475,10 @@ function onemklDsparse_set_csr_data_64(device_queue, spmat, nrows, ncols, nnz, i
     @ccall liboneapi_support.onemklDsparse_set_csr_data_64(device_queue::syclQueue_t,
                                                            spmat::matrix_handle_t,
                                                            nrows::Int64, ncols::Int64,
-                                                           nnz::Int64, index::ZePtr{Int64},
+                                                           nnz::Int64, index::onemklIndex,
                                                            row_ptr::ZePtr{Int64},
-                                                           col_ind::ZePtr{Cdouble},
-                                                           values::Ptr{Cdouble})::Cint
+                                                           col_ind::ZePtr{Int64},
+                                                           values::ZePtr{Cdouble})::Cint
 end
 
 function onemklCsparse_set_csr_data(device_queue, spmat, nrows, ncols, nnz, index, row_ptr,
@@ -6486,10 +6486,10 @@ function onemklCsparse_set_csr_data(device_queue, spmat, nrows, ncols, nnz, inde
     @ccall liboneapi_support.onemklCsparse_set_csr_data(device_queue::syclQueue_t,
                                                         spmat::matrix_handle_t,
                                                         nrows::Int64, ncols::Int64,
-                                                        nnz::Int64, index::ZePtr{Int32},
+                                                        nnz::Int64, index::onemklIndex,
                                                         row_ptr::ZePtr{Int32},
-                                                        col_ind::ZePtr{ComplexF32},
-                                                        values::Ptr{ComplexF32})::Cint
+                                                        col_ind::ZePtr{Int32},
+                                                        values::ZePtr{ComplexF32})::Cint
 end
 
 function onemklCsparse_set_csr_data_64(device_queue, spmat, nrows, ncols, nnz, index,
@@ -6497,10 +6497,10 @@ function onemklCsparse_set_csr_data_64(device_queue, spmat, nrows, ncols, nnz, i
     @ccall liboneapi_support.onemklCsparse_set_csr_data_64(device_queue::syclQueue_t,
                                                            spmat::matrix_handle_t,
                                                            nrows::Int64, ncols::Int64,
-                                                           nnz::Int64, index::ZePtr{Int64},
+                                                           nnz::Int64, index::onemklIndex,
                                                            row_ptr::ZePtr{Int64},
-                                                           col_ind::ZePtr{ComplexF32},
-                                                           values::Ptr{ComplexF32})::Cint
+                                                           col_ind::ZePtr{Int64},
+                                                           values::ZePtr{ComplexF32})::Cint
 end
 
 function onemklZsparse_set_csr_data(device_queue, spmat, nrows, ncols, nnz, index, row_ptr,
@@ -6508,10 +6508,10 @@ function onemklZsparse_set_csr_data(device_queue, spmat, nrows, ncols, nnz, inde
     @ccall liboneapi_support.onemklZsparse_set_csr_data(device_queue::syclQueue_t,
                                                         spmat::matrix_handle_t,
                                                         nrows::Int64, ncols::Int64,
-                                                        nnz::Int64, index::ZePtr{Int32},
+                                                        nnz::Int64, index::onemklIndex,
                                                         row_ptr::ZePtr{Int32},
-                                                        col_ind::ZePtr{ComplexF64},
-                                                        values::Ptr{ComplexF32})::Cint
+                                                        col_ind::ZePtr{Int32},
+                                                        values::ZePtr{ComplexF64})::Cint
 end
 
 function onemklZsparse_set_csr_data_64(device_queue, spmat, nrows, ncols, nnz, index,
@@ -6519,10 +6519,10 @@ function onemklZsparse_set_csr_data_64(device_queue, spmat, nrows, ncols, nnz, i
     @ccall liboneapi_support.onemklZsparse_set_csr_data_64(device_queue::syclQueue_t,
                                                            spmat::matrix_handle_t,
                                                            nrows::Int64, ncols::Int64,
-                                                           nnz::Int64, index::ZePtr{Int64},
+                                                           nnz::Int64, index::onemklIndex,
                                                            row_ptr::ZePtr{Int64},
-                                                           col_ind::ZePtr{ComplexF64},
-                                                           values::Ptr{ComplexF32})::Cint
+                                                           col_ind::ZePtr{Int64},
+                                                           values::ZePtr{ComplexF64})::Cint
 end
 
 function onemklSsparse_set_csc_data(device_queue, spMat, nrows, ncols, nnz, index, col_ptr,
diff --git a/res/support.toml b/res/support.toml
index 230ac406..ac641b90 100644
--- a/res/support.toml
+++ b/res/support.toml
@@ -370,15 +370,17 @@ use_ccall_macro = true
 6 = "ZePtr{Float32}"
 8 = "Ref{Float32}"
 
+# Argument positions account for the `nnz` parameter (arg 5) and the
+# `index::onemklIndex` enum (arg 6); the device pointers are args 7-9.
 [api.onemklXsparse_set_csr_data.argtypes]
-6 = "ZePtr{Int32}"
 7 = "ZePtr{Int32}"
-8 = "ZePtr{T}"
+8 = "ZePtr{Int32}"
+9 = "ZePtr{T}"
 
 [api.onemklXsparse_set_csr_data_64.argtypes]
-6 = "ZePtr{Int64}"
 7 = "ZePtr{Int64}"
-8 = "ZePtr{T}"
+8 = "ZePtr{Int64}"
+9 = "ZePtr{T}"
 
 [api.onemklXsparse_set_coo_data.argtypes]
 7 = "ZePtr{Int32}"

From c6d068cc986a715f1e050cac2c95ead88b978f99 Mon Sep 17 00:00:00 2001
From: Michel Schanen <mschanen@anl.gov>
Date: Tue, 9 Jun 2026 16:02:43 +0000
Subject: [PATCH 04/12] Fix silent corruption in strided mapreducedim! on the
 LTS stack

---
 src/mapreduce.jl | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index 822b9b16..28725c84 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -105,6 +105,32 @@ function partial_mapreduce_device(f, op, neutral, maxitems, Rreduce, Rother, R,
     return
 end
 
+# Coalesced reduction for when the contiguous leading dimension is NOT reduced (the reduced
+# axes are strided). One work-item per output slice (Rother element), grid-strided; each
+# serially reduces over Rreduce. Consecutive work-items map to consecutive output slices,
+# which are consecutive in memory, so global reads are coalesced across lanes — the access
+# pattern a `dims=1` reduction already uses. On the Aurora LTS stack the workgroup-per-slice
+# kernel above reads a *strided* reduced dimension non-coalesced, which silently corrupts
+# large reductions (e.g. `sum(A; dims=2)`); this path avoids that pattern entirely.
+function coalesced_mapreduce_device(f, op, neutral, Rreduce, Rother, R, As...)
+    iother = (get_group_id() - 1) * get_local_size() + get_local_id()
+    gstride = get_num_groups() * get_local_size()
+    @inbounds while iother <= length(Rother)
+        Iother = Rother[iother]
+        Iout = CartesianIndex(Tuple(Iother)..., 1)
+        neut = neutral === nothing ? R[Iout] : neutral
+        val = op(neut, neut)
+        for ireduce in 1:length(Rreduce)
+            Ireduce = Rreduce[ireduce]
+            J = max(Iother, Ireduce)
+            val = op(val, f(_map_getindex(As, J)...))
+        end
+        R[Iout] = val
+        iother += gstride
+    end
+    return
+end
+
 ## COV_EXCL_STOP
 
 function GPUArrays.mapreducedim!(f::F, op::OP, R::oneWrappedArray{T},
@@ -133,6 +159,20 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::oneWrappedArray{T},
     # but allows us to write a generalized kernel supporting partial reductions.
     R′ = reshape(R, (size(R)..., 1))
 
+    # Aurora LTS workaround: the workgroup-per-slice kernel below reads a *strided* reduced
+    # dimension non-coalesced, which silently corrupts reductions on this stack (regardless of
+    # output count — it depends on the reduction length, not the number of slices). Whenever
+    # the contiguous leading dimension is NOT reduced (`size(Rreduce, 1) == 1`), use the
+    # coalesced one-work-item-per-slice kernel, whose lanes read consecutive memory. Few-slice
+    # reductions get less parallelism but stay correct; the common many-slice case is also fast.
+    if size(Rreduce, 1) == 1
+        items = clamp(length(Rother), 1, 256)
+        groups = min(cld(length(Rother), items), 1024)
+        @oneapi items=items groups=groups coalesced_mapreduce_device(
+            f, op, init, Rreduce, Rother, R′, A)
+        return R
+    end
+
     # how many items do we want?
     #
     # items in a group work together to reduce values across the reduction dimensions;

From 34545ec8f7cd372f5ca0397349b5e0331cd9d3c2 Mon Sep 17 00:00:00 2001
From: Michel Schanen <mschanen@anl.gov>
Date: Wed, 10 Jun 2026 19:11:51 +0000
Subject: [PATCH 05/12] Synchronize queues before freeing buffers on the LTS
 stack

---
 src/context.jl | 51 +++++++++++++++++++++++++++++++++++++++++++++++++-
 src/pool.jl    |  9 +++++++++
 2 files changed, 59 insertions(+), 1 deletion(-)

diff --git a/src/context.jl b/src/context.jl
index 88035e74..14f1e593 100644
--- a/src/context.jl
+++ b/src/context.jl
@@ -224,8 +224,57 @@ function global_queue(ctx::ZeContext, dev::ZeDevice)
     # NOTE: dev purposefully does not default to context() or device() to stress that
     #       objects should track ownership, and not rely on implicit global state.
     get!(task_local_storage(), (:ZeCommandQueue, ctx, dev)) do
-        ZeCommandQueue(ctx, dev; flags = oneL0.ZE_COMMAND_QUEUE_FLAG_IN_ORDER)
+        queue = ZeCommandQueue(ctx, dev; flags = oneL0.ZE_COMMAND_QUEUE_FLAG_IN_ORDER)
+        # disable finalizers while mutating the registry: a GC-driven finalizer on this
+        # task could call back into `synchronize_all_queues` (the lock is reentrant) and
+        # observe/mutate the registry mid-update.
+        GC.enable_finalizers(false)
+        try
+            @lock queue_registry_lock begin
+                push!(get!(Vector{WeakRef}, queue_registry, (ctx, dev)), WeakRef(queue))
+            end
+        finally
+            GC.enable_finalizers(true)
+        end
+        queue
+    end
+end
+
+# Registry of all queues created through `global_queue`, across tasks. Buffers can be
+# freed from any task (GC finalizers), so `release` needs to be able to find the queues
+# that may still have work in flight referencing the buffer; queues themselves are
+# cached task-locally and would otherwise be unreachable from the finalizing task.
+const queue_registry_lock = ReentrantLock()
+const queue_registry = Dict{Tuple{ZeContext,ZeDevice},Vector{WeakRef}}()
+
+# synchronize all known queues that target the given context (and device, if specified),
+# i.e., all queues whose in-flight work could possibly reference an allocation that is
+# about to be freed.
+function synchronize_all_queues(ctx::ZeContext, dev::Union{ZeDevice,Nothing})
+    queues = ZeCommandQueue[]
+    GC.enable_finalizers(false)
+    try
+        @lock queue_registry_lock begin
+            for ((qctx, qdev), refs) in queue_registry
+                qctx == ctx || continue
+                (dev === nothing || qdev == dev) || continue
+                filter!(refs) do ref
+                    queue = ref.value
+                    queue === nothing && return false
+                    push!(queues, queue)
+                    true
+                end
+            end
+        end
+    finally
+        GC.enable_finalizers(true)
+    end
+    # synchronize outside the lock: this can block for as long as a kernel runs,
+    # and finalizers running concurrently also need to take the lock.
+    for queue in queues
+        oneL0.synchronize(queue)
     end
+    return
 end
 
 """
diff --git a/src/pool.jl b/src/pool.jl
index 165a7f07..e541d26e 100644
--- a/src/pool.jl
+++ b/src/pool.jl
@@ -77,6 +77,15 @@ function release(buf::oneL0.AbstractBuffer)
     #    evict(ctx, dev, buf)
     #end
 
+    # NEO (at least the 25.18 LTS release) does not honor the BLOCKING_FREE/DEFER_FREE
+    # policies of zeMemFreeExt: it advertises ZE_extension_memory_free_policies but
+    # unmaps the allocation immediately, even with work in flight that references it.
+    # That turns a GC-driven free of a dead array whose last kernel/copy hasn't retired
+    # into a GPU pagefault, which gets the kernel context banned and makes every later
+    # submission fail with ZE_RESULT_ERROR_UNKNOWN. Synchronize the queues that could
+    # reference this buffer before freeing.
+    synchronize_all_queues(oneL0.context(buf), oneL0.device(buf))
+
     free(buf; policy=oneL0.ZE_DRIVER_MEMORY_FREE_POLICY_EXT_FLAG_BLOCKING_FREE)
 
     # TODO: queue-ordered free from non-finalizer tasks once we have

From be62729ad47134d0e6bf457f755b63f961df72a7 Mon Sep 17 00:00:00 2001
From: Michel Schanen <mschanen@anl.gov>
Date: Wed, 10 Jun 2026 20:15:13 +0000
Subject: [PATCH 06/12] Fix GC rooting and SYCL queue lifetime in the oneMKL
 FFT wrappers

plan_fft and friends passed `pointer(lengths)`/`pointer(strides)` to the
onemklDft* ccall wrappers. A raw Ptr does not root the vector, so under GC
pressure it was collected mid-call and oneMKL read garbage through the
dangling pointer, making multi-dimensional plan commits fail
nondeterministically (invalid_descriptor_exception or SIGFPE, surfacing
as DivideError in the fft testset). Pass the arrays themselves so the
ccall roots them.

Also create the plan's SYCL queue through the cached task-local
sycl_queue() accessor like the other oneMKL wrappers, instead of fresh
syclContext/syclQueue objects per plan whose finalizers tear down SYCL
runtime state for the still-in-use underlying queue.

Harden the free-path queue synchronization against finalized queues:
ZeCommandQueue now nulls its handle on destroy, and
synchronize_all_queues skips destroyed queues and keeps finalizers
disabled until it is done synchronizing.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 lib/level-zero/cmdqueue.jl |  4 ++++
 lib/mkl/fft.jl             | 32 +++++++++++++++++++-------------
 src/context.jl             | 13 ++++++++-----
 3 files changed, 31 insertions(+), 18 deletions(-)

diff --git a/lib/level-zero/cmdqueue.jl b/lib/level-zero/cmdqueue.jl
index 3004a145..e123038a 100644
--- a/lib/level-zero/cmdqueue.jl
+++ b/lib/level-zero/cmdqueue.jl
@@ -21,6 +21,10 @@ mutable struct ZeCommandQueue
         obj = new(handle_ref[], ctx, dev, ordinal)
         finalizer(obj) do obj
             zeCommandQueueDestroy(obj)
+            # mark the queue as destroyed: it can still be weakly reachable (e.g. from
+            # the queue registry used by `synchronize_all_queues`), and synchronizing a
+            # destroyed handle crashes in the driver.
+            obj.handle = ze_command_queue_handle_t(C_NULL)
         end
         obj
     end
diff --git a/lib/mkl/fft.jl b/lib/mkl/fft.jl
index 5f5614b1..0ccdca46 100644
--- a/lib/mkl/fft.jl
+++ b/lib/mkl/fft.jl
@@ -95,16 +95,22 @@ function _create_descriptor(sz::NTuple{N,Int}, T::Type, complex::Bool) where {N}
     desc_ref = Ref{onemklDftDescriptor_t}()
     # Create descriptor for the full array dimensions
     lengths = collect(Int64, sz)
-    st = length(lengths) == 1 ? onemklDftCreate1D(desc_ref, prec, dom, lengths[1]) : onemklDftCreateND(desc_ref, prec, dom, length(lengths), pointer(lengths))
+    # NB: pass the arrays themselves to the ccall wrappers (NOT `pointer(...)`) so they
+    # are rooted for the duration of the call; passing a raw Ptr lets the GC collect the
+    # vector mid-call, which made MKL read garbage lengths/strides and fail `commit` with
+    # invalid_descriptor_exception or a SIGFPE, depending on heap reuse.
+    st = length(lengths) == 1 ? onemklDftCreate1D(desc_ref, prec, dom, lengths[1]) : onemklDftCreateND(desc_ref, prec, dom, length(lengths), lengths)
     st == 0 || error("onemkl DFT create failed (status $st)")
     desc = desc_ref[]
     # Do not program descriptor scaling; we'll perform inverse normalization manually.
     # Set placement explicitly based on plan type later
-    # Construct a SYCL queue from current Level Zero context/device (reuse global queue)
+    # Use the task-local cached SYCL queue wrapping the global Level Zero queue, like
+    # the other oneMKL wrappers do. Creating fresh syclContext/syclQueue objects per
+    # plan is unsound: once they become garbage their finalizers (syclQueueDestroy etc.)
+    # tear down SYCL runtime state for the still-in-use underlying queue, corrupting
+    # later DFT commits (SIGFPE) and crashing at process exit.
     ze_ctx = oneAPI.context(); ze_dev = oneAPI.device()
-    sycl_dev = SYCL.syclDevice(SYCL.syclPlatform(oneAPI.driver()), ze_dev)
-    sycl_ctx = SYCL.syclContext([sycl_dev], ze_ctx)
-    q = SYCL.syclQueue(sycl_ctx, sycl_dev, oneAPI.global_queue(ze_ctx, ze_dev))
+    q = oneAPI.sycl_queue(oneAPI.global_queue(ze_ctx, ze_dev))
     return desc, q
 end
 
@@ -125,8 +131,8 @@ function plan_fft(X::oneAPI.oneArray{T,N}, region) where {T<:Union{ComplexF32,Co
             strides[i+1] = prod
             prod *= size(X,i)
         end
-        onemklDftSetValueInt64Array(desc, ONEMKL_DFT_PARAM_FWD_STRIDES, pointer(strides), length(strides))
-        onemklDftSetValueInt64Array(desc, ONEMKL_DFT_PARAM_BWD_STRIDES, pointer(strides), length(strides))
+        onemklDftSetValueInt64Array(desc, ONEMKL_DFT_PARAM_FWD_STRIDES, strides, length(strides))
+        onemklDftSetValueInt64Array(desc, ONEMKL_DFT_PARAM_BWD_STRIDES, strides, length(strides))
     end
     stc = onemklDftCommit(desc, q); stc == 0 || error("commit failed ($stc)")
     return cMKLFFTPlan{T,MKLFFT_FORWARD,false,N,R,Nothing}(desc,q,size(X),size(X),false,reg,nothing,nothing)
@@ -144,8 +150,8 @@ function plan_bfft(X::oneAPI.oneArray{T,N}, region) where {T<:Union{ComplexF32,C
         @inbounds for i in 1:N
             strides[i+1]=prod; prod*=size(X,i)
         end
-        onemklDftSetValueInt64Array(desc, ONEMKL_DFT_PARAM_FWD_STRIDES, pointer(strides), length(strides))
-        onemklDftSetValueInt64Array(desc, ONEMKL_DFT_PARAM_BWD_STRIDES, pointer(strides), length(strides))
+        onemklDftSetValueInt64Array(desc, ONEMKL_DFT_PARAM_FWD_STRIDES, strides, length(strides))
+        onemklDftSetValueInt64Array(desc, ONEMKL_DFT_PARAM_BWD_STRIDES, strides, length(strides))
     end
     stc = onemklDftCommit(desc, q); stc == 0 || error("commit failed ($stc)")
     return cMKLFFTPlan{T,MKLFFT_INVERSE,false,N,R,Nothing}(desc,q,size(X),size(X),false,reg,nothing,nothing)
@@ -165,8 +171,8 @@ function plan_fft!(X::oneAPI.oneArray{T,N}, region) where {T<:Union{ComplexF32,C
         @inbounds for i in 1:N
             strides[i+1]=prod; prod*=size(X,i)
         end
-        onemklDftSetValueInt64Array(desc, ONEMKL_DFT_PARAM_FWD_STRIDES, pointer(strides), length(strides))
-        onemklDftSetValueInt64Array(desc, ONEMKL_DFT_PARAM_BWD_STRIDES, pointer(strides), length(strides))
+        onemklDftSetValueInt64Array(desc, ONEMKL_DFT_PARAM_FWD_STRIDES, strides, length(strides))
+        onemklDftSetValueInt64Array(desc, ONEMKL_DFT_PARAM_BWD_STRIDES, strides, length(strides))
     end
     stc = onemklDftCommit(desc, q); stc == 0 || error("commit failed ($stc)")
     cMKLFFTPlan{T,MKLFFT_FORWARD,true,N,R,Nothing}(desc,q,size(X),size(X),false,reg,nothing,nothing)
@@ -184,8 +190,8 @@ function plan_bfft!(X::oneAPI.oneArray{T,N}, region) where {T<:Union{ComplexF32,
         @inbounds for i in 1:N
             strides[i+1]=prod; prod*=size(X,i)
         end
-        onemklDftSetValueInt64Array(desc, ONEMKL_DFT_PARAM_FWD_STRIDES, pointer(strides), length(strides))
-        onemklDftSetValueInt64Array(desc, ONEMKL_DFT_PARAM_BWD_STRIDES, pointer(strides), length(strides))
+        onemklDftSetValueInt64Array(desc, ONEMKL_DFT_PARAM_FWD_STRIDES, strides, length(strides))
+        onemklDftSetValueInt64Array(desc, ONEMKL_DFT_PARAM_BWD_STRIDES, strides, length(strides))
     end
     stc = onemklDftCommit(desc, q); stc == 0 || error("commit failed ($stc)")
     cMKLFFTPlan{T,MKLFFT_INVERSE,true,N,R,Nothing}(desc,q,size(X),size(X),false,reg,nothing,nothing)
diff --git a/src/context.jl b/src/context.jl
index 14f1e593..f340c4fa 100644
--- a/src/context.jl
+++ b/src/context.jl
@@ -261,19 +261,22 @@ function synchronize_all_queues(ctx::ZeContext, dev::Union{ZeDevice,Nothing})
                 filter!(refs) do ref
                     queue = ref.value
                     queue === nothing && return false
+                    queue.handle == C_NULL && return false  # finalized, handle destroyed
                     push!(queues, queue)
                     true
                 end
             end
         end
+        # synchronize outside the lock: this can block for as long as a kernel runs,
+        # and finalizers running concurrently also need to take the lock. Keep
+        # finalizers disabled so none of the collected queues can be destroyed
+        # between collection and synchronization.
+        for queue in queues
+            oneL0.synchronize(queue)
+        end
     finally
         GC.enable_finalizers(true)
     end
-    # synchronize outside the lock: this can block for as long as a kernel runs,
-    # and finalizers running concurrently also need to take the lock.
-    for queue in queues
-        oneL0.synchronize(queue)
-    end
     return
 end
 

From 561628575ca3ea8b6c17c20d82d2a1047f7958a1 Mon Sep 17 00:00:00 2001
From: Michel Schanen <mschanen@anl.gov>
Date: Fri, 12 Jun 2026 13:46:11 +0000
Subject: [PATCH 07/12] Make host USM resident and harden queue lifetime on the
 LTS stack

Two distinct GPU pagefault crashes on the Aurora LTS NEO stack (25.18), both
surfacing as a banned context -> ZE_RESULT_ERROR_UNKNOWN at an innocent later op:

1. Host-buffer residency (the array-test crash). HostBuffer-backed oneArrays were
   never made resident on the device, unlike DeviceBuffer/SharedBuffer. A kernel
   reading a non-resident host buffer intermittently takes a NotPresent pagefault
   under GC/alloc churn. Isolated with a minimal read/noread/readsync loop: the
   fault is at the GPU *read*, not the free, and is not curable by any
   synchronization. Fix: make_resident in allocate(::HostBuffer) (src/pool.jl) and
   for fill!'s host pattern buffer (src/array.jl).

2. Device-buffer free under a dead task's queue. global_queue is task-local, so a
   test file's task can die with work still in flight; the queue was destroyed
   without draining and a WeakRef registry hid it from the pre-free synchronize,
   so a later free raced the in-flight kernel. Fix: queue finalizer drains before
   destroy (lib/level-zero/cmdqueue.jl); the queue registry holds strong queue
   refs keyed by a weak owning-task ref and retires drained dead-task queues
   (src/context.jl).

Validated with single-process reproducers and the full array test file (30/30).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 lib/level-zero/cmdqueue.jl |  6 ++++++
 src/array.jl               |  7 +++++--
 src/context.jl             | 40 ++++++++++++++++++++++++++++++--------
 src/pool.jl                | 10 +++++++++-
 4 files changed, 52 insertions(+), 11 deletions(-)

diff --git a/lib/level-zero/cmdqueue.jl b/lib/level-zero/cmdqueue.jl
index e123038a..bdefeb44 100644
--- a/lib/level-zero/cmdqueue.jl
+++ b/lib/level-zero/cmdqueue.jl
@@ -20,6 +20,12 @@ mutable struct ZeCommandQueue
         zeCommandQueueCreate(ctx, dev, desc_ref, handle_ref)
         obj = new(handle_ref[], ctx, dev, ordinal)
         finalizer(obj) do obj
+            # the queue may still have work in flight (nothing requires a task to
+            # synchronize before dying), and zeCommandQueueDestroy does not wait for
+            # it: on the LTS NEO stack the still-running work then faults as soon as
+            # a referenced allocation is freed, getting the context banned. drain the
+            # queue first; unchecked, as sync on a banned context returns an error.
+            unchecked_zeCommandQueueSynchronize(obj, typemax(UInt64))
             zeCommandQueueDestroy(obj)
             # mark the queue as destroyed: it can still be weakly reachable (e.g. from
             # the queue registry used by `synchronize_all_queues`), and synchronizing a
diff --git a/src/array.jl b/src/array.jl
index 3a8ffddb..0e79dde5 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -526,9 +526,12 @@ function Base.fill!(A::oneDenseArray{T}, val) where T
   val = convert(T, val)
   sizeof(T) == 0 && return A
 
-  # execute! is async, so we need to allocate the pattern in USM memory
-  # and keep it alive until the operation completes.
+  # execute! is async, so we need to allocate the pattern in USM memory and keep it alive
+  # until the operation completes. The fill reads this host buffer on the GPU, so it must
+  # be made resident on the device — a non-resident host buffer read by a kernel can take
+  # a NotPresent pagefault on the LTS NEO stack (see `allocate(::HostBuffer, ...)`).
   buf = oneL0.host_alloc(context(A), sizeof(T), Base.datatype_alignment(T))
+  oneL0.make_resident(context(A), device(), buf)
   unsafe_store!(convert(Ptr{T}, buf), val)
   unsafe_fill!(context(A), device(), pointer(A), convert(ZePtr{T}, buf), length(A))
   synchronize(global_queue(context(A), device()))
diff --git a/src/context.jl b/src/context.jl
index f340c4fa..9150b367 100644
--- a/src/context.jl
+++ b/src/context.jl
@@ -231,7 +231,8 @@ function global_queue(ctx::ZeContext, dev::ZeDevice)
         GC.enable_finalizers(false)
         try
             @lock queue_registry_lock begin
-                push!(get!(Vector{WeakRef}, queue_registry, (ctx, dev)), WeakRef(queue))
+                push!(get!(Vector{Tuple{WeakRef,ZeCommandQueue}}, queue_registry, (ctx, dev)),
+                      (WeakRef(current_task()), queue))
             end
         finally
             GC.enable_finalizers(true)
@@ -244,26 +245,37 @@ end
 # freed from any task (GC finalizers), so `release` needs to be able to find the queues
 # that may still have work in flight referencing the buffer; queues themselves are
 # cached task-locally and would otherwise be unreachable from the finalizing task.
+#
+# Entries reference the queue *strongly*: the GC clears WeakRefs to a dead queue in the
+# same cycle that queues its finalizer, i.e., before the finalizer runs, so a WeakRef
+# would hide the queue from `release` exactly when its in-flight work still references
+# buffers about to be freed. The owning task is tracked weakly instead: queues are
+# task-local, so once their task is dead no new work can reach them, and the entry can
+# be dropped (allowing the queue to be finalized) after a final synchronize.
 const queue_registry_lock = ReentrantLock()
-const queue_registry = Dict{Tuple{ZeContext,ZeDevice},Vector{WeakRef}}()
+const queue_registry = Dict{Tuple{ZeContext,ZeDevice},Vector{Tuple{WeakRef,ZeCommandQueue}}}()
 
 # synchronize all known queues that target the given context (and device, if specified),
 # i.e., all queues whose in-flight work could possibly reference an allocation that is
 # about to be freed.
 function synchronize_all_queues(ctx::ZeContext, dev::Union{ZeDevice,Nothing})
     queues = ZeCommandQueue[]
+    stale = Tuple{WeakRef,ZeCommandQueue}[]
     GC.enable_finalizers(false)
     try
         @lock queue_registry_lock begin
-            for ((qctx, qdev), refs) in queue_registry
+            for ((qctx, qdev), entries) in queue_registry
                 qctx == ctx || continue
                 (dev === nothing || qdev == dev) || continue
-                filter!(refs) do ref
-                    queue = ref.value
-                    queue === nothing && return false
-                    queue.handle == C_NULL && return false  # finalized, handle destroyed
+                for entry in entries
+                    (task, queue) = entry
+                    queue.handle == C_NULL && continue  # finalized, handle destroyed
                     push!(queues, queue)
-                    true
+                    # entries whose task was already dead at this point cannot
+                    # receive new work, so they are safe to retire after the sync
+                    if task.value === nothing || istaskdone(task.value::Task)
+                        push!(stale, entry)
+                    end
                 end
             end
         end
@@ -274,6 +286,18 @@ function synchronize_all_queues(ctx::ZeContext, dev::Union{ZeDevice,Nothing})
         for queue in queues
             oneL0.synchronize(queue)
         end
+        # retire drained queues of dead tasks, allowing them to be finalized (the
+        # finalizer synchronizes once more before destroying the queue, in case
+        # the queue is dropped through other means).
+        if !isempty(stale)
+            @lock queue_registry_lock begin
+                for ((qctx, qdev), entries) in queue_registry
+                    qctx == ctx || continue
+                    (dev === nothing || qdev == dev) || continue
+                    filter!(entry -> !any(s -> s === entry, stale), entries)
+                end
+            end
+        end
     finally
         GC.enable_finalizers(true)
     end
diff --git a/src/pool.jl b/src/pool.jl
index e541d26e..21f1a822 100644
--- a/src/pool.jl
+++ b/src/pool.jl
@@ -55,7 +55,15 @@ end
 
 function allocate(::Type{oneL0.HostBuffer}, ctx, dev, bytes::Int, alignment::Int)
     bytes == 0 && return oneL0.HostBuffer(ZE_NULL, bytes, ctx)
-    host_alloc(ctx, bytes, alignment)
+    buf = host_alloc(ctx, bytes, alignment)
+    # Host USM must be made resident on the device, exactly like device/shared
+    # allocations. On the Aurora LTS NEO stack (25.18), a GPU kernel that reads a
+    # non-resident host buffer intermittently takes a NotPresent pagefault (banning the
+    # context), even though host USM is nominally accessible — see `repro_host_minimal.jl`
+    # (a kernel reading a host-backed array under GC churn faults; the same pattern with
+    # device/shared buffers, which are already made resident, does not).
+    make_resident(ctx, dev, buf)
+    return buf
 end
 
 function release(buf::oneL0.AbstractBuffer)

From 3fca61160740902fe0f0b8c48250346c4a9079c4 Mon Sep 17 00:00:00 2001
From: Michel Schanen <mschanen@anl.gov>
Date: Fri, 12 Jun 2026 14:00:34 +0000
Subject: [PATCH 08/12] Add opt-in per-submission synchronize for the LTS
 dropped-tail workaround

Under heavy multi-process oversubscription of a single tile on the Aurora LTS NEO
stack, a whole-queue zeCommandQueueSynchronize does not reliably retire the tail of
an earlier separately-submitted command list, producing silent "dropped tail"
corruption (e.g. the gpuarrays/broadcasting `A .* ET(10)` mismatch; see
ISSUE_dropped_tail.md). Synchronizing after every submission eliminates it.

This is off by default (it costs ~3x throughput) and enabled with
ONEAPI_SYNC_EACH_SUBMISSION=1, at the single submission chokepoint
execute!(queue, lists, fence).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 lib/level-zero/cmdlist.jl | 17 +++++++++++++++--
 lib/level-zero/oneL0.jl   |  1 +
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/lib/level-zero/cmdlist.jl b/lib/level-zero/cmdlist.jl
index 133d2551..24d1d3a5 100644
--- a/lib/level-zero/cmdlist.jl
+++ b/lib/level-zero/cmdlist.jl
@@ -47,8 +47,21 @@ function ZeCommandList(f::Base.Callable, args...; kwargs...)
     return list
 end
 
-execute!(queue::ZeCommandQueue, lists::Vector{ZeCommandList}, fence=nothing) =
-    zeCommandQueueExecuteCommandLists(queue, length(lists), lists, something(fence, C_NULL))
+# Opt-in workaround for the Aurora LTS NEO stack (set ONEAPI_SYNC_EACH_SUBMISSION=1).
+# Under heavy multi-process oversubscription of a single tile, a whole-queue
+# `zeCommandQueueSynchronize` does not reliably retire the tail of an earlier,
+# separately-submitted command list — producing silent "dropped tail" corruption (the
+# last work-item of a kernel / last element of a copy is missing). See
+# ISSUE_dropped_tail.md. Synchronizing after *every* submission eliminates it, at a large
+# throughput cost (~3x), so it is off by default and only enabled when correctness under
+# oversubscription matters more than speed.
+const sync_each_submission = Ref{Bool}(false)
+
+function execute!(queue::ZeCommandQueue, lists::Vector{ZeCommandList}, fence=nothing)
+    r = zeCommandQueueExecuteCommandLists(queue, length(lists), lists, something(fence, C_NULL))
+    sync_each_submission[] && synchronize(queue)
+    return r
+end
 
 """
     execute!(queue::ZeCommandQueue, ...) do list
diff --git a/lib/level-zero/oneL0.jl b/lib/level-zero/oneL0.jl
index 6bcb17fb..e80c8939 100644
--- a/lib/level-zero/oneL0.jl
+++ b/lib/level-zero/oneL0.jl
@@ -195,6 +195,7 @@ function __init__()
 
     validation_layer[] = parse(Bool, get(ENV, "ZE_ENABLE_VALIDATION_LAYER", "false"))
     parameter_validation[] = parse(Bool, get(ENV, "ZE_ENABLE_PARAMETER_VALIDATION", "false"))
+    sync_each_submission[] = lowercase(get(ENV, "ONEAPI_SYNC_EACH_SUBMISSION", "")) in ("1", "true", "yes")
 end
 
 end

From 991d29e0f3b19e5ecc9bb49c794d194d127782dd Mon Sep 17 00:00:00 2001
From: Michel Schanen <mschanen@anl.gov>
Date: Fri, 12 Jun 2026 14:23:18 +0000
Subject: [PATCH 09/12] Add opt-in GPU spreading for the parallel test suite
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ONEAPI_TEST_SPREAD_GPUS=1 pins each test worker process to a distinct GPU via
ZE_AFFINITY_MASK (claimed round-robin through an atomic mkdir counter, set before
`using oneAPI` so the Level Zero driver picks it up at init). This spreads the suite
across all tiles instead of oversubscribing device 0.

device() is task-local and Malt runs each test in a fresh task, so a device! in
init_worker_code would not stick — process-level pinning is the robust approach.

Default (unset) keeps every worker on the first device, preserving single-tile
oversubscription which is useful for surfacing contention bugs.

Verified: 6 concurrent claimers -> 6 distinct device UUIDs; real harness with
--jobs=4 spreads cleanly (SUCCESS).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 test/runtests.jl | 40 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 39 insertions(+), 1 deletion(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index b7015787..27b97776 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -24,7 +24,45 @@ end
 
 args = parse_args(ARGS)
 
+# Optional: spread test workers across all available GPUs, one worker per device
+# (round-robin), by pinning each worker *process* to a device with ZE_AFFINITY_MASK.
+# `device()` is task-local and Malt runs each test in a fresh task, so a `device!` in
+# `init_worker_code` would not stick — pinning the process via the driver is the robust
+# way to make every task on a worker use the same GPU.
+#
+# Enabled with ONEAPI_TEST_SPREAD_GPUS=1. When unset (the default) every worker stays on
+# the first device, which oversubscribes a single tile — useful for surfacing
+# contention/oversubscription bugs.
+const spread_gpus = lowercase(get(ENV, "ONEAPI_TEST_SPREAD_GPUS", "")) in ("1", "true", "yes")
+worker_env = Vector{Pair{String, String}}()
+device_claim_code = :()
+if spread_gpus
+    ndev = length(oneAPI.devices())
+    # shared, node-local directory used as an atomic round-robin counter (mkdir is atomic)
+    devdir = mktempdir(; prefix = "oneapi_test_gpus_")
+    push!(worker_env, "ONEAPI_TEST_DEVDIR" => devdir)
+    push!(worker_env, "ONEAPI_TEST_NDEV" => string(ndev))
+    @info "Spreading test workers across $ndev GPU(s) via ZE_AFFINITY_MASK (ONEAPI_TEST_SPREAD_GPUS=1)"
+    # NOTE: runs on the worker as the very first thing, before `using oneAPI` — so the
+    # Level Zero driver picks up ZE_AFFINITY_MASK at init and the process sees only its tile.
+    device_claim_code = quote
+        let dir = ENV["ONEAPI_TEST_DEVDIR"], ndev = parse(Int, ENV["ONEAPI_TEST_NDEV"])
+            i = 0
+            while true
+                try
+                    mkdir(joinpath(dir, string(i)))
+                    break
+                catch
+                    i += 1
+                end
+            end
+            ENV["ZE_AFFINITY_MASK"] = string(i % ndev)
+        end
+    end
+end
+
 init_worker_code = quote
+    $device_claim_code
     using oneAPI, Adapt
 
     import GPUArrays
@@ -105,4 +143,4 @@ init_code = quote
            ..@grab_output, ..@on_device, ..sink
 end
 
-runtests(oneAPI, args; testsuite, init_code, init_worker_code)
+runtests(oneAPI, args; testsuite, init_code, init_worker_code, env = worker_env)

From d78ef78fa52237952c5d005224bf93fc179f0db4 Mon Sep 17 00:00:00 2001
From: Michel Schanen <mschanen@anl.gov>
Date: Fri, 12 Jun 2026 14:43:25 +0000
Subject: [PATCH 10/12] Opt into sync each sub on GH self-runner

---
 .github/workflows/ci.yml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 5bd87406..d4d9f8dd 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -15,6 +15,12 @@ jobs:
     continue-on-error: true
     env:
       GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      # Pin each parallel test worker to a distinct GPU tile instead of
+      # oversubscribing device 0 (see test/runtests.jl).
+      ONEAPI_TEST_SPREAD_GPUS: '1'
+      # Synchronize after every command-list submission to work around the
+      # Aurora LTS NEO dropped-tail corruption (see lib/level-zero/cmdlist.jl).
+      ONEAPI_SYNC_EACH_SUBMISSION: '1'
     runs-on: [self-hosted, linux, X64]
     strategy:
       matrix:

From 3229d8f5bc1352d90dacc0e02932895780a883ac Mon Sep 17 00:00:00 2001
From: Michel Schanen <mschanen@anl.gov>
Date: Tue, 16 Jun 2026 15:10:12 +0000
Subject: [PATCH 11/12] Fix LTS-stack silent corruption in reductions and
 workgroup barriers

Two distinct correctness bugs surfaced as intermittent KernelAbstractions
example failures (naive_transpose, histogram) on the Aurora LTS NEO/IGC stack:

- mapreducedim! over a strided/transposed input (e.g. `a == transpose(b)`,
  `sum(transpose(x))`, `isequal`, `ishermitian`) hit the LTS IGC miscompile of
  non-coalesced global loads, silently corrupting the result. Detect
  non-contiguous inputs and materialize them to a dense array before reducing,
  so every global read is coalesced. (src/mapreduce.jl)

- KA `@synchronize` lowered to `barrier(0)`, emitting OpControlBarrier with
  SequentiallyConsistent but WITHOUT the WorkgroupMemory storage-class bit,
  which orders no memory per the SPIR-V spec. On the LTS stack shared-local
  writes were not made visible across the barrier, dropping updates (the
  histogram example's local-atomic accumulation lost counts). Fence local+global
  memory in `__synchronize`, and local memory in the mapreduce reduce_group SLM
  tree. (src/oneAPIKernels.jl, src/mapreduce.jl)

Validated on the LTS stack: kernelabstractions 2218 pass / 0 fail; gpuarrays
reductions+statistics 2962/2962; histogram 0 failures over 1100 iters; strided
==/sum/isequal/ishermitian correct.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/mapreduce.jl     | 27 ++++++++++++++++++++++++++-
 src/oneAPIKernels.jl | 10 +++++++++-
 2 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index 28725c84..b2d28ef8 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -33,7 +33,12 @@
     # perform a reduction
     d = 1
     while d < items
-        barrier(0)
+        # LOCAL_MEM_FENCE: `barrier(0)` lowers to an OpControlBarrier without the
+        # WorkgroupMemory storage-class bit, which on the NEO/IGC LTS stack does not order the
+        # shared-local tree accesses across the barrier (proven by the histogram example's lost
+        # local-atomic updates). Fence local memory so each tree step sees the previous step's
+        # `shared[]` writes. See ISSUE_mapreduce_corruption.md.
+        barrier(SPIRVIntrinsics.LOCAL_MEM_FENCE)
         index = 2 * d * (item-1) + 1
         @inbounds if index <= items
             other_val = if index + d <= items
@@ -133,12 +138,32 @@ end
 
 ## COV_EXCL_STOP
 
+# Aurora LTS workaround: the NEO/IGC LTS stack miscompiles *strided* (non-coalesced) global
+# reads inside the reduction kernel, silently corrupting results whenever an input is read
+# along a non-contiguous axis (e.g. `a == transpose(b)`, `sum(transpose(x))`, `ishermitian`).
+# Elementwise copies are NOT affected. `_dense_reduce_input` returns false for any input that
+# reads non-contiguous memory (a transposed/permuted/strided view, or a broadcast containing
+# one), so such inputs get materialized to a dense `oneArray` before reducing. See
+# ISSUE_mapreduce_corruption.md and the `naive_transpose` (`a == transpose(b)`) failure.
+@inline _dense_reduce_input(::oneArray) = true
+@inline _dense_reduce_input(x::Base.ReshapedArray) = _dense_reduce_input(parent(x))
+@inline _dense_reduce_input(::AbstractArray) = false   # Transpose/Adjoint/PermutedDims/SubArray/…
+@inline _dense_reduce_input(::Any) = true              # scalars/Refs/tuples carried in a broadcast
+@inline _dense_reduce_input(bc::Broadcast.Broadcasted) = all(_dense_reduce_input, bc.args)
+
 function GPUArrays.mapreducedim!(f::F, op::OP, R::oneWrappedArray{T},
                                  A::Union{AbstractArray,Broadcast.Broadcasted};
                                  init=nothing) where {F, OP, T}
     Base.check_reducedims(R, A)
     length(A) == 0 && return R # isempty(::Broadcasted) iterates
 
+    # Aurora LTS workaround (see `_dense_reduce_input` above): materialize strided inputs to a
+    # dense array first so every global read in the reduction kernel is coalesced.
+    if !_dense_reduce_input(A)
+        Acontig = Broadcast.materialize(Broadcast.broadcasted(f, A))
+        return GPUArrays.mapreducedim!(identity, op, R, Acontig; init=init)
+    end
+
     # add singleton dimensions to the output container, if needed
     if ndims(R) < ndims(A)
         dims = Base.fill_to_length(size(R), 1, Val(ndims(A)))
diff --git a/src/oneAPIKernels.jl b/src/oneAPIKernels.jl
index 6e092397..138b4f8c 100644
--- a/src/oneAPIKernels.jl
+++ b/src/oneAPIKernels.jl
@@ -214,7 +214,15 @@ end
 ## Synchronization and Printing
 
 @device_override @inline function KA.__synchronize()
-    barrier(0)
+    # Fence both local and global memory across the workgroup barrier, matching CUDA
+    # `__syncthreads` semantics. `barrier(0)` lowers to `OpControlBarrier` with
+    # `SequentiallyConsistent` but WITHOUT the `WorkgroupMemory` storage-class bit, which the
+    # SPIR-V spec treats as ordering *no* memory — so on the NEO/IGC LTS stack shared-local
+    # writes (e.g. the histogram example's local-atomic accumulation) are not made visible
+    # before the next phase reads them, silently dropping updates. Passing
+    # `LOCAL_MEM_FENCE | GLOBAL_MEM_FENCE` ORs in the WorkgroupMemory/CrossWorkgroupMemory
+    # fence bits. See ISSUE_mapreduce_corruption.md (the barrier WorkgroupMemory-bit regression).
+    barrier(SPIRVIntrinsics.LOCAL_MEM_FENCE | SPIRVIntrinsics.GLOBAL_MEM_FENCE)
 end
 
 @device_override @inline function KA.__print(args...)

From 32d11d71ef7faf9e895d784af6ed7b3a8e283771 Mon Sep 17 00:00:00 2001
From: Michel Schanen <mschanen@anl.gov>
Date: Tue, 16 Jun 2026 16:10:12 +0000
Subject: [PATCH 12/12] CI: disable AVX512-FP16 host codegen on the Aurora LTS
 self-runner

Under concurrent oneMKL load on the Aurora Sapphire Rapids nodes, Julia's native
AVX512-FP16 host codegen silently miscomputes Float16 (e.g. the GPUArrays
`A .* B .+ c` broadcast reference used by gpuarrays/broadcasting), failing tests
even though the GPU result is correct. It is single-process-clean, MXCSR-clean,
and only the native-FP16 path is affected (the Float32 path is immune).

Run the test suite with `-C native,-avx512fp16`, which routes Float16 through
Float32 and propagates to the Pkg.test subprocess and its parallel Malt workers
via Base.julia_cmd(). `julia-actions/julia-runtest` cannot pass a cpu-target, so
invoke `Pkg.test()` directly. A/B under the reproducer: native FP16 -> ~6% of
checks corrupt; -avx512fp16 -> 0 / 35000.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/ci.yml | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index d4d9f8dd..d3c18c65 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -36,5 +36,13 @@ jobs:
       - uses: julia-actions/cache@v3
       - uses: julia-actions/julia-buildpkg@latest
         continue-on-error: true
-      - uses: julia-actions/julia-runtest@latest
+      # Disable AVX512-FP16 host codegen on the Aurora Sapphire Rapids nodes. Under concurrent
+      # oneMKL load the native AVX512-FP16 path silently miscomputes *host* Float16 (e.g. the
+      # GPUArrays `A .* B .+ c` broadcast reference), failing tests even though the GPU result
+      # is correct (single-process clean; MXCSR clean; only the native-FP16 path, not Float32).
+      # `-C native,-avx512fp16` routes Float16 through Float32 and propagates to the Pkg.test
+      # subprocess and its parallel workers via Base.julia_cmd(). `julia-runtest` cannot pass a
+      # cpu-target, so invoke Pkg.test() directly. See repro_bcast_mkl.jl.
+      - name: Run tests (AVX512-FP16 disabled)
         continue-on-error: true
+        run: julia -C "native,-avx512fp16" --color=yes --project=. -e 'import Pkg; Pkg.test()'