From 756d333a110db7e1c2a7b26daa6a3c25d78bb627 Mon Sep 17 00:00:00 2001 From: Michel Schanen Date: Tue, 5 May 2026 11:27:09 -0500 Subject: [PATCH 01/12] Move to LTS stack --- Project.toml | 12 +- deps/Project.toml | 6 +- deps/build_local.jl | 5 +- deps/generate_interfaces.jl | 26 ++- deps/src/onemkl.cpp | 200 ++++++++++++++---- deps/src/onemkl.h | 152 ++++++++++---- lib/level-zero/oneL0.jl | 6 +- lib/support/liboneapi_support.jl | 348 +++++++++++++++++++++++++------ res/Project.toml | 2 +- res/wrap.jl | 2 +- src/oneAPI.jl | 2 +- src/utils.jl | 2 +- test/Project.toml | 4 +- 13 files changed, 606 insertions(+), 161 deletions(-) diff --git a/Project.toml b/Project.toml index f84b9c27..c997a054 100644 --- a/Project.toml +++ b/Project.toml @@ -16,7 +16,7 @@ KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LLVM = "929cbde3-209d-540e-8aea-75f648917ca0" Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -NEO_jll = "700fe977-ac61-5f37-bbc8-c6c4b2b6a9fd" +NEO_LTS_jll = "a724f90f-ce79-56dd-a1bd-b9de5a61085f" Preferences = "21216c6a-2e73-6563-6e65-726566657250" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" @@ -26,8 +26,8 @@ SPIRV_Tools_jll = "6ac6d60f-d740-5983-97d7-a4482c0689f4" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" -oneAPI_Level_Zero_Headers_jll = "f4bc562b-d309-54f8-9efb-476e56f0410d" -oneAPI_Level_Zero_Loader_jll = "13eca655-d68d-5b81-8367-6d99d727ab01" +oneAPI_Level_Zero_Headers_LTS_jll = "d79c0b2e-896c-561b-aab9-323701ec0314" +oneAPI_Level_Zero_Loader_LTS_jll = "f6e5cbb4-ba2a-56dc-92a2-9d66f5656ccd" oneAPI_Support_jll = "b049733a-a71d-5ed3-8eba-7d323ac00b36" [compat] @@ -41,7 +41,7 @@ GPUCompiler = "1.6" GPUToolbox = "0.1, 0.2, 0.3, 1" KernelAbstractions = "0.9.39" LLVM = "6, 7, 8, 9" -NEO_jll = "=25.44.36015" +NEO_LTS_jll = "=25.18.33578" Preferences = "1" SPIRVIntrinsics = "0.5" SPIRV_LLVM_Translator_jll = "21" @@ -49,8 +49,8 @@ SPIRV_Tools_jll = "2025.4.0" SpecialFunctions = "1.3, 2" StaticArrays = "1" julia = "1.10" -oneAPI_Level_Zero_Loader_jll = "1.25" +oneAPI_Level_Zero_Loader_LTS_jll = "=1.24" oneAPI_Support_jll = "0.9.2" [extras] -libigc_jll = "94295238-5935-5bd7-bb0f-b00942e9bdd5" +libigc_LTS_jll = "9a8258a1-e827-5686-bee9-144461246960" diff --git a/deps/Project.toml b/deps/Project.toml index b8446dbe..8833df53 100644 --- a/deps/Project.toml +++ b/deps/Project.toml @@ -8,8 +8,8 @@ Ninja_jll = "76642167-d241-5cee-8c94-7a494e8cb7b7" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" Preferences = "21216c6a-2e73-6563-6e65-726566657250" Scratch = "6c6a2e73-6563-6170-7368-637461726353" -oneAPI_Level_Zero_Headers_jll = "f4bc562b-d309-54f8-9efb-476e56f0410d" -oneAPI_Support_Headers_jll = "24f86df5-245d-5634-a4cc-32433d9800b3" +oneAPI_Level_Zero_Headers_LTS_jll = "d79c0b2e-896c-561b-aab9-323701ec0314" +oneAPI_Support_Headers_LTS_jll = "0e9de0da-c0b6-5d6c-9871-5c996d414ca7" [compat] -oneAPI_Support_Headers_jll = "=2025.2.0" +oneAPI_Support_Headers_LTS_jll = "=2025.3.1" diff --git a/deps/build_local.jl b/deps/build_local.jl index f2d620e0..4d613732 100644 --- a/deps/build_local.jl +++ b/deps/build_local.jl @@ -20,7 +20,8 @@ if haskey(ENV, "BUILDKITE") run(`buildkite-agent annotate 'Using a locally-built support library; A bump of oneAPI_Support_jll is required before releasing this packages.' --style 'warning' --context 'ctx-deps'`) end -using Scratch, Preferences, CMake_jll, Ninja_jll, oneAPI_Level_Zero_Headers_jll +using Scratch, Preferences, CMake_jll, Ninja_jll +import oneAPI_Level_Zero_Headers_LTS_jll as oneAPI_Level_Zero_Headers_jll oneAPI = Base.UUID("8f75cd03-7ff8-4ecb-9b8f-daf728133b1b") @@ -62,7 +63,7 @@ if !isfile(joinpath(conda_dir, "condarc-julia.yml")) touch(joinpath(conda_dir, "conda-meta", "history")) end Conda.add_channel("https://software.repos.intel.com/python/conda/", conda_dir) -Conda.add(["dpcpp_linux-64=2025.2.0", "mkl-devel-dpcpp=2025.2.0"], conda_dir) +Conda.add(["dpcpp_linux-64=2025.3.1", "mkl-devel-dpcpp=2025.3.1"], conda_dir) Conda.list(conda_dir) diff --git a/deps/generate_interfaces.jl b/deps/generate_interfaces.jl index 108518db..8c62b757 100644 --- a/deps/generate_interfaces.jl +++ b/deps/generate_interfaces.jl @@ -1,4 +1,4 @@ -using oneAPI_Support_Headers_jll +import oneAPI_Support_Headers_LTS_jll as oneAPI_Support_Headers_jll include("generate_helpers.jl") @@ -337,12 +337,34 @@ function generate_headers(library::String, filename::Vector{String}, output::Str end end + # Dedup: when two signatures map to the same C function name (because MKL + # added an overload), keep the one with more parameters — typically the + # newer signature (e.g. set_csr_data gained an `nnz` arg in MKL 2025.3.1). + # Without this the generated onemkl.cpp has duplicate function definitions + # and won't compile. + _fn_name(h) = (pos = findfirst('(', h); strip(split(strip(h[1:pos-1]))[end])) + _param_cnt(h) = (pos = findfirst('(', h); ep = findnext(')', h, pos); count(==(','), h[pos+1:ep-1]) + 1) + keep_idx = Dict{String,Int}() + keep_pc = Dict{String,Int}() + for (i, sig) in enumerate(signatures) + (sig[2] in blacklist) && continue + fn = _fn_name(sig[1]) + pc = _param_cnt(sig[1]) + if !haskey(keep_idx, fn) || pc > keep_pc[fn] + keep_idx[fn] = i + keep_pc[fn] = pc + end + end + keep_set = Set(values(keep_idx)) + path_oneapi_headers = joinpath(@__DIR__, output) oneapi_headers = open(path_oneapi_headers, "w") - for (header, name_routine, version, type_routine, template) in signatures + for (i, (header, name_routine, version, type_routine, template)) in enumerate(signatures) # Blacklist (name_routine in blacklist) && continue + # Dedup + (i in keep_set) || continue # Pass scalars (e.g. alpha/beta inputs) as references instead of values for type in ("short", "float", "double", "float _Complex", "double _Complex") diff --git a/deps/src/onemkl.cpp b/deps/src/onemkl.cpp index 2b880327..f9479c2f 100644 --- a/deps/src/onemkl.cpp +++ b/deps/src/onemkl.cpp @@ -5471,142 +5471,270 @@ extern "C" int64_t onemklZunmqr_batch_scratchpad_size(syclQueue_t device_queue, } // SPARSE -extern "C" int onemklXsparse_init_matrix_handle(matrix_handle_t *p_spMat) { - oneapi::mkl::sparse::init_matrix_handle((oneapi::mkl::sparse::matrix_handle_t*) p_spMat); +extern "C" int onemklXsparse_init_matrix_handle(matrix_handle_t *p_spmat) { + oneapi::mkl::sparse::init_matrix_handle((oneapi::mkl::sparse::matrix_handle_t*) p_spmat); return 0; } -extern "C" int onemklXsparse_release_matrix_handle(syclQueue_t device_queue, matrix_handle_t *p_spMat) { +extern "C" int onemklXsparse_release_matrix_handle(syclQueue_t device_queue, matrix_handle_t *p_spmat) { try { - auto status = oneapi::mkl::sparse::release_matrix_handle(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t*) p_spMat, {}); + auto status = oneapi::mkl::sparse::release_matrix_handle(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t*) p_spmat, {}); device_queue->val.wait_and_throw(); } catch (const sycl::exception& e) { return -1; } return 0; } -extern "C" int onemklSsparse_set_csr_data(syclQueue_t device_queue, matrix_handle_t spMat, int32_t nrows, int32_t ncols, onemklIndex index, int32_t *row_ptr, int32_t *col_ind, float *values) { +extern "C" int onemklSsparse_set_csr_data(syclQueue_t device_queue, matrix_handle_t spmat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int32_t *row_ptr, int32_t *col_ind, float *values) { try { - auto status = oneapi::mkl::sparse::set_csr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, convert(index), row_ptr, col_ind, values, {}); + auto status = oneapi::mkl::sparse::set_csr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, nrows, ncols, nnz, convert(index), row_ptr, col_ind, values, {}); device_queue->val.wait_and_throw(); } catch (const sycl::exception& e) { return -1; } return 0; } -extern "C" int onemklSsparse_set_csr_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t nrows, int64_t ncols, onemklIndex index, int64_t *row_ptr, int64_t *col_ind, float *values) { +extern "C" int onemklSsparse_set_csr_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t *row_ptr, int64_t *col_ind, float *values) { try { - auto status = oneapi::mkl::sparse::set_csr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, convert(index), row_ptr, col_ind, values, {}); + auto status = oneapi::mkl::sparse::set_csr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, nrows, ncols, nnz, convert(index), row_ptr, col_ind, values, {}); device_queue->val.wait_and_throw(); } catch (const sycl::exception& e) { return -1; } return 0; } -extern "C" int onemklDsparse_set_csr_data(syclQueue_t device_queue, matrix_handle_t spMat, int32_t nrows, int32_t ncols, onemklIndex index, int32_t *row_ptr, int32_t *col_ind, double *values) { +extern "C" int onemklDsparse_set_csr_data(syclQueue_t device_queue, matrix_handle_t spmat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int32_t *row_ptr, int32_t *col_ind, double *values) { try { - auto status = oneapi::mkl::sparse::set_csr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, convert(index), row_ptr, col_ind, values, {}); + auto status = oneapi::mkl::sparse::set_csr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, nrows, ncols, nnz, convert(index), row_ptr, col_ind, values, {}); device_queue->val.wait_and_throw(); } catch (const sycl::exception& e) { return -1; } return 0; } -extern "C" int onemklDsparse_set_csr_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t nrows, int64_t ncols, onemklIndex index, int64_t *row_ptr, int64_t *col_ind, double *values) { +extern "C" int onemklDsparse_set_csr_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t *row_ptr, int64_t *col_ind, double *values) { try { - auto status = oneapi::mkl::sparse::set_csr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, convert(index), row_ptr, col_ind, values, {}); + auto status = oneapi::mkl::sparse::set_csr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, nrows, ncols, nnz, convert(index), row_ptr, col_ind, values, {}); device_queue->val.wait_and_throw(); } catch (const sycl::exception& e) { return -1; } return 0; } -extern "C" int onemklCsparse_set_csr_data(syclQueue_t device_queue, matrix_handle_t spMat, int32_t nrows, int32_t ncols, onemklIndex index, int32_t *row_ptr, int32_t *col_ind, float _Complex *values) { +extern "C" int onemklCsparse_set_csr_data(syclQueue_t device_queue, matrix_handle_t spmat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int32_t *row_ptr, int32_t *col_ind, float _Complex *values) { try { - auto status = oneapi::mkl::sparse::set_csr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, convert(index), row_ptr, col_ind, reinterpret_cast*>(values), {}); + auto status = oneapi::mkl::sparse::set_csr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, nrows, ncols, nnz, convert(index), row_ptr, col_ind, reinterpret_cast*>(values), {}); device_queue->val.wait_and_throw(); } catch (const sycl::exception& e) { return -1; } return 0; } -extern "C" int onemklCsparse_set_csr_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t nrows, int64_t ncols, onemklIndex index, int64_t *row_ptr, int64_t *col_ind, float _Complex *values) { +extern "C" int onemklCsparse_set_csr_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t *row_ptr, int64_t *col_ind, float _Complex *values) { try { - auto status = oneapi::mkl::sparse::set_csr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, convert(index), row_ptr, col_ind, reinterpret_cast*>(values), {}); + auto status = oneapi::mkl::sparse::set_csr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, nrows, ncols, nnz, convert(index), row_ptr, col_ind, reinterpret_cast*>(values), {}); device_queue->val.wait_and_throw(); } catch (const sycl::exception& e) { return -1; } return 0; } -extern "C" int onemklZsparse_set_csr_data(syclQueue_t device_queue, matrix_handle_t spMat, int32_t nrows, int32_t ncols, onemklIndex index, int32_t *row_ptr, int32_t *col_ind, double _Complex *values) { +extern "C" int onemklZsparse_set_csr_data(syclQueue_t device_queue, matrix_handle_t spmat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int32_t *row_ptr, int32_t *col_ind, double _Complex *values) { try { - auto status = oneapi::mkl::sparse::set_csr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, convert(index), row_ptr, col_ind, reinterpret_cast*>(values), {}); + auto status = oneapi::mkl::sparse::set_csr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, nrows, ncols, nnz, convert(index), row_ptr, col_ind, reinterpret_cast*>(values), {}); device_queue->val.wait_and_throw(); } catch (const sycl::exception& e) { return -1; } return 0; } -extern "C" int onemklZsparse_set_csr_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t nrows, int64_t ncols, onemklIndex index, int64_t *row_ptr, int64_t *col_ind, double _Complex *values) { +extern "C" int onemklZsparse_set_csr_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t *row_ptr, int64_t *col_ind, double _Complex *values) { try { - auto status = oneapi::mkl::sparse::set_csr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, convert(index), row_ptr, col_ind, reinterpret_cast*>(values), {}); + auto status = oneapi::mkl::sparse::set_csr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, nrows, ncols, nnz, convert(index), row_ptr, col_ind, reinterpret_cast*>(values), {}); device_queue->val.wait_and_throw(); } catch (const sycl::exception& e) { return -1; } return 0; } -extern "C" int onemklSsparse_set_coo_data(syclQueue_t device_queue, matrix_handle_t spMat, int32_t nrows, int32_t ncols, int32_t nnz, onemklIndex index, int32_t *row_ind, int32_t *col_ind, float *values) { +extern "C" int onemklSsparse_set_csc_data(syclQueue_t device_queue, matrix_handle_t spMat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int32_t *col_ptr, int32_t *row_ind, float *values) { try { - auto status = oneapi::mkl::sparse::set_coo_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, nnz, convert(index), row_ind, col_ind, values, {}); + auto status = oneapi::mkl::sparse::set_csc_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, nnz, convert(index), col_ptr, row_ind, values, {}); device_queue->val.wait_and_throw(); } catch (const sycl::exception& e) { return -1; } return 0; } -extern "C" int onemklSsparse_set_coo_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t *row_ind, int64_t *col_ind, float *values) { +extern "C" int onemklSsparse_set_csc_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t *col_ptr, int64_t *row_ind, float *values) { try { - auto status = oneapi::mkl::sparse::set_coo_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, nnz, convert(index), row_ind, col_ind, values, {}); + auto status = oneapi::mkl::sparse::set_csc_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, nnz, convert(index), col_ptr, row_ind, values, {}); device_queue->val.wait_and_throw(); } catch (const sycl::exception& e) { return -1; } return 0; } -extern "C" int onemklDsparse_set_coo_data(syclQueue_t device_queue, matrix_handle_t spMat, int32_t nrows, int32_t ncols, int32_t nnz, onemklIndex index, int32_t *row_ind, int32_t *col_ind, double *values) { +extern "C" int onemklDsparse_set_csc_data(syclQueue_t device_queue, matrix_handle_t spMat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int32_t *col_ptr, int32_t *row_ind, double *values) { try { - auto status = oneapi::mkl::sparse::set_coo_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, nnz, convert(index), row_ind, col_ind, values, {}); + auto status = oneapi::mkl::sparse::set_csc_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, nnz, convert(index), col_ptr, row_ind, values, {}); device_queue->val.wait_and_throw(); } catch (const sycl::exception& e) { return -1; } return 0; } -extern "C" int onemklDsparse_set_coo_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t *row_ind, int64_t *col_ind, double *values) { +extern "C" int onemklDsparse_set_csc_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t *col_ptr, int64_t *row_ind, double *values) { try { - auto status = oneapi::mkl::sparse::set_coo_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, nnz, convert(index), row_ind, col_ind, values, {}); + auto status = oneapi::mkl::sparse::set_csc_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, nnz, convert(index), col_ptr, row_ind, values, {}); device_queue->val.wait_and_throw(); } catch (const sycl::exception& e) { return -1; } return 0; } -extern "C" int onemklCsparse_set_coo_data(syclQueue_t device_queue, matrix_handle_t spMat, int32_t nrows, int32_t ncols, int32_t nnz, onemklIndex index, int32_t *row_ind, int32_t *col_ind, float _Complex *values) { +extern "C" int onemklCsparse_set_csc_data(syclQueue_t device_queue, matrix_handle_t spMat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int32_t *col_ptr, int32_t *row_ind, float _Complex *values) { try { - auto status = oneapi::mkl::sparse::set_coo_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, nnz, convert(index), row_ind, col_ind, reinterpret_cast*>(values), {}); + auto status = oneapi::mkl::sparse::set_csc_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, nnz, convert(index), col_ptr, row_ind, reinterpret_cast*>(values), {}); device_queue->val.wait_and_throw(); } catch (const sycl::exception& e) { return -1; } return 0; } -extern "C" int onemklCsparse_set_coo_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t *row_ind, int64_t *col_ind, float _Complex *values) { +extern "C" int onemklCsparse_set_csc_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t *col_ptr, int64_t *row_ind, float _Complex *values) { try { - auto status = oneapi::mkl::sparse::set_coo_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, nnz, convert(index), row_ind, col_ind, reinterpret_cast*>(values), {}); + auto status = oneapi::mkl::sparse::set_csc_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, nnz, convert(index), col_ptr, row_ind, reinterpret_cast*>(values), {}); device_queue->val.wait_and_throw(); } catch (const sycl::exception& e) { return -1; } return 0; } -extern "C" int onemklZsparse_set_coo_data(syclQueue_t device_queue, matrix_handle_t spMat, int32_t nrows, int32_t ncols, int32_t nnz, onemklIndex index, int32_t *row_ind, int32_t *col_ind, double _Complex *values) { +extern "C" int onemklZsparse_set_csc_data(syclQueue_t device_queue, matrix_handle_t spMat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int32_t *col_ptr, int32_t *row_ind, double _Complex *values) { try { - auto status = oneapi::mkl::sparse::set_coo_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, nnz, convert(index), row_ind, col_ind, reinterpret_cast*>(values), {}); + auto status = oneapi::mkl::sparse::set_csc_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, nnz, convert(index), col_ptr, row_ind, reinterpret_cast*>(values), {}); device_queue->val.wait_and_throw(); } catch (const sycl::exception& e) { return -1; } return 0; } -extern "C" int onemklZsparse_set_coo_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t *row_ind, int64_t *col_ind, double _Complex *values) { +extern "C" int onemklZsparse_set_csc_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t *col_ptr, int64_t *row_ind, double _Complex *values) { try { - auto status = oneapi::mkl::sparse::set_coo_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, nnz, convert(index), row_ind, col_ind, reinterpret_cast*>(values), {}); + auto status = oneapi::mkl::sparse::set_csc_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spMat, nrows, ncols, nnz, convert(index), col_ptr, row_ind, reinterpret_cast*>(values), {}); + device_queue->val.wait_and_throw(); + } catch (const sycl::exception& e) { return -1; } + return 0; +} + +extern "C" int onemklSsparse_set_coo_data(syclQueue_t device_queue, matrix_handle_t spmat, int32_t nrows, int32_t ncols, int32_t nnz, onemklIndex index, int32_t *row_ind, int32_t *col_ind, float *values) { + try { + auto status = oneapi::mkl::sparse::set_coo_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, nrows, ncols, nnz, convert(index), row_ind, col_ind, values, {}); + device_queue->val.wait_and_throw(); + } catch (const sycl::exception& e) { return -1; } + return 0; +} + +extern "C" int onemklSsparse_set_coo_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t *row_ind, int64_t *col_ind, float *values) { + try { + auto status = oneapi::mkl::sparse::set_coo_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, nrows, ncols, nnz, convert(index), row_ind, col_ind, values, {}); + device_queue->val.wait_and_throw(); + } catch (const sycl::exception& e) { return -1; } + return 0; +} + +extern "C" int onemklDsparse_set_coo_data(syclQueue_t device_queue, matrix_handle_t spmat, int32_t nrows, int32_t ncols, int32_t nnz, onemklIndex index, int32_t *row_ind, int32_t *col_ind, double *values) { + try { + auto status = oneapi::mkl::sparse::set_coo_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, nrows, ncols, nnz, convert(index), row_ind, col_ind, values, {}); + device_queue->val.wait_and_throw(); + } catch (const sycl::exception& e) { return -1; } + return 0; +} + +extern "C" int onemklDsparse_set_coo_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t *row_ind, int64_t *col_ind, double *values) { + try { + auto status = oneapi::mkl::sparse::set_coo_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, nrows, ncols, nnz, convert(index), row_ind, col_ind, values, {}); + device_queue->val.wait_and_throw(); + } catch (const sycl::exception& e) { return -1; } + return 0; +} + +extern "C" int onemklCsparse_set_coo_data(syclQueue_t device_queue, matrix_handle_t spmat, int32_t nrows, int32_t ncols, int32_t nnz, onemklIndex index, int32_t *row_ind, int32_t *col_ind, float _Complex *values) { + try { + auto status = oneapi::mkl::sparse::set_coo_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, nrows, ncols, nnz, convert(index), row_ind, col_ind, reinterpret_cast*>(values), {}); + device_queue->val.wait_and_throw(); + } catch (const sycl::exception& e) { return -1; } + return 0; +} + +extern "C" int onemklCsparse_set_coo_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t *row_ind, int64_t *col_ind, float _Complex *values) { + try { + auto status = oneapi::mkl::sparse::set_coo_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, nrows, ncols, nnz, convert(index), row_ind, col_ind, reinterpret_cast*>(values), {}); + device_queue->val.wait_and_throw(); + } catch (const sycl::exception& e) { return -1; } + return 0; +} + +extern "C" int onemklZsparse_set_coo_data(syclQueue_t device_queue, matrix_handle_t spmat, int32_t nrows, int32_t ncols, int32_t nnz, onemklIndex index, int32_t *row_ind, int32_t *col_ind, double _Complex *values) { + try { + auto status = oneapi::mkl::sparse::set_coo_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, nrows, ncols, nnz, convert(index), row_ind, col_ind, reinterpret_cast*>(values), {}); + device_queue->val.wait_and_throw(); + } catch (const sycl::exception& e) { return -1; } + return 0; +} + +extern "C" int onemklZsparse_set_coo_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t *row_ind, int64_t *col_ind, double _Complex *values) { + try { + auto status = oneapi::mkl::sparse::set_coo_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, nrows, ncols, nnz, convert(index), row_ind, col_ind, reinterpret_cast*>(values), {}); + device_queue->val.wait_and_throw(); + } catch (const sycl::exception& e) { return -1; } + return 0; +} + +extern "C" int onemklSsparse_set_bsr_data(syclQueue_t device_queue, matrix_handle_t spmat, int64_t blk_nrows, int64_t blk_ncols, int64_t blk_nnz, int64_t row_blk_size, int64_t col_blk_size, onemklLayout blk_layout, onemklIndex index, int32_t *bsr_row_ptr, int32_t *bsr_col_ind, float *bsr_values) { + try { + auto status = oneapi::mkl::sparse::set_bsr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, blk_nrows, blk_ncols, blk_nnz, row_blk_size, col_blk_size, convert(blk_layout), convert(index), bsr_row_ptr, bsr_col_ind, bsr_values, {}); + device_queue->val.wait_and_throw(); + } catch (const sycl::exception& e) { return -1; } + return 0; +} + +extern "C" int onemklSsparse_set_bsr_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t blk_nrows, int64_t blk_ncols, int64_t blk_nnz, int64_t row_blk_size, int64_t col_blk_size, onemklLayout blk_layout, onemklIndex index, int64_t *bsr_row_ptr, int64_t *bsr_col_ind, float *bsr_values) { + try { + auto status = oneapi::mkl::sparse::set_bsr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, blk_nrows, blk_ncols, blk_nnz, row_blk_size, col_blk_size, convert(blk_layout), convert(index), bsr_row_ptr, bsr_col_ind, bsr_values, {}); + device_queue->val.wait_and_throw(); + } catch (const sycl::exception& e) { return -1; } + return 0; +} + +extern "C" int onemklDsparse_set_bsr_data(syclQueue_t device_queue, matrix_handle_t spmat, int64_t blk_nrows, int64_t blk_ncols, int64_t blk_nnz, int64_t row_blk_size, int64_t col_blk_size, onemklLayout blk_layout, onemklIndex index, int32_t *bsr_row_ptr, int32_t *bsr_col_ind, double *bsr_values) { + try { + auto status = oneapi::mkl::sparse::set_bsr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, blk_nrows, blk_ncols, blk_nnz, row_blk_size, col_blk_size, convert(blk_layout), convert(index), bsr_row_ptr, bsr_col_ind, bsr_values, {}); + device_queue->val.wait_and_throw(); + } catch (const sycl::exception& e) { return -1; } + return 0; +} + +extern "C" int onemklDsparse_set_bsr_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t blk_nrows, int64_t blk_ncols, int64_t blk_nnz, int64_t row_blk_size, int64_t col_blk_size, onemklLayout blk_layout, onemklIndex index, int64_t *bsr_row_ptr, int64_t *bsr_col_ind, double *bsr_values) { + try { + auto status = oneapi::mkl::sparse::set_bsr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, blk_nrows, blk_ncols, blk_nnz, row_blk_size, col_blk_size, convert(blk_layout), convert(index), bsr_row_ptr, bsr_col_ind, bsr_values, {}); + device_queue->val.wait_and_throw(); + } catch (const sycl::exception& e) { return -1; } + return 0; +} + +extern "C" int onemklCsparse_set_bsr_data(syclQueue_t device_queue, matrix_handle_t spmat, int64_t blk_nrows, int64_t blk_ncols, int64_t blk_nnz, int64_t row_blk_size, int64_t col_blk_size, onemklLayout blk_layout, onemklIndex index, int32_t *bsr_row_ptr, int32_t *bsr_col_ind, float _Complex *bsr_values) { + try { + auto status = oneapi::mkl::sparse::set_bsr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, blk_nrows, blk_ncols, blk_nnz, row_blk_size, col_blk_size, convert(blk_layout), convert(index), bsr_row_ptr, bsr_col_ind, reinterpret_cast*>(bsr_values), {}); + device_queue->val.wait_and_throw(); + } catch (const sycl::exception& e) { return -1; } + return 0; +} + +extern "C" int onemklCsparse_set_bsr_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t blk_nrows, int64_t blk_ncols, int64_t blk_nnz, int64_t row_blk_size, int64_t col_blk_size, onemklLayout blk_layout, onemklIndex index, int64_t *bsr_row_ptr, int64_t *bsr_col_ind, float _Complex *bsr_values) { + try { + auto status = oneapi::mkl::sparse::set_bsr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, blk_nrows, blk_ncols, blk_nnz, row_blk_size, col_blk_size, convert(blk_layout), convert(index), bsr_row_ptr, bsr_col_ind, reinterpret_cast*>(bsr_values), {}); + device_queue->val.wait_and_throw(); + } catch (const sycl::exception& e) { return -1; } + return 0; +} + +extern "C" int onemklZsparse_set_bsr_data(syclQueue_t device_queue, matrix_handle_t spmat, int64_t blk_nrows, int64_t blk_ncols, int64_t blk_nnz, int64_t row_blk_size, int64_t col_blk_size, onemklLayout blk_layout, onemklIndex index, int32_t *bsr_row_ptr, int32_t *bsr_col_ind, double _Complex *bsr_values) { + try { + auto status = oneapi::mkl::sparse::set_bsr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, blk_nrows, blk_ncols, blk_nnz, row_blk_size, col_blk_size, convert(blk_layout), convert(index), bsr_row_ptr, bsr_col_ind, reinterpret_cast*>(bsr_values), {}); + device_queue->val.wait_and_throw(); + } catch (const sycl::exception& e) { return -1; } + return 0; +} + +extern "C" int onemklZsparse_set_bsr_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t blk_nrows, int64_t blk_ncols, int64_t blk_nnz, int64_t row_blk_size, int64_t col_blk_size, onemklLayout blk_layout, onemklIndex index, int64_t *bsr_row_ptr, int64_t *bsr_col_ind, double _Complex *bsr_values) { + try { + auto status = oneapi::mkl::sparse::set_bsr_data(device_queue->val, (oneapi::mkl::sparse::matrix_handle_t) spmat, blk_nrows, blk_ncols, blk_nnz, row_blk_size, col_blk_size, convert(blk_layout), convert(index), bsr_row_ptr, bsr_col_ind, reinterpret_cast*>(bsr_values), {}); device_queue->val.wait_and_throw(); } catch (const sycl::exception& e) { return -1; } return 0; diff --git a/deps/src/onemkl.h b/deps/src/onemkl.h index 0029fb2e..4d1caa08 100644 --- a/deps/src/onemkl.h +++ b/deps/src/onemkl.h @@ -141,8 +141,8 @@ struct omatadd_descr; typedef struct omatadd_descr *omatadd_descr_t; const int64_t ONEMKL_VERSION_MAJOR = 2025; -const int64_t ONEMKL_VERSION_MINOR = 2; -const int64_t ONEMKL_VERSION_PATCH = 0; +const int64_t ONEMKL_VERSION_MINOR = 3; +const int64_t ONEMKL_VERSION_PATCH = 1; void onemkl_version(int64_t *major, int64_t *minor, int64_t *patch); int onemklHgemm_batch(syclQueue_t device_queue, onemklTranspose transa, @@ -2737,74 +2737,154 @@ int64_t onemklZunmqr_batch_scratchpad_size(syclQueue_t device_queue, onemklSide* group_count, int64_t* group_sizes); // SPARSE -int onemklXsparse_init_matrix_handle(matrix_handle_t *p_spMat); +int onemklXsparse_init_matrix_handle(matrix_handle_t *p_spmat); -int onemklXsparse_release_matrix_handle(syclQueue_t device_queue, matrix_handle_t *p_spMat); +int onemklXsparse_release_matrix_handle(syclQueue_t device_queue, matrix_handle_t *p_spmat); -int onemklSsparse_set_csr_data(syclQueue_t device_queue, matrix_handle_t spMat, int32_t nrows, - int32_t ncols, onemklIndex index, int32_t *row_ptr, int32_t - *col_ind, float *values); +int onemklSsparse_set_csr_data(syclQueue_t device_queue, matrix_handle_t spmat, int64_t nrows, + int64_t ncols, int64_t nnz, onemklIndex index, int32_t *row_ptr, + int32_t *col_ind, float *values); -int onemklSsparse_set_csr_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t - nrows, int64_t ncols, onemklIndex index, int64_t *row_ptr, - int64_t *col_ind, float *values); +int onemklSsparse_set_csr_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t + nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t + *row_ptr, int64_t *col_ind, float *values); -int onemklDsparse_set_csr_data(syclQueue_t device_queue, matrix_handle_t spMat, int32_t nrows, - int32_t ncols, onemklIndex index, int32_t *row_ptr, int32_t - *col_ind, double *values); +int onemklDsparse_set_csr_data(syclQueue_t device_queue, matrix_handle_t spmat, int64_t nrows, + int64_t ncols, int64_t nnz, onemklIndex index, int32_t *row_ptr, + int32_t *col_ind, double *values); -int onemklDsparse_set_csr_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t - nrows, int64_t ncols, onemklIndex index, int64_t *row_ptr, - int64_t *col_ind, double *values); +int onemklDsparse_set_csr_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t + nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t + *row_ptr, int64_t *col_ind, double *values); -int onemklCsparse_set_csr_data(syclQueue_t device_queue, matrix_handle_t spMat, int32_t nrows, - int32_t ncols, onemklIndex index, int32_t *row_ptr, int32_t - *col_ind, float _Complex *values); +int onemklCsparse_set_csr_data(syclQueue_t device_queue, matrix_handle_t spmat, int64_t nrows, + int64_t ncols, int64_t nnz, onemklIndex index, int32_t *row_ptr, + int32_t *col_ind, float _Complex *values); -int onemklCsparse_set_csr_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t - nrows, int64_t ncols, onemklIndex index, int64_t *row_ptr, - int64_t *col_ind, float _Complex *values); +int onemklCsparse_set_csr_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t + nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t + *row_ptr, int64_t *col_ind, float _Complex *values); -int onemklZsparse_set_csr_data(syclQueue_t device_queue, matrix_handle_t spMat, int32_t nrows, - int32_t ncols, onemklIndex index, int32_t *row_ptr, int32_t - *col_ind, double _Complex *values); +int onemklZsparse_set_csr_data(syclQueue_t device_queue, matrix_handle_t spmat, int64_t nrows, + int64_t ncols, int64_t nnz, onemklIndex index, int32_t *row_ptr, + int32_t *col_ind, double _Complex *values); + +int onemklZsparse_set_csr_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t + nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t + *row_ptr, int64_t *col_ind, double _Complex *values); -int onemklZsparse_set_csr_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t - nrows, int64_t ncols, onemklIndex index, int64_t *row_ptr, - int64_t *col_ind, double _Complex *values); +int onemklSsparse_set_csc_data(syclQueue_t device_queue, matrix_handle_t spMat, int64_t nrows, + int64_t ncols, int64_t nnz, onemklIndex index, int32_t *col_ptr, + int32_t *row_ind, float *values); + +int onemklSsparse_set_csc_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t + nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t + *col_ptr, int64_t *row_ind, float *values); -int onemklSsparse_set_coo_data(syclQueue_t device_queue, matrix_handle_t spMat, int32_t nrows, +int onemklDsparse_set_csc_data(syclQueue_t device_queue, matrix_handle_t spMat, int64_t nrows, + int64_t ncols, int64_t nnz, onemklIndex index, int32_t *col_ptr, + int32_t *row_ind, double *values); + +int onemklDsparse_set_csc_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t + nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t + *col_ptr, int64_t *row_ind, double *values); + +int onemklCsparse_set_csc_data(syclQueue_t device_queue, matrix_handle_t spMat, int64_t nrows, + int64_t ncols, int64_t nnz, onemklIndex index, int32_t *col_ptr, + int32_t *row_ind, float _Complex *values); + +int onemklCsparse_set_csc_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t + nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t + *col_ptr, int64_t *row_ind, float _Complex *values); + +int onemklZsparse_set_csc_data(syclQueue_t device_queue, matrix_handle_t spMat, int64_t nrows, + int64_t ncols, int64_t nnz, onemklIndex index, int32_t *col_ptr, + int32_t *row_ind, double _Complex *values); + +int onemklZsparse_set_csc_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t + nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t + *col_ptr, int64_t *row_ind, double _Complex *values); + +int onemklSsparse_set_coo_data(syclQueue_t device_queue, matrix_handle_t spmat, int32_t nrows, int32_t ncols, int32_t nnz, onemklIndex index, int32_t *row_ind, int32_t *col_ind, float *values); -int onemklSsparse_set_coo_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t +int onemklSsparse_set_coo_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t *row_ind, int64_t *col_ind, float *values); -int onemklDsparse_set_coo_data(syclQueue_t device_queue, matrix_handle_t spMat, int32_t nrows, +int onemklDsparse_set_coo_data(syclQueue_t device_queue, matrix_handle_t spmat, int32_t nrows, int32_t ncols, int32_t nnz, onemklIndex index, int32_t *row_ind, int32_t *col_ind, double *values); -int onemklDsparse_set_coo_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t +int onemklDsparse_set_coo_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t *row_ind, int64_t *col_ind, double *values); -int onemklCsparse_set_coo_data(syclQueue_t device_queue, matrix_handle_t spMat, int32_t nrows, +int onemklCsparse_set_coo_data(syclQueue_t device_queue, matrix_handle_t spmat, int32_t nrows, int32_t ncols, int32_t nnz, onemklIndex index, int32_t *row_ind, int32_t *col_ind, float _Complex *values); -int onemklCsparse_set_coo_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t +int onemklCsparse_set_coo_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t *row_ind, int64_t *col_ind, float _Complex *values); -int onemklZsparse_set_coo_data(syclQueue_t device_queue, matrix_handle_t spMat, int32_t nrows, +int onemklZsparse_set_coo_data(syclQueue_t device_queue, matrix_handle_t spmat, int32_t nrows, int32_t ncols, int32_t nnz, onemklIndex index, int32_t *row_ind, int32_t *col_ind, double _Complex *values); -int onemklZsparse_set_coo_data_64(syclQueue_t device_queue, matrix_handle_t spMat, int64_t +int onemklZsparse_set_coo_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t nrows, int64_t ncols, int64_t nnz, onemklIndex index, int64_t *row_ind, int64_t *col_ind, double _Complex *values); +int onemklSsparse_set_bsr_data(syclQueue_t device_queue, matrix_handle_t spmat, int64_t + blk_nrows, int64_t blk_ncols, int64_t blk_nnz, int64_t + row_blk_size, int64_t col_blk_size, onemklLayout blk_layout, + onemklIndex index, int32_t *bsr_row_ptr, int32_t *bsr_col_ind, + float *bsr_values); + +int onemklSsparse_set_bsr_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t + blk_nrows, int64_t blk_ncols, int64_t blk_nnz, int64_t + row_blk_size, int64_t col_blk_size, onemklLayout blk_layout, + onemklIndex index, int64_t *bsr_row_ptr, int64_t + *bsr_col_ind, float *bsr_values); + +int onemklDsparse_set_bsr_data(syclQueue_t device_queue, matrix_handle_t spmat, int64_t + blk_nrows, int64_t blk_ncols, int64_t blk_nnz, int64_t + row_blk_size, int64_t col_blk_size, onemklLayout blk_layout, + onemklIndex index, int32_t *bsr_row_ptr, int32_t *bsr_col_ind, + double *bsr_values); + +int onemklDsparse_set_bsr_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t + blk_nrows, int64_t blk_ncols, int64_t blk_nnz, int64_t + row_blk_size, int64_t col_blk_size, onemklLayout blk_layout, + onemklIndex index, int64_t *bsr_row_ptr, int64_t + *bsr_col_ind, double *bsr_values); + +int onemklCsparse_set_bsr_data(syclQueue_t device_queue, matrix_handle_t spmat, int64_t + blk_nrows, int64_t blk_ncols, int64_t blk_nnz, int64_t + row_blk_size, int64_t col_blk_size, onemklLayout blk_layout, + onemklIndex index, int32_t *bsr_row_ptr, int32_t *bsr_col_ind, + float _Complex *bsr_values); + +int onemklCsparse_set_bsr_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t + blk_nrows, int64_t blk_ncols, int64_t blk_nnz, int64_t + row_blk_size, int64_t col_blk_size, onemklLayout blk_layout, + onemklIndex index, int64_t *bsr_row_ptr, int64_t + *bsr_col_ind, float _Complex *bsr_values); + +int onemklZsparse_set_bsr_data(syclQueue_t device_queue, matrix_handle_t spmat, int64_t + blk_nrows, int64_t blk_ncols, int64_t blk_nnz, int64_t + row_blk_size, int64_t col_blk_size, onemklLayout blk_layout, + onemklIndex index, int32_t *bsr_row_ptr, int32_t *bsr_col_ind, + double _Complex *bsr_values); + +int onemklZsparse_set_bsr_data_64(syclQueue_t device_queue, matrix_handle_t spmat, int64_t + blk_nrows, int64_t blk_ncols, int64_t blk_nnz, int64_t + row_blk_size, int64_t col_blk_size, onemklLayout blk_layout, + onemklIndex index, int64_t *bsr_row_ptr, int64_t + *bsr_col_ind, double _Complex *bsr_values); + int onemklXsparse_init_matmat_descr(matmat_descr_t *p_desc); int onemklXsparse_release_matmat_descr(matmat_descr_t *p_desc); diff --git a/lib/level-zero/oneL0.jl b/lib/level-zero/oneL0.jl index f9697029..6bcb17fb 100644 --- a/lib/level-zero/oneL0.jl +++ b/lib/level-zero/oneL0.jl @@ -11,8 +11,10 @@ using Libdl if Sys.iswindows() const libze_loader = "ze_loader" else - using NEO_jll - using oneAPI_Level_Zero_Loader_jll + using NEO_LTS_jll + using oneAPI_Level_Zero_Loader_LTS_jll + const NEO_jll = NEO_LTS_jll + const oneAPI_Level_Zero_Loader_jll = oneAPI_Level_Zero_Loader_LTS_jll end include("utils.jl") diff --git a/lib/support/liboneapi_support.jl b/lib/support/liboneapi_support.jl index 91146c99..5f30a9f1 100644 --- a/lib/support/liboneapi_support.jl +++ b/lib/support/liboneapi_support.jl @@ -6428,107 +6428,195 @@ function onemklZunmqr_batch_scratchpad_size(device_queue, side, trans, m, n, k, group_sizes::Ptr{Int64})::Int64 end -function onemklXsparse_init_matrix_handle(p_spMat) - @ccall liboneapi_support.onemklXsparse_init_matrix_handle(p_spMat::Ptr{matrix_handle_t})::Cint +function onemklXsparse_init_matrix_handle(p_spmat) + @ccall liboneapi_support.onemklXsparse_init_matrix_handle(p_spmat::Ptr{matrix_handle_t})::Cint end -function onemklXsparse_release_matrix_handle(device_queue, p_spMat) +function onemklXsparse_release_matrix_handle(device_queue, p_spmat) @ccall liboneapi_support.onemklXsparse_release_matrix_handle(device_queue::syclQueue_t, - p_spMat::Ptr{matrix_handle_t})::Cint + p_spmat::Ptr{matrix_handle_t})::Cint end -function onemklSsparse_set_csr_data(device_queue, spMat, nrows, ncols, index, row_ptr, +function onemklSsparse_set_csr_data(device_queue, spmat, nrows, ncols, nnz, index, row_ptr, col_ind, values) @ccall liboneapi_support.onemklSsparse_set_csr_data(device_queue::syclQueue_t, - spMat::matrix_handle_t, - nrows::Int32, ncols::Int32, - index::onemklIndex, + spmat::matrix_handle_t, + nrows::Int64, ncols::Int64, + nnz::Int64, index::ZePtr{Int32}, row_ptr::ZePtr{Int32}, - col_ind::ZePtr{Int32}, - values::ZePtr{Cfloat})::Cint + col_ind::ZePtr{Cfloat}, + values::Ptr{Cfloat})::Cint end -function onemklSsparse_set_csr_data_64(device_queue, spMat, nrows, ncols, index, row_ptr, - col_ind, values) +function onemklSsparse_set_csr_data_64(device_queue, spmat, nrows, ncols, nnz, index, + row_ptr, col_ind, values) @ccall liboneapi_support.onemklSsparse_set_csr_data_64(device_queue::syclQueue_t, - spMat::matrix_handle_t, + spmat::matrix_handle_t, nrows::Int64, ncols::Int64, - index::onemklIndex, + nnz::Int64, index::ZePtr{Int64}, row_ptr::ZePtr{Int64}, - col_ind::ZePtr{Int64}, - values::ZePtr{Cfloat})::Cint + col_ind::ZePtr{Cfloat}, + values::Ptr{Cfloat})::Cint end -function onemklDsparse_set_csr_data(device_queue, spMat, nrows, ncols, index, row_ptr, +function onemklDsparse_set_csr_data(device_queue, spmat, nrows, ncols, nnz, index, row_ptr, col_ind, values) @ccall liboneapi_support.onemklDsparse_set_csr_data(device_queue::syclQueue_t, - spMat::matrix_handle_t, - nrows::Int32, ncols::Int32, - index::onemklIndex, + spmat::matrix_handle_t, + nrows::Int64, ncols::Int64, + nnz::Int64, index::ZePtr{Int32}, row_ptr::ZePtr{Int32}, - col_ind::ZePtr{Int32}, - values::ZePtr{Cdouble})::Cint + col_ind::ZePtr{Cdouble}, + values::Ptr{Cdouble})::Cint end -function onemklDsparse_set_csr_data_64(device_queue, spMat, nrows, ncols, index, row_ptr, - col_ind, values) +function onemklDsparse_set_csr_data_64(device_queue, spmat, nrows, ncols, nnz, index, + row_ptr, col_ind, values) @ccall liboneapi_support.onemklDsparse_set_csr_data_64(device_queue::syclQueue_t, - spMat::matrix_handle_t, + spmat::matrix_handle_t, nrows::Int64, ncols::Int64, - index::onemklIndex, + nnz::Int64, index::ZePtr{Int64}, row_ptr::ZePtr{Int64}, - col_ind::ZePtr{Int64}, - values::ZePtr{Cdouble})::Cint + col_ind::ZePtr{Cdouble}, + values::Ptr{Cdouble})::Cint end -function onemklCsparse_set_csr_data(device_queue, spMat, nrows, ncols, index, row_ptr, +function onemklCsparse_set_csr_data(device_queue, spmat, nrows, ncols, nnz, index, row_ptr, col_ind, values) @ccall liboneapi_support.onemklCsparse_set_csr_data(device_queue::syclQueue_t, - spMat::matrix_handle_t, - nrows::Int32, ncols::Int32, - index::onemklIndex, + spmat::matrix_handle_t, + nrows::Int64, ncols::Int64, + nnz::Int64, index::ZePtr{Int32}, row_ptr::ZePtr{Int32}, - col_ind::ZePtr{Int32}, - values::ZePtr{ComplexF32})::Cint + col_ind::ZePtr{ComplexF32}, + values::Ptr{ComplexF32})::Cint end -function onemklCsparse_set_csr_data_64(device_queue, spMat, nrows, ncols, index, row_ptr, - col_ind, values) +function onemklCsparse_set_csr_data_64(device_queue, spmat, nrows, ncols, nnz, index, + row_ptr, col_ind, values) @ccall liboneapi_support.onemklCsparse_set_csr_data_64(device_queue::syclQueue_t, - spMat::matrix_handle_t, + spmat::matrix_handle_t, nrows::Int64, ncols::Int64, - index::onemklIndex, + nnz::Int64, index::ZePtr{Int64}, row_ptr::ZePtr{Int64}, - col_ind::ZePtr{Int64}, - values::ZePtr{ComplexF32})::Cint + col_ind::ZePtr{ComplexF32}, + values::Ptr{ComplexF32})::Cint end -function onemklZsparse_set_csr_data(device_queue, spMat, nrows, ncols, index, row_ptr, +function onemklZsparse_set_csr_data(device_queue, spmat, nrows, ncols, nnz, index, row_ptr, col_ind, values) @ccall liboneapi_support.onemklZsparse_set_csr_data(device_queue::syclQueue_t, - spMat::matrix_handle_t, - nrows::Int32, ncols::Int32, - index::onemklIndex, + spmat::matrix_handle_t, + nrows::Int64, ncols::Int64, + nnz::Int64, index::ZePtr{Int32}, row_ptr::ZePtr{Int32}, - col_ind::ZePtr{Int32}, - values::ZePtr{ComplexF64})::Cint + col_ind::ZePtr{ComplexF64}, + values::Ptr{ComplexF32})::Cint end -function onemklZsparse_set_csr_data_64(device_queue, spMat, nrows, ncols, index, row_ptr, - col_ind, values) +function onemklZsparse_set_csr_data_64(device_queue, spmat, nrows, ncols, nnz, index, + row_ptr, col_ind, values) @ccall liboneapi_support.onemklZsparse_set_csr_data_64(device_queue::syclQueue_t, - spMat::matrix_handle_t, + spmat::matrix_handle_t, nrows::Int64, ncols::Int64, - index::onemklIndex, + nnz::Int64, index::ZePtr{Int64}, row_ptr::ZePtr{Int64}, - col_ind::ZePtr{Int64}, - values::ZePtr{ComplexF64})::Cint + col_ind::ZePtr{ComplexF64}, + values::Ptr{ComplexF32})::Cint +end + +function onemklSsparse_set_csc_data(device_queue, spMat, nrows, ncols, nnz, index, col_ptr, + row_ind, values) + @ccall liboneapi_support.onemklSsparse_set_csc_data(device_queue::syclQueue_t, + spMat::matrix_handle_t, + nrows::Int64, ncols::Int64, + nnz::Int64, index::onemklIndex, + col_ptr::Ptr{Int32}, + row_ind::Ptr{Int32}, + values::Ptr{Cfloat})::Cint +end + +function onemklSsparse_set_csc_data_64(device_queue, spMat, nrows, ncols, nnz, index, + col_ptr, row_ind, values) + @ccall liboneapi_support.onemklSsparse_set_csc_data_64(device_queue::syclQueue_t, + spMat::matrix_handle_t, + nrows::Int64, ncols::Int64, + nnz::Int64, index::onemklIndex, + col_ptr::Ptr{Int64}, + row_ind::Ptr{Int64}, + values::Ptr{Cfloat})::Cint +end + +function onemklDsparse_set_csc_data(device_queue, spMat, nrows, ncols, nnz, index, col_ptr, + row_ind, values) + @ccall liboneapi_support.onemklDsparse_set_csc_data(device_queue::syclQueue_t, + spMat::matrix_handle_t, + nrows::Int64, ncols::Int64, + nnz::Int64, index::onemklIndex, + col_ptr::Ptr{Int32}, + row_ind::Ptr{Int32}, + values::Ptr{Cdouble})::Cint +end + +function onemklDsparse_set_csc_data_64(device_queue, spMat, nrows, ncols, nnz, index, + col_ptr, row_ind, values) + @ccall liboneapi_support.onemklDsparse_set_csc_data_64(device_queue::syclQueue_t, + spMat::matrix_handle_t, + nrows::Int64, ncols::Int64, + nnz::Int64, index::onemklIndex, + col_ptr::Ptr{Int64}, + row_ind::Ptr{Int64}, + values::Ptr{Cdouble})::Cint +end + +function onemklCsparse_set_csc_data(device_queue, spMat, nrows, ncols, nnz, index, col_ptr, + row_ind, values) + @ccall liboneapi_support.onemklCsparse_set_csc_data(device_queue::syclQueue_t, + spMat::matrix_handle_t, + nrows::Int64, ncols::Int64, + nnz::Int64, index::onemklIndex, + col_ptr::Ptr{Int32}, + row_ind::Ptr{Int32}, + values::Ptr{ComplexF32})::Cint +end + +function onemklCsparse_set_csc_data_64(device_queue, spMat, nrows, ncols, nnz, index, + col_ptr, row_ind, values) + @ccall liboneapi_support.onemklCsparse_set_csc_data_64(device_queue::syclQueue_t, + spMat::matrix_handle_t, + nrows::Int64, ncols::Int64, + nnz::Int64, index::onemklIndex, + col_ptr::Ptr{Int64}, + row_ind::Ptr{Int64}, + values::Ptr{ComplexF32})::Cint end -function onemklSsparse_set_coo_data(device_queue, spMat, nrows, ncols, nnz, index, row_ind, +function onemklZsparse_set_csc_data(device_queue, spMat, nrows, ncols, nnz, index, col_ptr, + row_ind, values) + @ccall liboneapi_support.onemklZsparse_set_csc_data(device_queue::syclQueue_t, + spMat::matrix_handle_t, + nrows::Int64, ncols::Int64, + nnz::Int64, index::onemklIndex, + col_ptr::Ptr{Int32}, + row_ind::Ptr{Int32}, + values::Ptr{ComplexF32})::Cint +end + +function onemklZsparse_set_csc_data_64(device_queue, spMat, nrows, ncols, nnz, index, + col_ptr, row_ind, values) + @ccall liboneapi_support.onemklZsparse_set_csc_data_64(device_queue::syclQueue_t, + spMat::matrix_handle_t, + nrows::Int64, ncols::Int64, + nnz::Int64, index::onemklIndex, + col_ptr::Ptr{Int64}, + row_ind::Ptr{Int64}, + values::Ptr{ComplexF32})::Cint +end + +function onemklSsparse_set_coo_data(device_queue, spmat, nrows, ncols, nnz, index, row_ind, col_ind, values) @ccall liboneapi_support.onemklSsparse_set_coo_data(device_queue::syclQueue_t, - spMat::matrix_handle_t, + spmat::matrix_handle_t, nrows::Int32, ncols::Int32, nnz::Int32, index::onemklIndex, row_ind::ZePtr{Int32}, @@ -6536,10 +6624,10 @@ function onemklSsparse_set_coo_data(device_queue, spMat, nrows, ncols, nnz, inde values::ZePtr{Cfloat})::Cint end -function onemklSsparse_set_coo_data_64(device_queue, spMat, nrows, ncols, nnz, index, +function onemklSsparse_set_coo_data_64(device_queue, spmat, nrows, ncols, nnz, index, row_ind, col_ind, values) @ccall liboneapi_support.onemklSsparse_set_coo_data_64(device_queue::syclQueue_t, - spMat::matrix_handle_t, + spmat::matrix_handle_t, nrows::Int64, ncols::Int64, nnz::Int64, index::onemklIndex, row_ind::ZePtr{Int64}, @@ -6547,10 +6635,10 @@ function onemklSsparse_set_coo_data_64(device_queue, spMat, nrows, ncols, nnz, i values::ZePtr{Cfloat})::Cint end -function onemklDsparse_set_coo_data(device_queue, spMat, nrows, ncols, nnz, index, row_ind, +function onemklDsparse_set_coo_data(device_queue, spmat, nrows, ncols, nnz, index, row_ind, col_ind, values) @ccall liboneapi_support.onemklDsparse_set_coo_data(device_queue::syclQueue_t, - spMat::matrix_handle_t, + spmat::matrix_handle_t, nrows::Int32, ncols::Int32, nnz::Int32, index::onemklIndex, row_ind::ZePtr{Int32}, @@ -6558,10 +6646,10 @@ function onemklDsparse_set_coo_data(device_queue, spMat, nrows, ncols, nnz, inde values::ZePtr{Cdouble})::Cint end -function onemklDsparse_set_coo_data_64(device_queue, spMat, nrows, ncols, nnz, index, +function onemklDsparse_set_coo_data_64(device_queue, spmat, nrows, ncols, nnz, index, row_ind, col_ind, values) @ccall liboneapi_support.onemklDsparse_set_coo_data_64(device_queue::syclQueue_t, - spMat::matrix_handle_t, + spmat::matrix_handle_t, nrows::Int64, ncols::Int64, nnz::Int64, index::onemklIndex, row_ind::ZePtr{Int64}, @@ -6569,10 +6657,10 @@ function onemklDsparse_set_coo_data_64(device_queue, spMat, nrows, ncols, nnz, i values::ZePtr{Cdouble})::Cint end -function onemklCsparse_set_coo_data(device_queue, spMat, nrows, ncols, nnz, index, row_ind, +function onemklCsparse_set_coo_data(device_queue, spmat, nrows, ncols, nnz, index, row_ind, col_ind, values) @ccall liboneapi_support.onemklCsparse_set_coo_data(device_queue::syclQueue_t, - spMat::matrix_handle_t, + spmat::matrix_handle_t, nrows::Int32, ncols::Int32, nnz::Int32, index::onemklIndex, row_ind::ZePtr{Int32}, @@ -6580,10 +6668,10 @@ function onemklCsparse_set_coo_data(device_queue, spMat, nrows, ncols, nnz, inde values::ZePtr{ComplexF32})::Cint end -function onemklCsparse_set_coo_data_64(device_queue, spMat, nrows, ncols, nnz, index, +function onemklCsparse_set_coo_data_64(device_queue, spmat, nrows, ncols, nnz, index, row_ind, col_ind, values) @ccall liboneapi_support.onemklCsparse_set_coo_data_64(device_queue::syclQueue_t, - spMat::matrix_handle_t, + spmat::matrix_handle_t, nrows::Int64, ncols::Int64, nnz::Int64, index::onemklIndex, row_ind::ZePtr{Int64}, @@ -6591,10 +6679,10 @@ function onemklCsparse_set_coo_data_64(device_queue, spMat, nrows, ncols, nnz, i values::ZePtr{ComplexF32})::Cint end -function onemklZsparse_set_coo_data(device_queue, spMat, nrows, ncols, nnz, index, row_ind, +function onemklZsparse_set_coo_data(device_queue, spmat, nrows, ncols, nnz, index, row_ind, col_ind, values) @ccall liboneapi_support.onemklZsparse_set_coo_data(device_queue::syclQueue_t, - spMat::matrix_handle_t, + spmat::matrix_handle_t, nrows::Int32, ncols::Int32, nnz::Int32, index::onemklIndex, row_ind::ZePtr{Int32}, @@ -6602,10 +6690,10 @@ function onemklZsparse_set_coo_data(device_queue, spMat, nrows, ncols, nnz, inde values::ZePtr{ComplexF64})::Cint end -function onemklZsparse_set_coo_data_64(device_queue, spMat, nrows, ncols, nnz, index, +function onemklZsparse_set_coo_data_64(device_queue, spmat, nrows, ncols, nnz, index, row_ind, col_ind, values) @ccall liboneapi_support.onemklZsparse_set_coo_data_64(device_queue::syclQueue_t, - spMat::matrix_handle_t, + spmat::matrix_handle_t, nrows::Int64, ncols::Int64, nnz::Int64, index::onemklIndex, row_ind::ZePtr{Int64}, @@ -6613,6 +6701,130 @@ function onemklZsparse_set_coo_data_64(device_queue, spMat, nrows, ncols, nnz, i values::ZePtr{ComplexF64})::Cint end +function onemklSsparse_set_bsr_data(device_queue, spmat, blk_nrows, blk_ncols, blk_nnz, + row_blk_size, col_blk_size, blk_layout, index, + bsr_row_ptr, bsr_col_ind, bsr_values) + @ccall liboneapi_support.onemklSsparse_set_bsr_data(device_queue::syclQueue_t, + spmat::matrix_handle_t, + blk_nrows::Int64, blk_ncols::Int64, + blk_nnz::Int64, row_blk_size::Int64, + col_blk_size::Int64, + blk_layout::onemklLayout, + index::onemklIndex, + bsr_row_ptr::Ptr{Int32}, + bsr_col_ind::Ptr{Int32}, + bsr_values::Ptr{Cfloat})::Cint +end + +function onemklSsparse_set_bsr_data_64(device_queue, spmat, blk_nrows, blk_ncols, blk_nnz, + row_blk_size, col_blk_size, blk_layout, index, + bsr_row_ptr, bsr_col_ind, bsr_values) + @ccall liboneapi_support.onemklSsparse_set_bsr_data_64(device_queue::syclQueue_t, + spmat::matrix_handle_t, + blk_nrows::Int64, + blk_ncols::Int64, blk_nnz::Int64, + row_blk_size::Int64, + col_blk_size::Int64, + blk_layout::onemklLayout, + index::onemklIndex, + bsr_row_ptr::Ptr{Int64}, + bsr_col_ind::Ptr{Int64}, + bsr_values::Ptr{Cfloat})::Cint +end + +function onemklDsparse_set_bsr_data(device_queue, spmat, blk_nrows, blk_ncols, blk_nnz, + row_blk_size, col_blk_size, blk_layout, index, + bsr_row_ptr, bsr_col_ind, bsr_values) + @ccall liboneapi_support.onemklDsparse_set_bsr_data(device_queue::syclQueue_t, + spmat::matrix_handle_t, + blk_nrows::Int64, blk_ncols::Int64, + blk_nnz::Int64, row_blk_size::Int64, + col_blk_size::Int64, + blk_layout::onemklLayout, + index::onemklIndex, + bsr_row_ptr::Ptr{Int32}, + bsr_col_ind::Ptr{Int32}, + bsr_values::Ptr{Cdouble})::Cint +end + +function onemklDsparse_set_bsr_data_64(device_queue, spmat, blk_nrows, blk_ncols, blk_nnz, + row_blk_size, col_blk_size, blk_layout, index, + bsr_row_ptr, bsr_col_ind, bsr_values) + @ccall liboneapi_support.onemklDsparse_set_bsr_data_64(device_queue::syclQueue_t, + spmat::matrix_handle_t, + blk_nrows::Int64, + blk_ncols::Int64, blk_nnz::Int64, + row_blk_size::Int64, + col_blk_size::Int64, + blk_layout::onemklLayout, + index::onemklIndex, + bsr_row_ptr::Ptr{Int64}, + bsr_col_ind::Ptr{Int64}, + bsr_values::Ptr{Cdouble})::Cint +end + +function onemklCsparse_set_bsr_data(device_queue, spmat, blk_nrows, blk_ncols, blk_nnz, + row_blk_size, col_blk_size, blk_layout, index, + bsr_row_ptr, bsr_col_ind, bsr_values) + @ccall liboneapi_support.onemklCsparse_set_bsr_data(device_queue::syclQueue_t, + spmat::matrix_handle_t, + blk_nrows::Int64, blk_ncols::Int64, + blk_nnz::Int64, row_blk_size::Int64, + col_blk_size::Int64, + blk_layout::onemklLayout, + index::onemklIndex, + bsr_row_ptr::Ptr{Int32}, + bsr_col_ind::Ptr{Int32}, + bsr_values::Ptr{ComplexF32})::Cint +end + +function onemklCsparse_set_bsr_data_64(device_queue, spmat, blk_nrows, blk_ncols, blk_nnz, + row_blk_size, col_blk_size, blk_layout, index, + bsr_row_ptr, bsr_col_ind, bsr_values) + @ccall liboneapi_support.onemklCsparse_set_bsr_data_64(device_queue::syclQueue_t, + spmat::matrix_handle_t, + blk_nrows::Int64, + blk_ncols::Int64, blk_nnz::Int64, + row_blk_size::Int64, + col_blk_size::Int64, + blk_layout::onemklLayout, + index::onemklIndex, + bsr_row_ptr::Ptr{Int64}, + bsr_col_ind::Ptr{Int64}, + bsr_values::Ptr{ComplexF32})::Cint +end + +function onemklZsparse_set_bsr_data(device_queue, spmat, blk_nrows, blk_ncols, blk_nnz, + row_blk_size, col_blk_size, blk_layout, index, + bsr_row_ptr, bsr_col_ind, bsr_values) + @ccall liboneapi_support.onemklZsparse_set_bsr_data(device_queue::syclQueue_t, + spmat::matrix_handle_t, + blk_nrows::Int64, blk_ncols::Int64, + blk_nnz::Int64, row_blk_size::Int64, + col_blk_size::Int64, + blk_layout::onemklLayout, + index::onemklIndex, + bsr_row_ptr::Ptr{Int32}, + bsr_col_ind::Ptr{Int32}, + bsr_values::Ptr{ComplexF32})::Cint +end + +function onemklZsparse_set_bsr_data_64(device_queue, spmat, blk_nrows, blk_ncols, blk_nnz, + row_blk_size, col_blk_size, blk_layout, index, + bsr_row_ptr, bsr_col_ind, bsr_values) + @ccall liboneapi_support.onemklZsparse_set_bsr_data_64(device_queue::syclQueue_t, + spmat::matrix_handle_t, + blk_nrows::Int64, + blk_ncols::Int64, blk_nnz::Int64, + row_blk_size::Int64, + col_blk_size::Int64, + blk_layout::onemklLayout, + index::onemklIndex, + bsr_row_ptr::Ptr{Int64}, + bsr_col_ind::Ptr{Int64}, + bsr_values::Ptr{ComplexF32})::Cint +end + function onemklXsparse_init_matmat_descr(p_desc) @ccall liboneapi_support.onemklXsparse_init_matmat_descr(p_desc::Ptr{matmat_descr_t})::Cint end diff --git a/res/Project.toml b/res/Project.toml index f7b4a7fe..20a2ab05 100644 --- a/res/Project.toml +++ b/res/Project.toml @@ -1,4 +1,4 @@ [deps] Clang = "40e3b903-d033-50b4-a0cc-940c62c95e31" JuliaFormatter = "98e50ef6-434e-11e9-1051-2b60c6c9e899" -oneAPI_Level_Zero_Headers_jll = "f4bc562b-d309-54f8-9efb-476e56f0410d" +oneAPI_Level_Zero_Headers_LTS_jll = "d79c0b2e-896c-561b-aab9-323701ec0314" diff --git a/res/wrap.jl b/res/wrap.jl index 1d48315e..a86428d2 100644 --- a/res/wrap.jl +++ b/res/wrap.jl @@ -108,7 +108,7 @@ end # Main application # -using oneAPI_Level_Zero_Headers_jll +import oneAPI_Level_Zero_Headers_LTS_jll as oneAPI_Level_Zero_Headers_jll function main() wrap("ze", oneAPI_Level_Zero_Headers_jll.ze_api) diff --git a/src/oneAPI.jl b/src/oneAPI.jl index 33afe596..6b65d744 100644 --- a/src/oneAPI.jl +++ b/src/oneAPI.jl @@ -148,7 +148,7 @@ function __init__() end function set_debug!(debug::Bool) - for jll in [oneL0.NEO_jll, oneL0.NEO_jll.libigc_jll] + for jll in [oneL0.NEO_jll, oneL0.NEO_jll.libigc_LTS_jll] Preferences.set_preferences!(jll, "debug" => string(debug); force=true) end @info "oneAPI debug mode $(debug ? "enabled" : "disabled"); please re-start Julia." diff --git a/src/utils.jl b/src/utils.jl index 0516de03..e7d232ec 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -2,7 +2,7 @@ function versioninfo(io::IO=stdout) if Sys.islinux() println(io, "Binary dependencies:") - for jll in [oneL0.NEO_jll, oneL0.NEO_jll.libigc_jll, oneL0.NEO_jll.gmmlib_jll, + for jll in [oneL0.NEO_jll, oneL0.NEO_jll.libigc_LTS_jll, oneL0.NEO_jll.gmmlib_jll, SPIRV_LLVM_Translator_jll, SPIRV_Tools_jll, oneAPI_Support_jll] name = string(jll) print(io, "- $(name[1:end-4]): $(Base.pkgversion(jll))") diff --git a/test/Project.toml b/test/Project.toml index 1e877a7e..fa5557e9 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -9,7 +9,7 @@ InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -NEO_jll = "700fe977-ac61-5f37-bbc8-c6c4b2b6a9fd" +NEO_LTS_jll = "a724f90f-ce79-56dd-a1bd-b9de5a61085f" ParallelTestRunner = "d3525ed8-44d0-4b2c-a655-542cee43accc" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" REPL = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" @@ -19,7 +19,7 @@ SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" -libigc_jll = "94295238-5935-5bd7-bb0f-b00942e9bdd5" +libigc_LTS_jll = "9a8258a1-e827-5686-bee9-144461246960" oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" oneAPI_Support_jll = "b049733a-a71d-5ed3-8eba-7d323ac00b36" From d427139d6a9dcd9c97339e61b44c37caacd4da9b Mon Sep 17 00:00:00 2001 From: Michel Schanen Date: Mon, 11 May 2026 11:56:39 -0500 Subject: [PATCH 02/12] NEO link fix --- src/oneAPI.jl | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/oneAPI.jl b/src/oneAPI.jl index 6b65d744..5e6b349f 100644 --- a/src/oneAPI.jl +++ b/src/oneAPI.jl @@ -138,6 +138,15 @@ function __init__() else # ensure that the OpenCL loader finds the ICD files from our artifacts ENV["OCL_ICD_FILENAMES"] = oneL0.NEO_jll.libigdrcl + + # ensure that libsycl's bundled ze_lib finds NEO's libze_intel_gpu via + # path-based driver discovery (it does not reuse the JLL-loaded module). + # Required when no system NEO is installed. + neo_libdir = dirname(oneL0.NEO_jll.libze_intel_gpu) + ld = get(ENV, "LD_LIBRARY_PATH", "") + if !occursin(neo_libdir, ld) + ENV["LD_LIBRARY_PATH"] = isempty(ld) ? neo_libdir : "$neo_libdir:$ld" + end end # XXX: work around an issue with SYCL/Level Zero interoperability From 41bd558aab18ea78c6914330a9295e0236e4815c Mon Sep 17 00:00:00 2001 From: Michel Schanen Date: Fri, 5 Jun 2026 17:17:40 +0000 Subject: [PATCH 03/12] Fix oneMKL sparse CSR bindings for the LTS stack --- lib/mkl/wrappers_sparse.jl | 4 +-- lib/support/liboneapi_support.jl | 48 ++++++++++++++++---------------- res/support.toml | 10 ++++--- 3 files changed, 32 insertions(+), 30 deletions(-) diff --git a/lib/mkl/wrappers_sparse.jl b/lib/mkl/wrappers_sparse.jl index 8e58956b..92cd0d43 100644 --- a/lib/mkl/wrappers_sparse.jl +++ b/lib/mkl/wrappers_sparse.jl @@ -60,7 +60,7 @@ for (fname, elty, intty) in ((:onemklSsparse_set_csr_data , :Float32 , :Int3 queue = global_queue(context(nzVal), device(nzVal)) # Don't update handle if matrix is empty if m != 0 && n != 0 - $fname(sycl_queue(queue), handle_ptr[], m, n, 'O', rowPtr, colVal, nzVal) + $fname(sycl_queue(queue), handle_ptr[], m, n, nnzA, 'O', rowPtr, colVal, nzVal) dA = oneSparseMatrixCSR{$elty, $intty}(handle_ptr[], rowPtr, colVal, nzVal, (m, n), nnzA) finalizer(sparse_release_matrix_handle, dA) else @@ -81,7 +81,7 @@ for (fname, elty, intty) in ((:onemklSsparse_set_csr_data , :Float32 , :Int3 nnzA = length(nzVal) # Don't update handle if matrix is empty if m != 0 && n != 0 - $fname(sycl_queue(queue), handle_ptr[], n, m, 'O', colPtr, rowVal, nzVal) # CSC of A is CSR of Aᵀ + $fname(sycl_queue(queue), handle_ptr[], n, m, nnzA, 'O', colPtr, rowVal, nzVal) # CSC of A is CSR of Aᵀ dA = oneSparseMatrixCSC{$elty, $intty}(handle_ptr[], colPtr, rowVal, nzVal, (m, n), nnzA) finalizer(sparse_release_matrix_handle, dA) else diff --git a/lib/support/liboneapi_support.jl b/lib/support/liboneapi_support.jl index 5f30a9f1..e1b7327f 100644 --- a/lib/support/liboneapi_support.jl +++ b/lib/support/liboneapi_support.jl @@ -6442,10 +6442,10 @@ function onemklSsparse_set_csr_data(device_queue, spmat, nrows, ncols, nnz, inde @ccall liboneapi_support.onemklSsparse_set_csr_data(device_queue::syclQueue_t, spmat::matrix_handle_t, nrows::Int64, ncols::Int64, - nnz::Int64, index::ZePtr{Int32}, + nnz::Int64, index::onemklIndex, row_ptr::ZePtr{Int32}, - col_ind::ZePtr{Cfloat}, - values::Ptr{Cfloat})::Cint + col_ind::ZePtr{Int32}, + values::ZePtr{Cfloat})::Cint end function onemklSsparse_set_csr_data_64(device_queue, spmat, nrows, ncols, nnz, index, @@ -6453,10 +6453,10 @@ function onemklSsparse_set_csr_data_64(device_queue, spmat, nrows, ncols, nnz, i @ccall liboneapi_support.onemklSsparse_set_csr_data_64(device_queue::syclQueue_t, spmat::matrix_handle_t, nrows::Int64, ncols::Int64, - nnz::Int64, index::ZePtr{Int64}, + nnz::Int64, index::onemklIndex, row_ptr::ZePtr{Int64}, - col_ind::ZePtr{Cfloat}, - values::Ptr{Cfloat})::Cint + col_ind::ZePtr{Int64}, + values::ZePtr{Cfloat})::Cint end function onemklDsparse_set_csr_data(device_queue, spmat, nrows, ncols, nnz, index, row_ptr, @@ -6464,10 +6464,10 @@ function onemklDsparse_set_csr_data(device_queue, spmat, nrows, ncols, nnz, inde @ccall liboneapi_support.onemklDsparse_set_csr_data(device_queue::syclQueue_t, spmat::matrix_handle_t, nrows::Int64, ncols::Int64, - nnz::Int64, index::ZePtr{Int32}, + nnz::Int64, index::onemklIndex, row_ptr::ZePtr{Int32}, - col_ind::ZePtr{Cdouble}, - values::Ptr{Cdouble})::Cint + col_ind::ZePtr{Int32}, + values::ZePtr{Cdouble})::Cint end function onemklDsparse_set_csr_data_64(device_queue, spmat, nrows, ncols, nnz, index, @@ -6475,10 +6475,10 @@ function onemklDsparse_set_csr_data_64(device_queue, spmat, nrows, ncols, nnz, i @ccall liboneapi_support.onemklDsparse_set_csr_data_64(device_queue::syclQueue_t, spmat::matrix_handle_t, nrows::Int64, ncols::Int64, - nnz::Int64, index::ZePtr{Int64}, + nnz::Int64, index::onemklIndex, row_ptr::ZePtr{Int64}, - col_ind::ZePtr{Cdouble}, - values::Ptr{Cdouble})::Cint + col_ind::ZePtr{Int64}, + values::ZePtr{Cdouble})::Cint end function onemklCsparse_set_csr_data(device_queue, spmat, nrows, ncols, nnz, index, row_ptr, @@ -6486,10 +6486,10 @@ function onemklCsparse_set_csr_data(device_queue, spmat, nrows, ncols, nnz, inde @ccall liboneapi_support.onemklCsparse_set_csr_data(device_queue::syclQueue_t, spmat::matrix_handle_t, nrows::Int64, ncols::Int64, - nnz::Int64, index::ZePtr{Int32}, + nnz::Int64, index::onemklIndex, row_ptr::ZePtr{Int32}, - col_ind::ZePtr{ComplexF32}, - values::Ptr{ComplexF32})::Cint + col_ind::ZePtr{Int32}, + values::ZePtr{ComplexF32})::Cint end function onemklCsparse_set_csr_data_64(device_queue, spmat, nrows, ncols, nnz, index, @@ -6497,10 +6497,10 @@ function onemklCsparse_set_csr_data_64(device_queue, spmat, nrows, ncols, nnz, i @ccall liboneapi_support.onemklCsparse_set_csr_data_64(device_queue::syclQueue_t, spmat::matrix_handle_t, nrows::Int64, ncols::Int64, - nnz::Int64, index::ZePtr{Int64}, + nnz::Int64, index::onemklIndex, row_ptr::ZePtr{Int64}, - col_ind::ZePtr{ComplexF32}, - values::Ptr{ComplexF32})::Cint + col_ind::ZePtr{Int64}, + values::ZePtr{ComplexF32})::Cint end function onemklZsparse_set_csr_data(device_queue, spmat, nrows, ncols, nnz, index, row_ptr, @@ -6508,10 +6508,10 @@ function onemklZsparse_set_csr_data(device_queue, spmat, nrows, ncols, nnz, inde @ccall liboneapi_support.onemklZsparse_set_csr_data(device_queue::syclQueue_t, spmat::matrix_handle_t, nrows::Int64, ncols::Int64, - nnz::Int64, index::ZePtr{Int32}, + nnz::Int64, index::onemklIndex, row_ptr::ZePtr{Int32}, - col_ind::ZePtr{ComplexF64}, - values::Ptr{ComplexF32})::Cint + col_ind::ZePtr{Int32}, + values::ZePtr{ComplexF64})::Cint end function onemklZsparse_set_csr_data_64(device_queue, spmat, nrows, ncols, nnz, index, @@ -6519,10 +6519,10 @@ function onemklZsparse_set_csr_data_64(device_queue, spmat, nrows, ncols, nnz, i @ccall liboneapi_support.onemklZsparse_set_csr_data_64(device_queue::syclQueue_t, spmat::matrix_handle_t, nrows::Int64, ncols::Int64, - nnz::Int64, index::ZePtr{Int64}, + nnz::Int64, index::onemklIndex, row_ptr::ZePtr{Int64}, - col_ind::ZePtr{ComplexF64}, - values::Ptr{ComplexF32})::Cint + col_ind::ZePtr{Int64}, + values::ZePtr{ComplexF64})::Cint end function onemklSsparse_set_csc_data(device_queue, spMat, nrows, ncols, nnz, index, col_ptr, diff --git a/res/support.toml b/res/support.toml index 230ac406..ac641b90 100644 --- a/res/support.toml +++ b/res/support.toml @@ -370,15 +370,17 @@ use_ccall_macro = true 6 = "ZePtr{Float32}" 8 = "Ref{Float32}" +# Argument positions account for the `nnz` parameter (arg 5) and the +# `index::onemklIndex` enum (arg 6); the device pointers are args 7-9. [api.onemklXsparse_set_csr_data.argtypes] -6 = "ZePtr{Int32}" 7 = "ZePtr{Int32}" -8 = "ZePtr{T}" +8 = "ZePtr{Int32}" +9 = "ZePtr{T}" [api.onemklXsparse_set_csr_data_64.argtypes] -6 = "ZePtr{Int64}" 7 = "ZePtr{Int64}" -8 = "ZePtr{T}" +8 = "ZePtr{Int64}" +9 = "ZePtr{T}" [api.onemklXsparse_set_coo_data.argtypes] 7 = "ZePtr{Int32}" From c6d068cc986a715f1e050cac2c95ead88b978f99 Mon Sep 17 00:00:00 2001 From: Michel Schanen Date: Tue, 9 Jun 2026 16:02:43 +0000 Subject: [PATCH 04/12] Fix silent corruption in strided mapreducedim! on the LTS stack --- src/mapreduce.jl | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/src/mapreduce.jl b/src/mapreduce.jl index 822b9b16..28725c84 100644 --- a/src/mapreduce.jl +++ b/src/mapreduce.jl @@ -105,6 +105,32 @@ function partial_mapreduce_device(f, op, neutral, maxitems, Rreduce, Rother, R, return end +# Coalesced reduction for when the contiguous leading dimension is NOT reduced (the reduced +# axes are strided). One work-item per output slice (Rother element), grid-strided; each +# serially reduces over Rreduce. Consecutive work-items map to consecutive output slices, +# which are consecutive in memory, so global reads are coalesced across lanes — the access +# pattern a `dims=1` reduction already uses. On the Aurora LTS stack the workgroup-per-slice +# kernel above reads a *strided* reduced dimension non-coalesced, which silently corrupts +# large reductions (e.g. `sum(A; dims=2)`); this path avoids that pattern entirely. +function coalesced_mapreduce_device(f, op, neutral, Rreduce, Rother, R, As...) + iother = (get_group_id() - 1) * get_local_size() + get_local_id() + gstride = get_num_groups() * get_local_size() + @inbounds while iother <= length(Rother) + Iother = Rother[iother] + Iout = CartesianIndex(Tuple(Iother)..., 1) + neut = neutral === nothing ? R[Iout] : neutral + val = op(neut, neut) + for ireduce in 1:length(Rreduce) + Ireduce = Rreduce[ireduce] + J = max(Iother, Ireduce) + val = op(val, f(_map_getindex(As, J)...)) + end + R[Iout] = val + iother += gstride + end + return +end + ## COV_EXCL_STOP function GPUArrays.mapreducedim!(f::F, op::OP, R::oneWrappedArray{T}, @@ -133,6 +159,20 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::oneWrappedArray{T}, # but allows us to write a generalized kernel supporting partial reductions. R′ = reshape(R, (size(R)..., 1)) + # Aurora LTS workaround: the workgroup-per-slice kernel below reads a *strided* reduced + # dimension non-coalesced, which silently corrupts reductions on this stack (regardless of + # output count — it depends on the reduction length, not the number of slices). Whenever + # the contiguous leading dimension is NOT reduced (`size(Rreduce, 1) == 1`), use the + # coalesced one-work-item-per-slice kernel, whose lanes read consecutive memory. Few-slice + # reductions get less parallelism but stay correct; the common many-slice case is also fast. + if size(Rreduce, 1) == 1 + items = clamp(length(Rother), 1, 256) + groups = min(cld(length(Rother), items), 1024) + @oneapi items=items groups=groups coalesced_mapreduce_device( + f, op, init, Rreduce, Rother, R′, A) + return R + end + # how many items do we want? # # items in a group work together to reduce values across the reduction dimensions; From 34545ec8f7cd372f5ca0397349b5e0331cd9d3c2 Mon Sep 17 00:00:00 2001 From: Michel Schanen Date: Wed, 10 Jun 2026 19:11:51 +0000 Subject: [PATCH 05/12] Synchronize queues before freeing buffers on the LTS stack --- src/context.jl | 51 +++++++++++++++++++++++++++++++++++++++++++++++++- src/pool.jl | 9 +++++++++ 2 files changed, 59 insertions(+), 1 deletion(-) diff --git a/src/context.jl b/src/context.jl index 88035e74..14f1e593 100644 --- a/src/context.jl +++ b/src/context.jl @@ -224,8 +224,57 @@ function global_queue(ctx::ZeContext, dev::ZeDevice) # NOTE: dev purposefully does not default to context() or device() to stress that # objects should track ownership, and not rely on implicit global state. get!(task_local_storage(), (:ZeCommandQueue, ctx, dev)) do - ZeCommandQueue(ctx, dev; flags = oneL0.ZE_COMMAND_QUEUE_FLAG_IN_ORDER) + queue = ZeCommandQueue(ctx, dev; flags = oneL0.ZE_COMMAND_QUEUE_FLAG_IN_ORDER) + # disable finalizers while mutating the registry: a GC-driven finalizer on this + # task could call back into `synchronize_all_queues` (the lock is reentrant) and + # observe/mutate the registry mid-update. + GC.enable_finalizers(false) + try + @lock queue_registry_lock begin + push!(get!(Vector{WeakRef}, queue_registry, (ctx, dev)), WeakRef(queue)) + end + finally + GC.enable_finalizers(true) + end + queue + end +end + +# Registry of all queues created through `global_queue`, across tasks. Buffers can be +# freed from any task (GC finalizers), so `release` needs to be able to find the queues +# that may still have work in flight referencing the buffer; queues themselves are +# cached task-locally and would otherwise be unreachable from the finalizing task. +const queue_registry_lock = ReentrantLock() +const queue_registry = Dict{Tuple{ZeContext,ZeDevice},Vector{WeakRef}}() + +# synchronize all known queues that target the given context (and device, if specified), +# i.e., all queues whose in-flight work could possibly reference an allocation that is +# about to be freed. +function synchronize_all_queues(ctx::ZeContext, dev::Union{ZeDevice,Nothing}) + queues = ZeCommandQueue[] + GC.enable_finalizers(false) + try + @lock queue_registry_lock begin + for ((qctx, qdev), refs) in queue_registry + qctx == ctx || continue + (dev === nothing || qdev == dev) || continue + filter!(refs) do ref + queue = ref.value + queue === nothing && return false + push!(queues, queue) + true + end + end + end + finally + GC.enable_finalizers(true) + end + # synchronize outside the lock: this can block for as long as a kernel runs, + # and finalizers running concurrently also need to take the lock. + for queue in queues + oneL0.synchronize(queue) end + return end """ diff --git a/src/pool.jl b/src/pool.jl index 165a7f07..e541d26e 100644 --- a/src/pool.jl +++ b/src/pool.jl @@ -77,6 +77,15 @@ function release(buf::oneL0.AbstractBuffer) # evict(ctx, dev, buf) #end + # NEO (at least the 25.18 LTS release) does not honor the BLOCKING_FREE/DEFER_FREE + # policies of zeMemFreeExt: it advertises ZE_extension_memory_free_policies but + # unmaps the allocation immediately, even with work in flight that references it. + # That turns a GC-driven free of a dead array whose last kernel/copy hasn't retired + # into a GPU pagefault, which gets the kernel context banned and makes every later + # submission fail with ZE_RESULT_ERROR_UNKNOWN. Synchronize the queues that could + # reference this buffer before freeing. + synchronize_all_queues(oneL0.context(buf), oneL0.device(buf)) + free(buf; policy=oneL0.ZE_DRIVER_MEMORY_FREE_POLICY_EXT_FLAG_BLOCKING_FREE) # TODO: queue-ordered free from non-finalizer tasks once we have From be62729ad47134d0e6bf457f755b63f961df72a7 Mon Sep 17 00:00:00 2001 From: Michel Schanen Date: Wed, 10 Jun 2026 20:15:13 +0000 Subject: [PATCH 06/12] Fix GC rooting and SYCL queue lifetime in the oneMKL FFT wrappers plan_fft and friends passed `pointer(lengths)`/`pointer(strides)` to the onemklDft* ccall wrappers. A raw Ptr does not root the vector, so under GC pressure it was collected mid-call and oneMKL read garbage through the dangling pointer, making multi-dimensional plan commits fail nondeterministically (invalid_descriptor_exception or SIGFPE, surfacing as DivideError in the fft testset). Pass the arrays themselves so the ccall roots them. Also create the plan's SYCL queue through the cached task-local sycl_queue() accessor like the other oneMKL wrappers, instead of fresh syclContext/syclQueue objects per plan whose finalizers tear down SYCL runtime state for the still-in-use underlying queue. Harden the free-path queue synchronization against finalized queues: ZeCommandQueue now nulls its handle on destroy, and synchronize_all_queues skips destroyed queues and keeps finalizers disabled until it is done synchronizing. Co-Authored-By: Claude Fable 5 --- lib/level-zero/cmdqueue.jl | 4 ++++ lib/mkl/fft.jl | 32 +++++++++++++++++++------------- src/context.jl | 13 ++++++++----- 3 files changed, 31 insertions(+), 18 deletions(-) diff --git a/lib/level-zero/cmdqueue.jl b/lib/level-zero/cmdqueue.jl index 3004a145..e123038a 100644 --- a/lib/level-zero/cmdqueue.jl +++ b/lib/level-zero/cmdqueue.jl @@ -21,6 +21,10 @@ mutable struct ZeCommandQueue obj = new(handle_ref[], ctx, dev, ordinal) finalizer(obj) do obj zeCommandQueueDestroy(obj) + # mark the queue as destroyed: it can still be weakly reachable (e.g. from + # the queue registry used by `synchronize_all_queues`), and synchronizing a + # destroyed handle crashes in the driver. + obj.handle = ze_command_queue_handle_t(C_NULL) end obj end diff --git a/lib/mkl/fft.jl b/lib/mkl/fft.jl index 5f5614b1..0ccdca46 100644 --- a/lib/mkl/fft.jl +++ b/lib/mkl/fft.jl @@ -95,16 +95,22 @@ function _create_descriptor(sz::NTuple{N,Int}, T::Type, complex::Bool) where {N} desc_ref = Ref{onemklDftDescriptor_t}() # Create descriptor for the full array dimensions lengths = collect(Int64, sz) - st = length(lengths) == 1 ? onemklDftCreate1D(desc_ref, prec, dom, lengths[1]) : onemklDftCreateND(desc_ref, prec, dom, length(lengths), pointer(lengths)) + # NB: pass the arrays themselves to the ccall wrappers (NOT `pointer(...)`) so they + # are rooted for the duration of the call; passing a raw Ptr lets the GC collect the + # vector mid-call, which made MKL read garbage lengths/strides and fail `commit` with + # invalid_descriptor_exception or a SIGFPE, depending on heap reuse. + st = length(lengths) == 1 ? onemklDftCreate1D(desc_ref, prec, dom, lengths[1]) : onemklDftCreateND(desc_ref, prec, dom, length(lengths), lengths) st == 0 || error("onemkl DFT create failed (status $st)") desc = desc_ref[] # Do not program descriptor scaling; we'll perform inverse normalization manually. # Set placement explicitly based on plan type later - # Construct a SYCL queue from current Level Zero context/device (reuse global queue) + # Use the task-local cached SYCL queue wrapping the global Level Zero queue, like + # the other oneMKL wrappers do. Creating fresh syclContext/syclQueue objects per + # plan is unsound: once they become garbage their finalizers (syclQueueDestroy etc.) + # tear down SYCL runtime state for the still-in-use underlying queue, corrupting + # later DFT commits (SIGFPE) and crashing at process exit. ze_ctx = oneAPI.context(); ze_dev = oneAPI.device() - sycl_dev = SYCL.syclDevice(SYCL.syclPlatform(oneAPI.driver()), ze_dev) - sycl_ctx = SYCL.syclContext([sycl_dev], ze_ctx) - q = SYCL.syclQueue(sycl_ctx, sycl_dev, oneAPI.global_queue(ze_ctx, ze_dev)) + q = oneAPI.sycl_queue(oneAPI.global_queue(ze_ctx, ze_dev)) return desc, q end @@ -125,8 +131,8 @@ function plan_fft(X::oneAPI.oneArray{T,N}, region) where {T<:Union{ComplexF32,Co strides[i+1] = prod prod *= size(X,i) end - onemklDftSetValueInt64Array(desc, ONEMKL_DFT_PARAM_FWD_STRIDES, pointer(strides), length(strides)) - onemklDftSetValueInt64Array(desc, ONEMKL_DFT_PARAM_BWD_STRIDES, pointer(strides), length(strides)) + onemklDftSetValueInt64Array(desc, ONEMKL_DFT_PARAM_FWD_STRIDES, strides, length(strides)) + onemklDftSetValueInt64Array(desc, ONEMKL_DFT_PARAM_BWD_STRIDES, strides, length(strides)) end stc = onemklDftCommit(desc, q); stc == 0 || error("commit failed ($stc)") return cMKLFFTPlan{T,MKLFFT_FORWARD,false,N,R,Nothing}(desc,q,size(X),size(X),false,reg,nothing,nothing) @@ -144,8 +150,8 @@ function plan_bfft(X::oneAPI.oneArray{T,N}, region) where {T<:Union{ComplexF32,C @inbounds for i in 1:N strides[i+1]=prod; prod*=size(X,i) end - onemklDftSetValueInt64Array(desc, ONEMKL_DFT_PARAM_FWD_STRIDES, pointer(strides), length(strides)) - onemklDftSetValueInt64Array(desc, ONEMKL_DFT_PARAM_BWD_STRIDES, pointer(strides), length(strides)) + onemklDftSetValueInt64Array(desc, ONEMKL_DFT_PARAM_FWD_STRIDES, strides, length(strides)) + onemklDftSetValueInt64Array(desc, ONEMKL_DFT_PARAM_BWD_STRIDES, strides, length(strides)) end stc = onemklDftCommit(desc, q); stc == 0 || error("commit failed ($stc)") return cMKLFFTPlan{T,MKLFFT_INVERSE,false,N,R,Nothing}(desc,q,size(X),size(X),false,reg,nothing,nothing) @@ -165,8 +171,8 @@ function plan_fft!(X::oneAPI.oneArray{T,N}, region) where {T<:Union{ComplexF32,C @inbounds for i in 1:N strides[i+1]=prod; prod*=size(X,i) end - onemklDftSetValueInt64Array(desc, ONEMKL_DFT_PARAM_FWD_STRIDES, pointer(strides), length(strides)) - onemklDftSetValueInt64Array(desc, ONEMKL_DFT_PARAM_BWD_STRIDES, pointer(strides), length(strides)) + onemklDftSetValueInt64Array(desc, ONEMKL_DFT_PARAM_FWD_STRIDES, strides, length(strides)) + onemklDftSetValueInt64Array(desc, ONEMKL_DFT_PARAM_BWD_STRIDES, strides, length(strides)) end stc = onemklDftCommit(desc, q); stc == 0 || error("commit failed ($stc)") cMKLFFTPlan{T,MKLFFT_FORWARD,true,N,R,Nothing}(desc,q,size(X),size(X),false,reg,nothing,nothing) @@ -184,8 +190,8 @@ function plan_bfft!(X::oneAPI.oneArray{T,N}, region) where {T<:Union{ComplexF32, @inbounds for i in 1:N strides[i+1]=prod; prod*=size(X,i) end - onemklDftSetValueInt64Array(desc, ONEMKL_DFT_PARAM_FWD_STRIDES, pointer(strides), length(strides)) - onemklDftSetValueInt64Array(desc, ONEMKL_DFT_PARAM_BWD_STRIDES, pointer(strides), length(strides)) + onemklDftSetValueInt64Array(desc, ONEMKL_DFT_PARAM_FWD_STRIDES, strides, length(strides)) + onemklDftSetValueInt64Array(desc, ONEMKL_DFT_PARAM_BWD_STRIDES, strides, length(strides)) end stc = onemklDftCommit(desc, q); stc == 0 || error("commit failed ($stc)") cMKLFFTPlan{T,MKLFFT_INVERSE,true,N,R,Nothing}(desc,q,size(X),size(X),false,reg,nothing,nothing) diff --git a/src/context.jl b/src/context.jl index 14f1e593..f340c4fa 100644 --- a/src/context.jl +++ b/src/context.jl @@ -261,19 +261,22 @@ function synchronize_all_queues(ctx::ZeContext, dev::Union{ZeDevice,Nothing}) filter!(refs) do ref queue = ref.value queue === nothing && return false + queue.handle == C_NULL && return false # finalized, handle destroyed push!(queues, queue) true end end end + # synchronize outside the lock: this can block for as long as a kernel runs, + # and finalizers running concurrently also need to take the lock. Keep + # finalizers disabled so none of the collected queues can be destroyed + # between collection and synchronization. + for queue in queues + oneL0.synchronize(queue) + end finally GC.enable_finalizers(true) end - # synchronize outside the lock: this can block for as long as a kernel runs, - # and finalizers running concurrently also need to take the lock. - for queue in queues - oneL0.synchronize(queue) - end return end From 561628575ca3ea8b6c17c20d82d2a1047f7958a1 Mon Sep 17 00:00:00 2001 From: Michel Schanen Date: Fri, 12 Jun 2026 13:46:11 +0000 Subject: [PATCH 07/12] Make host USM resident and harden queue lifetime on the LTS stack Two distinct GPU pagefault crashes on the Aurora LTS NEO stack (25.18), both surfacing as a banned context -> ZE_RESULT_ERROR_UNKNOWN at an innocent later op: 1. Host-buffer residency (the array-test crash). HostBuffer-backed oneArrays were never made resident on the device, unlike DeviceBuffer/SharedBuffer. A kernel reading a non-resident host buffer intermittently takes a NotPresent pagefault under GC/alloc churn. Isolated with a minimal read/noread/readsync loop: the fault is at the GPU *read*, not the free, and is not curable by any synchronization. Fix: make_resident in allocate(::HostBuffer) (src/pool.jl) and for fill!'s host pattern buffer (src/array.jl). 2. Device-buffer free under a dead task's queue. global_queue is task-local, so a test file's task can die with work still in flight; the queue was destroyed without draining and a WeakRef registry hid it from the pre-free synchronize, so a later free raced the in-flight kernel. Fix: queue finalizer drains before destroy (lib/level-zero/cmdqueue.jl); the queue registry holds strong queue refs keyed by a weak owning-task ref and retires drained dead-task queues (src/context.jl). Validated with single-process reproducers and the full array test file (30/30). Co-Authored-By: Claude Opus 4.8 (1M context) --- lib/level-zero/cmdqueue.jl | 6 ++++++ src/array.jl | 7 +++++-- src/context.jl | 40 ++++++++++++++++++++++++++++++-------- src/pool.jl | 10 +++++++++- 4 files changed, 52 insertions(+), 11 deletions(-) diff --git a/lib/level-zero/cmdqueue.jl b/lib/level-zero/cmdqueue.jl index e123038a..bdefeb44 100644 --- a/lib/level-zero/cmdqueue.jl +++ b/lib/level-zero/cmdqueue.jl @@ -20,6 +20,12 @@ mutable struct ZeCommandQueue zeCommandQueueCreate(ctx, dev, desc_ref, handle_ref) obj = new(handle_ref[], ctx, dev, ordinal) finalizer(obj) do obj + # the queue may still have work in flight (nothing requires a task to + # synchronize before dying), and zeCommandQueueDestroy does not wait for + # it: on the LTS NEO stack the still-running work then faults as soon as + # a referenced allocation is freed, getting the context banned. drain the + # queue first; unchecked, as sync on a banned context returns an error. + unchecked_zeCommandQueueSynchronize(obj, typemax(UInt64)) zeCommandQueueDestroy(obj) # mark the queue as destroyed: it can still be weakly reachable (e.g. from # the queue registry used by `synchronize_all_queues`), and synchronizing a diff --git a/src/array.jl b/src/array.jl index 3a8ffddb..0e79dde5 100644 --- a/src/array.jl +++ b/src/array.jl @@ -526,9 +526,12 @@ function Base.fill!(A::oneDenseArray{T}, val) where T val = convert(T, val) sizeof(T) == 0 && return A - # execute! is async, so we need to allocate the pattern in USM memory - # and keep it alive until the operation completes. + # execute! is async, so we need to allocate the pattern in USM memory and keep it alive + # until the operation completes. The fill reads this host buffer on the GPU, so it must + # be made resident on the device — a non-resident host buffer read by a kernel can take + # a NotPresent pagefault on the LTS NEO stack (see `allocate(::HostBuffer, ...)`). buf = oneL0.host_alloc(context(A), sizeof(T), Base.datatype_alignment(T)) + oneL0.make_resident(context(A), device(), buf) unsafe_store!(convert(Ptr{T}, buf), val) unsafe_fill!(context(A), device(), pointer(A), convert(ZePtr{T}, buf), length(A)) synchronize(global_queue(context(A), device())) diff --git a/src/context.jl b/src/context.jl index f340c4fa..9150b367 100644 --- a/src/context.jl +++ b/src/context.jl @@ -231,7 +231,8 @@ function global_queue(ctx::ZeContext, dev::ZeDevice) GC.enable_finalizers(false) try @lock queue_registry_lock begin - push!(get!(Vector{WeakRef}, queue_registry, (ctx, dev)), WeakRef(queue)) + push!(get!(Vector{Tuple{WeakRef,ZeCommandQueue}}, queue_registry, (ctx, dev)), + (WeakRef(current_task()), queue)) end finally GC.enable_finalizers(true) @@ -244,26 +245,37 @@ end # freed from any task (GC finalizers), so `release` needs to be able to find the queues # that may still have work in flight referencing the buffer; queues themselves are # cached task-locally and would otherwise be unreachable from the finalizing task. +# +# Entries reference the queue *strongly*: the GC clears WeakRefs to a dead queue in the +# same cycle that queues its finalizer, i.e., before the finalizer runs, so a WeakRef +# would hide the queue from `release` exactly when its in-flight work still references +# buffers about to be freed. The owning task is tracked weakly instead: queues are +# task-local, so once their task is dead no new work can reach them, and the entry can +# be dropped (allowing the queue to be finalized) after a final synchronize. const queue_registry_lock = ReentrantLock() -const queue_registry = Dict{Tuple{ZeContext,ZeDevice},Vector{WeakRef}}() +const queue_registry = Dict{Tuple{ZeContext,ZeDevice},Vector{Tuple{WeakRef,ZeCommandQueue}}}() # synchronize all known queues that target the given context (and device, if specified), # i.e., all queues whose in-flight work could possibly reference an allocation that is # about to be freed. function synchronize_all_queues(ctx::ZeContext, dev::Union{ZeDevice,Nothing}) queues = ZeCommandQueue[] + stale = Tuple{WeakRef,ZeCommandQueue}[] GC.enable_finalizers(false) try @lock queue_registry_lock begin - for ((qctx, qdev), refs) in queue_registry + for ((qctx, qdev), entries) in queue_registry qctx == ctx || continue (dev === nothing || qdev == dev) || continue - filter!(refs) do ref - queue = ref.value - queue === nothing && return false - queue.handle == C_NULL && return false # finalized, handle destroyed + for entry in entries + (task, queue) = entry + queue.handle == C_NULL && continue # finalized, handle destroyed push!(queues, queue) - true + # entries whose task was already dead at this point cannot + # receive new work, so they are safe to retire after the sync + if task.value === nothing || istaskdone(task.value::Task) + push!(stale, entry) + end end end end @@ -274,6 +286,18 @@ function synchronize_all_queues(ctx::ZeContext, dev::Union{ZeDevice,Nothing}) for queue in queues oneL0.synchronize(queue) end + # retire drained queues of dead tasks, allowing them to be finalized (the + # finalizer synchronizes once more before destroying the queue, in case + # the queue is dropped through other means). + if !isempty(stale) + @lock queue_registry_lock begin + for ((qctx, qdev), entries) in queue_registry + qctx == ctx || continue + (dev === nothing || qdev == dev) || continue + filter!(entry -> !any(s -> s === entry, stale), entries) + end + end + end finally GC.enable_finalizers(true) end diff --git a/src/pool.jl b/src/pool.jl index e541d26e..21f1a822 100644 --- a/src/pool.jl +++ b/src/pool.jl @@ -55,7 +55,15 @@ end function allocate(::Type{oneL0.HostBuffer}, ctx, dev, bytes::Int, alignment::Int) bytes == 0 && return oneL0.HostBuffer(ZE_NULL, bytes, ctx) - host_alloc(ctx, bytes, alignment) + buf = host_alloc(ctx, bytes, alignment) + # Host USM must be made resident on the device, exactly like device/shared + # allocations. On the Aurora LTS NEO stack (25.18), a GPU kernel that reads a + # non-resident host buffer intermittently takes a NotPresent pagefault (banning the + # context), even though host USM is nominally accessible — see `repro_host_minimal.jl` + # (a kernel reading a host-backed array under GC churn faults; the same pattern with + # device/shared buffers, which are already made resident, does not). + make_resident(ctx, dev, buf) + return buf end function release(buf::oneL0.AbstractBuffer) From 3fca61160740902fe0f0b8c48250346c4a9079c4 Mon Sep 17 00:00:00 2001 From: Michel Schanen Date: Fri, 12 Jun 2026 14:00:34 +0000 Subject: [PATCH 08/12] Add opt-in per-submission synchronize for the LTS dropped-tail workaround Under heavy multi-process oversubscription of a single tile on the Aurora LTS NEO stack, a whole-queue zeCommandQueueSynchronize does not reliably retire the tail of an earlier separately-submitted command list, producing silent "dropped tail" corruption (e.g. the gpuarrays/broadcasting `A .* ET(10)` mismatch; see ISSUE_dropped_tail.md). Synchronizing after every submission eliminates it. This is off by default (it costs ~3x throughput) and enabled with ONEAPI_SYNC_EACH_SUBMISSION=1, at the single submission chokepoint execute!(queue, lists, fence). Co-Authored-By: Claude Opus 4.8 (1M context) --- lib/level-zero/cmdlist.jl | 17 +++++++++++++++-- lib/level-zero/oneL0.jl | 1 + 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/lib/level-zero/cmdlist.jl b/lib/level-zero/cmdlist.jl index 133d2551..24d1d3a5 100644 --- a/lib/level-zero/cmdlist.jl +++ b/lib/level-zero/cmdlist.jl @@ -47,8 +47,21 @@ function ZeCommandList(f::Base.Callable, args...; kwargs...) return list end -execute!(queue::ZeCommandQueue, lists::Vector{ZeCommandList}, fence=nothing) = - zeCommandQueueExecuteCommandLists(queue, length(lists), lists, something(fence, C_NULL)) +# Opt-in workaround for the Aurora LTS NEO stack (set ONEAPI_SYNC_EACH_SUBMISSION=1). +# Under heavy multi-process oversubscription of a single tile, a whole-queue +# `zeCommandQueueSynchronize` does not reliably retire the tail of an earlier, +# separately-submitted command list — producing silent "dropped tail" corruption (the +# last work-item of a kernel / last element of a copy is missing). See +# ISSUE_dropped_tail.md. Synchronizing after *every* submission eliminates it, at a large +# throughput cost (~3x), so it is off by default and only enabled when correctness under +# oversubscription matters more than speed. +const sync_each_submission = Ref{Bool}(false) + +function execute!(queue::ZeCommandQueue, lists::Vector{ZeCommandList}, fence=nothing) + r = zeCommandQueueExecuteCommandLists(queue, length(lists), lists, something(fence, C_NULL)) + sync_each_submission[] && synchronize(queue) + return r +end """ execute!(queue::ZeCommandQueue, ...) do list diff --git a/lib/level-zero/oneL0.jl b/lib/level-zero/oneL0.jl index 6bcb17fb..e80c8939 100644 --- a/lib/level-zero/oneL0.jl +++ b/lib/level-zero/oneL0.jl @@ -195,6 +195,7 @@ function __init__() validation_layer[] = parse(Bool, get(ENV, "ZE_ENABLE_VALIDATION_LAYER", "false")) parameter_validation[] = parse(Bool, get(ENV, "ZE_ENABLE_PARAMETER_VALIDATION", "false")) + sync_each_submission[] = lowercase(get(ENV, "ONEAPI_SYNC_EACH_SUBMISSION", "")) in ("1", "true", "yes") end end From 991d29e0f3b19e5ecc9bb49c794d194d127782dd Mon Sep 17 00:00:00 2001 From: Michel Schanen Date: Fri, 12 Jun 2026 14:23:18 +0000 Subject: [PATCH 09/12] Add opt-in GPU spreading for the parallel test suite MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ONEAPI_TEST_SPREAD_GPUS=1 pins each test worker process to a distinct GPU via ZE_AFFINITY_MASK (claimed round-robin through an atomic mkdir counter, set before `using oneAPI` so the Level Zero driver picks it up at init). This spreads the suite across all tiles instead of oversubscribing device 0. device() is task-local and Malt runs each test in a fresh task, so a device! in init_worker_code would not stick — process-level pinning is the robust approach. Default (unset) keeps every worker on the first device, preserving single-tile oversubscription which is useful for surfacing contention bugs. Verified: 6 concurrent claimers -> 6 distinct device UUIDs; real harness with --jobs=4 spreads cleanly (SUCCESS). Co-Authored-By: Claude Opus 4.8 (1M context) --- test/runtests.jl | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index b7015787..27b97776 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -24,7 +24,45 @@ end args = parse_args(ARGS) +# Optional: spread test workers across all available GPUs, one worker per device +# (round-robin), by pinning each worker *process* to a device with ZE_AFFINITY_MASK. +# `device()` is task-local and Malt runs each test in a fresh task, so a `device!` in +# `init_worker_code` would not stick — pinning the process via the driver is the robust +# way to make every task on a worker use the same GPU. +# +# Enabled with ONEAPI_TEST_SPREAD_GPUS=1. When unset (the default) every worker stays on +# the first device, which oversubscribes a single tile — useful for surfacing +# contention/oversubscription bugs. +const spread_gpus = lowercase(get(ENV, "ONEAPI_TEST_SPREAD_GPUS", "")) in ("1", "true", "yes") +worker_env = Vector{Pair{String, String}}() +device_claim_code = :() +if spread_gpus + ndev = length(oneAPI.devices()) + # shared, node-local directory used as an atomic round-robin counter (mkdir is atomic) + devdir = mktempdir(; prefix = "oneapi_test_gpus_") + push!(worker_env, "ONEAPI_TEST_DEVDIR" => devdir) + push!(worker_env, "ONEAPI_TEST_NDEV" => string(ndev)) + @info "Spreading test workers across $ndev GPU(s) via ZE_AFFINITY_MASK (ONEAPI_TEST_SPREAD_GPUS=1)" + # NOTE: runs on the worker as the very first thing, before `using oneAPI` — so the + # Level Zero driver picks up ZE_AFFINITY_MASK at init and the process sees only its tile. + device_claim_code = quote + let dir = ENV["ONEAPI_TEST_DEVDIR"], ndev = parse(Int, ENV["ONEAPI_TEST_NDEV"]) + i = 0 + while true + try + mkdir(joinpath(dir, string(i))) + break + catch + i += 1 + end + end + ENV["ZE_AFFINITY_MASK"] = string(i % ndev) + end + end +end + init_worker_code = quote + $device_claim_code using oneAPI, Adapt import GPUArrays @@ -105,4 +143,4 @@ init_code = quote ..@grab_output, ..@on_device, ..sink end -runtests(oneAPI, args; testsuite, init_code, init_worker_code) +runtests(oneAPI, args; testsuite, init_code, init_worker_code, env = worker_env) From d78ef78fa52237952c5d005224bf93fc179f0db4 Mon Sep 17 00:00:00 2001 From: Michel Schanen Date: Fri, 12 Jun 2026 14:43:25 +0000 Subject: [PATCH 10/12] Opt into sync each sub on GH self-runner --- .github/workflows/ci.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5bd87406..d4d9f8dd 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -15,6 +15,12 @@ jobs: continue-on-error: true env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Pin each parallel test worker to a distinct GPU tile instead of + # oversubscribing device 0 (see test/runtests.jl). + ONEAPI_TEST_SPREAD_GPUS: '1' + # Synchronize after every command-list submission to work around the + # Aurora LTS NEO dropped-tail corruption (see lib/level-zero/cmdlist.jl). + ONEAPI_SYNC_EACH_SUBMISSION: '1' runs-on: [self-hosted, linux, X64] strategy: matrix: From 3229d8f5bc1352d90dacc0e02932895780a883ac Mon Sep 17 00:00:00 2001 From: Michel Schanen Date: Tue, 16 Jun 2026 15:10:12 +0000 Subject: [PATCH 11/12] Fix LTS-stack silent corruption in reductions and workgroup barriers Two distinct correctness bugs surfaced as intermittent KernelAbstractions example failures (naive_transpose, histogram) on the Aurora LTS NEO/IGC stack: - mapreducedim! over a strided/transposed input (e.g. `a == transpose(b)`, `sum(transpose(x))`, `isequal`, `ishermitian`) hit the LTS IGC miscompile of non-coalesced global loads, silently corrupting the result. Detect non-contiguous inputs and materialize them to a dense array before reducing, so every global read is coalesced. (src/mapreduce.jl) - KA `@synchronize` lowered to `barrier(0)`, emitting OpControlBarrier with SequentiallyConsistent but WITHOUT the WorkgroupMemory storage-class bit, which orders no memory per the SPIR-V spec. On the LTS stack shared-local writes were not made visible across the barrier, dropping updates (the histogram example's local-atomic accumulation lost counts). Fence local+global memory in `__synchronize`, and local memory in the mapreduce reduce_group SLM tree. (src/oneAPIKernels.jl, src/mapreduce.jl) Validated on the LTS stack: kernelabstractions 2218 pass / 0 fail; gpuarrays reductions+statistics 2962/2962; histogram 0 failures over 1100 iters; strided ==/sum/isequal/ishermitian correct. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/mapreduce.jl | 27 ++++++++++++++++++++++++++- src/oneAPIKernels.jl | 10 +++++++++- 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/src/mapreduce.jl b/src/mapreduce.jl index 28725c84..b2d28ef8 100644 --- a/src/mapreduce.jl +++ b/src/mapreduce.jl @@ -33,7 +33,12 @@ # perform a reduction d = 1 while d < items - barrier(0) + # LOCAL_MEM_FENCE: `barrier(0)` lowers to an OpControlBarrier without the + # WorkgroupMemory storage-class bit, which on the NEO/IGC LTS stack does not order the + # shared-local tree accesses across the barrier (proven by the histogram example's lost + # local-atomic updates). Fence local memory so each tree step sees the previous step's + # `shared[]` writes. See ISSUE_mapreduce_corruption.md. + barrier(SPIRVIntrinsics.LOCAL_MEM_FENCE) index = 2 * d * (item-1) + 1 @inbounds if index <= items other_val = if index + d <= items @@ -133,12 +138,32 @@ end ## COV_EXCL_STOP +# Aurora LTS workaround: the NEO/IGC LTS stack miscompiles *strided* (non-coalesced) global +# reads inside the reduction kernel, silently corrupting results whenever an input is read +# along a non-contiguous axis (e.g. `a == transpose(b)`, `sum(transpose(x))`, `ishermitian`). +# Elementwise copies are NOT affected. `_dense_reduce_input` returns false for any input that +# reads non-contiguous memory (a transposed/permuted/strided view, or a broadcast containing +# one), so such inputs get materialized to a dense `oneArray` before reducing. See +# ISSUE_mapreduce_corruption.md and the `naive_transpose` (`a == transpose(b)`) failure. +@inline _dense_reduce_input(::oneArray) = true +@inline _dense_reduce_input(x::Base.ReshapedArray) = _dense_reduce_input(parent(x)) +@inline _dense_reduce_input(::AbstractArray) = false # Transpose/Adjoint/PermutedDims/SubArray/… +@inline _dense_reduce_input(::Any) = true # scalars/Refs/tuples carried in a broadcast +@inline _dense_reduce_input(bc::Broadcast.Broadcasted) = all(_dense_reduce_input, bc.args) + function GPUArrays.mapreducedim!(f::F, op::OP, R::oneWrappedArray{T}, A::Union{AbstractArray,Broadcast.Broadcasted}; init=nothing) where {F, OP, T} Base.check_reducedims(R, A) length(A) == 0 && return R # isempty(::Broadcasted) iterates + # Aurora LTS workaround (see `_dense_reduce_input` above): materialize strided inputs to a + # dense array first so every global read in the reduction kernel is coalesced. + if !_dense_reduce_input(A) + Acontig = Broadcast.materialize(Broadcast.broadcasted(f, A)) + return GPUArrays.mapreducedim!(identity, op, R, Acontig; init=init) + end + # add singleton dimensions to the output container, if needed if ndims(R) < ndims(A) dims = Base.fill_to_length(size(R), 1, Val(ndims(A))) diff --git a/src/oneAPIKernels.jl b/src/oneAPIKernels.jl index 6e092397..138b4f8c 100644 --- a/src/oneAPIKernels.jl +++ b/src/oneAPIKernels.jl @@ -214,7 +214,15 @@ end ## Synchronization and Printing @device_override @inline function KA.__synchronize() - barrier(0) + # Fence both local and global memory across the workgroup barrier, matching CUDA + # `__syncthreads` semantics. `barrier(0)` lowers to `OpControlBarrier` with + # `SequentiallyConsistent` but WITHOUT the `WorkgroupMemory` storage-class bit, which the + # SPIR-V spec treats as ordering *no* memory — so on the NEO/IGC LTS stack shared-local + # writes (e.g. the histogram example's local-atomic accumulation) are not made visible + # before the next phase reads them, silently dropping updates. Passing + # `LOCAL_MEM_FENCE | GLOBAL_MEM_FENCE` ORs in the WorkgroupMemory/CrossWorkgroupMemory + # fence bits. See ISSUE_mapreduce_corruption.md (the barrier WorkgroupMemory-bit regression). + barrier(SPIRVIntrinsics.LOCAL_MEM_FENCE | SPIRVIntrinsics.GLOBAL_MEM_FENCE) end @device_override @inline function KA.__print(args...) From 32d11d71ef7faf9e895d784af6ed7b3a8e283771 Mon Sep 17 00:00:00 2001 From: Michel Schanen Date: Tue, 16 Jun 2026 16:10:12 +0000 Subject: [PATCH 12/12] CI: disable AVX512-FP16 host codegen on the Aurora LTS self-runner Under concurrent oneMKL load on the Aurora Sapphire Rapids nodes, Julia's native AVX512-FP16 host codegen silently miscomputes Float16 (e.g. the GPUArrays `A .* B .+ c` broadcast reference used by gpuarrays/broadcasting), failing tests even though the GPU result is correct. It is single-process-clean, MXCSR-clean, and only the native-FP16 path is affected (the Float32 path is immune). Run the test suite with `-C native,-avx512fp16`, which routes Float16 through Float32 and propagates to the Pkg.test subprocess and its parallel Malt workers via Base.julia_cmd(). `julia-actions/julia-runtest` cannot pass a cpu-target, so invoke `Pkg.test()` directly. A/B under the reproducer: native FP16 -> ~6% of checks corrupt; -avx512fp16 -> 0 / 35000. Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/workflows/ci.yml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d4d9f8dd..d3c18c65 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -36,5 +36,13 @@ jobs: - uses: julia-actions/cache@v3 - uses: julia-actions/julia-buildpkg@latest continue-on-error: true - - uses: julia-actions/julia-runtest@latest + # Disable AVX512-FP16 host codegen on the Aurora Sapphire Rapids nodes. Under concurrent + # oneMKL load the native AVX512-FP16 path silently miscomputes *host* Float16 (e.g. the + # GPUArrays `A .* B .+ c` broadcast reference), failing tests even though the GPU result + # is correct (single-process clean; MXCSR clean; only the native-FP16 path, not Float32). + # `-C native,-avx512fp16` routes Float16 through Float32 and propagates to the Pkg.test + # subprocess and its parallel workers via Base.julia_cmd(). `julia-runtest` cannot pass a + # cpu-target, so invoke Pkg.test() directly. See repro_bcast_mkl.jl. + - name: Run tests (AVX512-FP16 disabled) continue-on-error: true + run: julia -C "native,-avx512fp16" --color=yes --project=. -e 'import Pkg; Pkg.test()'