diff --git a/CHANGELOG.md b/CHANGELOG.md index a9dc748fe34c..842f5035caa5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,6 +28,7 @@ This release is compatible with NumPy 2.5. * Allowed `dpnp.take` and `dpnp.compress` to cast the result into an `out` array of a different but same-kind dtype [#2959](https://github.com/IntelPython/dpnp/pull/2959) * Clarified the summary in `dpnp.reshape` and `dpnp.ndarray.reshape` docstrings [#2964](https://github.com/IntelPython/dpnp/pull/2964) * Changed `dpnp.atleast_1d`, `dpnp.atleast_2d`, `dpnp.atleast_3d`, and `dpnp.ogrid` to return a tuple of arrays instead of a list [#2965](https://github.com/IntelPython/dpnp/pull/2965) +* Improved performance of `dpnp.fft` functions for complex strided input by avoiding oversized allocations and extra copies [#2939](https://github.com/IntelPython/dpnp/pull/2939) ### Deprecated diff --git a/dpnp/fft/dpnp_utils_fft.py b/dpnp/fft/dpnp_utils_fft.py index 733436ab9887..6d4ffc9ed12b 100644 --- a/dpnp/fft/dpnp_utils_fft.py +++ b/dpnp/fft/dpnp_utils_fft.py @@ -408,12 +408,28 @@ def _fft(a, norm, out, forward, in_place, c2c, axes, batch_fft=True): a = dpnp.reshape(a, local_shape) index = 1 + if not a.flags.c_contiguous: # cuFFT requires input arrays to be C-contiguous (row-major) # for correct execution - if ( - dpnp.is_cuda_backend(a) and not a.flags.c_contiguous - ): # pragma: no cover + if dpnp.is_cuda_backend(a): # pragma: no cover a = dpnp.ascontiguousarray(a) + else: + # Check if the memory footprint of the strides exceeds + # the number of elements. + # If so, copy to contiguous to avoid oversized allocation + # for the output array and unnecessary copy to contiguous + # after oneMKL FFT + # Max element displacement reachable by positive strides. + # Negative strides are handled by _copy_array; + # zero strides are safely ignored as they reuse the same + # memory location and don't extend the footprint + max_disp = sum( + st * (sh - 1) + for st, sh in zip(dpnp.get_usm_ndarray(a).strides, a.shape) + if st > 0 + ) + if (max_disp + 1) > a.size: + a = dpnp.ascontiguousarray(a) # w/a for cuFFT to avoid "Invalid strides" error when # the last dimension is 1 and there are multiple axes @@ -424,8 +440,9 @@ def _fft(a, norm, out, forward, in_place, c2c, axes, batch_fft=True): if cufft_wa: # pragma: no cover a = dpnp.moveaxis(a, -1, -2) - strides = dpnp.get_usm_ndarray(a).strides - a_strides = _standardize_strides_to_nonzero(strides, a.shape) + a_strides = _standardize_strides_to_nonzero( + dpnp.get_usm_ndarray(a).strides, a.shape + ) dsc, out_strides = _commit_descriptor( a, forward, in_place, c2c, a_strides, index, batch_fft ) diff --git a/dpnp/tests/test_fft.py b/dpnp/tests/test_fft.py index f8cc95a7a3ca..b86697147244 100644 --- a/dpnp/tests/test_fft.py +++ b/dpnp/tests/test_fft.py @@ -234,6 +234,26 @@ def test_strided_2d(self, stride_x, stride_y): expected = numpy.fft.fft(a) assert_dtype_allclose(result, expected) + def test_non_contiguous_no_copy(self): + a = generate_random_numpy_array((4, 5, 6), dtype=numpy.complex64) + # Non-contiguous input with compact footprint (no copy needed) + ia = dpnp.moveaxis(dpnp.array(a), 0, -1) + a_np = dpnp.asnumpy(ia) + + result = dpnp.fft.fft(ia) + expected = numpy.fft.fft(a_np) + assert_dtype_allclose(result, expected) + + @pytest.mark.parametrize("slc", [numpy.s_[::2, :], numpy.s_[:, ::3]]) + def test_non_contiguous_with_copy(self, slc): + # Strided input with oversized footprint (triggers copy) + a = generate_random_numpy_array((10, 12), dtype=numpy.complex64) + ia = dpnp.array(a)[slc] + + result = dpnp.fft.fft(ia) + expected = numpy.fft.fft(a[slc]) + assert_dtype_allclose(result, expected) + def test_empty_array(self): a = numpy.empty((10, 0, 4), dtype=numpy.complex64) ia = dpnp.array(a)