IntelPython · vlad-perevezentsev · Jun 26, 2026 · Jun 1, 2026 · Jun 1, 2026 · Jun 1, 2026
@@ -28,6 +28,7 @@ This release is compatible with NumPy 2.5.
 * Allowed `dpnp.take` and `dpnp.compress` to cast the result into an `out` array of a different but same-kind dtype [#2959](https://github.com/IntelPython/dpnp/pull/2959)
 * Clarified the summary in `dpnp.reshape` and `dpnp.ndarray.reshape` docstrings [#2964](https://github.com/IntelPython/dpnp/pull/2964)
 * Changed `dpnp.atleast_1d`, `dpnp.atleast_2d`, `dpnp.atleast_3d`, and `dpnp.ogrid` to return a tuple of arrays instead of a list [#2965](https://github.com/IntelPython/dpnp/pull/2965)
+* Improved performance of `dpnp.fft` functions for complex strided input by avoiding oversized allocations and extra copies [#2939](https://github.com/IntelPython/dpnp/pull/2939)
 
 ### Deprecated
 

@@ -408,12 +408,28 @@ def _fft(a, norm, out, forward, in_place, c2c, axes, batch_fft=True):
         a = dpnp.reshape(a, local_shape)
         index = 1
 
+    if not a.flags.c_contiguous:
         # cuFFT requires input arrays to be C-contiguous (row-major)
         # for correct execution
-        if (
-            dpnp.is_cuda_backend(a) and not a.flags.c_contiguous
-        ):  # pragma: no cover
+        if dpnp.is_cuda_backend(a):  # pragma: no cover
             a = dpnp.ascontiguousarray(a)
+        else:
+            # Check whether the memory span addressed by the strides
+            # exceeds the number of elements.
+            # If so, copy to a contiguous array to avoid an oversized
+            # allocation for the output array and an unnecessary copy
+            # to contiguous after the oneMKL FFT.
+            # max_disp is the largest element offset reachable through
+            # positive strides. Negative strides are handled by
+            # _copy_array; zero strides are ignored because they reuse
+            # the same memory location and don't extend the footprint
+            max_disp = sum(
+                st * (sh - 1)
+                for st, sh in zip(dpnp.get_usm_ndarray(a).strides, a.shape)
+                if st > 0
+            )
+            if (max_disp + 1) > a.size:
+                a = dpnp.ascontiguousarray(a)
 
     # w/a for cuFFT to avoid "Invalid strides" error when
     # the last dimension is 1 and there are multiple axes
@@ -424,8 +440,9 @@ def _fft(a, norm, out, forward, in_place, c2c, axes, batch_fft=True):
     if cufft_wa:  # pragma: no cover
         a = dpnp.moveaxis(a, -1, -2)
 
-    strides = dpnp.get_usm_ndarray(a).strides
-    a_strides = _standardize_strides_to_nonzero(strides, a.shape)
+    a_strides = _standardize_strides_to_nonzero(
+        dpnp.get_usm_ndarray(a).strides, a.shape
+    )
     dsc, out_strides = _commit_descriptor(
         a, forward, in_place, c2c, a_strides, index, batch_fft
     )

@@ -234,6 +234,26 @@ def test_strided_2d(self, stride_x, stride_y):
         expected = numpy.fft.fft(a)
         assert_dtype_allclose(result, expected)
 
+    def test_non_contiguous_no_copy(self):
+        a = generate_random_numpy_array((4, 5, 6), dtype=numpy.complex64)
+        # Non-contiguous input with compact footprint (no copy needed)
+        ia = dpnp.moveaxis(dpnp.array(a), 0, -1)
+        a_np = dpnp.asnumpy(ia)
+
+        result = dpnp.fft.fft(ia)
+        expected = numpy.fft.fft(a_np)
+        assert_dtype_allclose(result, expected)
+
+    @pytest.mark.parametrize("slc", [numpy.s_[::2, :], numpy.s_[:, ::3]])
+    def test_non_contiguous_with_copy(self, slc):
+        # Strided input with oversized footprint (triggers copy)
+        a = generate_random_numpy_array((10, 12), dtype=numpy.complex64)
+        ia = dpnp.array(a)[slc]
+
+        result = dpnp.fft.fft(ia)
+        expected = numpy.fft.fft(a[slc])
+        assert_dtype_allclose(result, expected)
+
     def test_empty_array(self):
         a = numpy.empty((10, 0, 4), dtype=numpy.complex64)
         ia = dpnp.array(a)