diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7bd7e4b..544df00 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,8 +11,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Changed
 * Replaced `fwd_scale` parameter with `norm` in `mkl_fft` [gh-189](https://github.com/IntelPython/mkl_fft/pull/189)
+* Dropped support for `scipy.fftpack` interface [gh-185](https://github.com/IntelPython/mkl_fft/pull/185)
+* Dropped support for `overwrite_x` parameter in `mkl_fft` [gh-185](https://github.com/IntelPython/mkl_fft/pull/185)
 
 ### Fixed
+* Fixed a bug for N-D FFTs when both `s` and `out` are given [gh-185](https://github.com/IntelPython/mkl_fft/pull/185)
+* Fixed a bug for a case when a repeated indices is passed for axes keyword in N-dimensional FFT [gh-215](https://github.com/IntelPython/mkl_fft/pull/215)
 
 ## [2.0.0] - 2025-06-03
 
@@ -27,8 +31,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 * SciPy interface `mkl_fft.interfaces.scipy_fft` uses the same function from SciPy for handling `s` and `axes` for N-D FFTs [gh-181](https://github.com/IntelPython/mkl_fft/pull/181)
 
 ### Fixed
-* Fixed an issue for calling `mkl_fft.interfaces.numpy.fftn` with an empty axes [gh-139](https://github.com/IntelPython/mkl_fft/pull/139)
-* Fixed an issue for calling `mkl_fft.interfaces.numpy.fftn` with a zero-size array [gh-139](https://github.com/IntelPython/mkl_fft/pull/139)
+* Fixed a bug in `mkl_fft.interfaces.numpy.fftn` when an empty tuple is passed for `axes` [gh-139](https://github.com/IntelPython/mkl_fft/pull/139)
+* Fixed a bug for a case when a zero-size array is passed to `mkl_fft.interfaces.numpy.fftn` [gh-139](https://github.com/IntelPython/mkl_fft/pull/139)
 * Fixed inconsistency of input and output arrays dtype for `irfft` function [gh-180](https://github.com/IntelPython/mkl_fft/pull/180)
 * Fixed issues with `set_workers` function in SciPy interface `mkl_fft.interfaces.scipy_fft` [gh-183](https://github.com/IntelPython/mkl_fft/pull/183)
 
diff --git a/README.md b/README.md
index ddbb389..769aed7 100644
--- a/README.md
+++ b/README.md
@@ -51,11 +51,11 @@ While using the interfaces module is the recommended way to leverage `mk_fft`, o
 
 ### complex-to-complex (c2c) transforms:
 
-`fft(x, n=None, axis=-1, overwrite_x=False, norm=None, out=None)` - 1D FFT, similar to `scipy.fft.fft`
+`fft(x, n=None, axis=-1, norm=None, out=None)` - 1D FFT, similar to `scipy.fft.fft`
 
-`fft2(x, s=None, axes=(-2, -1), overwrite_x=False, norm=None, out=None)` - 2D FFT, similar to `scipy.fft.fft2`
+`fft2(x, s=None, axes=(-2, -1), norm=None, out=None)` - 2D FFT, similar to `scipy.fft.fft2`
 
-`fftn(x, s=None, axes=None, overwrite_x=False, norm=None, out=None)` - ND FFT, similar to `scipy.fft.fftn`
+`fftn(x, s=None, axes=None, norm=None, out=None)` - ND FFT, similar to `scipy.fft.fftn`
 
 and similar inverse FFT (`ifft*`) functions.
 
diff --git a/mkl_fft/__init__.py b/mkl_fft/__init__.py
index 5cfada7..04586ea 100644
--- a/mkl_fft/__init__.py
+++ b/mkl_fft/__init__.py
@@ -39,7 +39,6 @@
     rfft2,
     rfftn,
 )
-from ._pydfti import irfftpack, rfftpack  # pylint: disable=no-name-in-module
 from ._version import __version__
 
 import mkl_fft.interfaces  # isort: skip
@@ -51,8 +50,6 @@
     "ifft2",
     "fftn",
     "ifftn",
-    "rfftpack",
-    "irfftpack",
     "rfft",
     "irfft",
     "rfft2",
diff --git a/mkl_fft/_fft_utils.py b/mkl_fft/_fft_utils.py
index 4a1f1fc..93ba03c 100644
--- a/mkl_fft/_fft_utils.py
+++ b/mkl_fft/_fft_utils.py
@@ -44,23 +44,19 @@ def _check_norm(norm):
         )
 
 
-def _check_shapes_for_direct(xs, shape, axes):
+def _check_shapes_for_direct(s, shape, axes):
     if len(axes) > 7:  # Intel MKL supports up to 7D
         return False
-    if not (len(xs) == len(shape)):
-        # full-dimensional transform
+    if len(s) != len(shape):
+        # not a full-dimensional transform
         return False
-    if not (len(set(axes)) == len(axes)):
+    if len(set(axes)) != len(axes):
         # repeated axes
         return False
-    for xsi, ai in zip(xs, axes):
-        try:
-            sh_ai = shape[ai]
-        except IndexError:
-            raise ValueError("Invalid axis (%d) specified" % ai)
-
-        if not (xsi == sh_ai):
-            return False
+    new_shape = tuple(shape[ax] for ax in axes)
+    if tuple(s) != new_shape:
+        # trimming or padding is needed
+        return False
     return True
 
 
@@ -78,30 +74,6 @@ def _compute_fwd_scale(norm, n, shape):
         return np.sqrt(fsc)
 
 
-def _cook_nd_args(a, s=None, axes=None, invreal=False):
-    if s is None:
-        shapeless = True
-        if axes is None:
-            s = list(a.shape)
-        else:
-            try:
-                s = [a.shape[i] for i in axes]
-            except IndexError:
-                # fake s designed to trip the ValueError further down
-                s = range(len(axes) + 1)
-                pass
-    else:
-        shapeless = False
-    s = list(s)
-    if axes is None:
-        axes = list(range(-len(s), 0))
-    if len(s) != len(axes):
-        raise ValueError("Shape and axes have different lengths.")
-    if invreal and shapeless:
-        s[-1] = (a.shape[axes[-1]] - 1) * 2
-    return s, axes
-
-
 # copied from scipy.fft module
 # https://github.com/scipy/scipy/blob/main/scipy/fft/_pocketfft/helper.py
 def _datacopied(arr, original):
@@ -129,89 +101,7 @@ def _flat_to_multi(ind, shape):
     return m_ind
 
 
-# copied from scipy.fftpack.helper
-def _init_nd_shape_and_axes(x, shape, axes):
-    """Handle shape and axes arguments for n-dimensional transforms.
-    Returns the shape and axes in a standard form, taking into account negative
-    values and checking for various potential errors.
-    Parameters
-    ----------
-    x : array_like
-        The input array.
-    shape : int or array_like of ints or None
-        The shape of the result.  If both `shape` and `axes` (see below) are
-        None, `shape` is ``x.shape``; if `shape` is None but `axes` is
-        not None, then `shape` is ``scipy.take(x.shape, axes, axis=0)``.
-        If `shape` is -1, the size of the corresponding dimension of `x` is
-        used.
-    axes : int or array_like of ints or None
-        Axes along which the calculation is computed.
-        The default is over all axes.
-        Negative indices are automatically converted to their positive
-        counterpart.
-    Returns
-    -------
-    shape : array
-        The shape of the result. It is a 1D integer array.
-    axes : array
-        The shape of the result. It is a 1D integer array.
-    """
-    x = np.asarray(x)
-    noshape = shape is None
-    noaxes = axes is None
-
-    if noaxes:
-        axes = np.arange(x.ndim, dtype=np.intc)
-    else:
-        axes = np.atleast_1d(axes)
-
-    if axes.size == 0:
-        axes = axes.astype(np.intc)
-
-    if not axes.ndim == 1:
-        raise ValueError("when given, axes values must be a scalar or vector")
-    if not np.issubdtype(axes.dtype, np.integer):
-        raise ValueError("when given, axes values must be integers")
-
-    axes = np.where(axes < 0, axes + x.ndim, axes)
-
-    if axes.size != 0 and (axes.max() >= x.ndim or axes.min() < 0):
-        raise ValueError("axes exceeds dimensionality of input")
-    if axes.size != 0 and np.unique(axes).shape != axes.shape:
-        raise ValueError("all axes must be unique")
-
-    if not noshape:
-        shape = np.atleast_1d(shape)
-    elif np.isscalar(x):
-        shape = np.array([], dtype=np.intc)
-    elif noaxes:
-        shape = np.array(x.shape, dtype=np.intc)
-    else:
-        shape = np.take(x.shape, axes)
-
-    if shape.size == 0:
-        shape = shape.astype(np.intc)
-
-    if shape.ndim != 1:
-        raise ValueError("when given, shape values must be a scalar or vector")
-    if not np.issubdtype(shape.dtype, np.integer):
-        raise ValueError("when given, shape values must be integers")
-    if axes.shape != shape.shape:
-        raise ValueError(
-            "when given, axes and shape arguments have to be of the same length"
-        )
-
-    shape = np.where(shape == -1, np.array(x.shape)[axes], shape)
-    if shape.size != 0 and (shape < 1).any():
-        raise ValueError(f"invalid number of data points ({shape}) specified")
-
-    return shape, axes
-
-
 def _iter_complementary(x, axes, func, kwargs, result):
-    if axes is None:
-        # s and axes are None, direct N-D FFT
-        return func(x, **kwargs, out=result)
     x_shape = x.shape
     nd = x.ndim
     r = list(range(nd))
@@ -258,23 +148,40 @@ def _iter_fftnd(
     axes=None,
     out=None,
     direction=+1,
-    overwrite_x=False,
-    scale_function=lambda n, ind: 1.0,
+    scale_function=lambda ind: 1.0,
 ):
-    a = np.asarray(a)
-    s, axes = _init_nd_shape_and_axes(a, s, axes)
-    ovwr = overwrite_x
-    for ii in reversed(range(len(axes))):
+    # Combine the two, but in reverse, to end with the first axis given.
+    axes_and_s = list(zip(axes, s))[::-1]
+    # We try to use in-place calculations where possible, which is
+    # everywhere except when the size changes after the first FFT.
+    size_changes = [axis for axis, n in axes_and_s[1:] if a.shape[axis] != n]
+
+    # If there are any size changes, we cannot use out
+    res = None if size_changes else out
+    for ind, (axis, n) in enumerate(axes_and_s):
+        if axis in size_changes:
+            if axis == size_changes[-1]:
+                # Last size change, so any output should now be OK
+                # (an error will be raised if not), and if no output is
+                # required, we want a freshly allocated array of the right size.
+                res = out
+            elif res is not None and n < res.shape[axis]:
+                # For an intermediate step where we return fewer elements, we
+                # can use a smaller view of the previous array.
+                res = res[(slice(None),) * axis + (slice(n),)]
+            else:
+                # If we need more elements, we cannot use res.
+                res = None
         a = _c2c_fft1d_impl(
             a,
-            n=s[ii],
-            axis=axes[ii],
-            overwrite_x=ovwr,
+            n=n,
+            axis=axis,
             direction=direction,
-            fsc=scale_function(s[ii], ii),
-            out=out,
+            fsc=scale_function(ind),
+            out=res,
         )
-        ovwr = True
+        # Default output for next iteration.
+        res = a
     return a
 
 
@@ -289,13 +196,14 @@ def _output_dtype(dt):
 def _pad_array(arr, s, axes):
     """Pads array arr with zeros to attain shape s associated with axes"""
     arr_shape = arr.shape
+    new_shape = tuple(arr_shape[ax] for ax in axes)
+    if tuple(s) == new_shape:
+        return arr
+
     no_padding = True
     pad_widths = [(0, 0)] * len(arr_shape)
     for si, ai in zip(s, axes):
-        try:
-            shp_i = arr_shape[ai]
-        except IndexError:
-            raise ValueError(f"Invalid axis {ai} specified")
+        shp_i = arr_shape[ai]
         if si > shp_i:
             no_padding = False
             pad_widths[ai] = (0, si - shp_i)
@@ -325,14 +233,14 @@ def _trim_array(arr, s, axes):
     """
 
     arr_shape = arr.shape
+    new_shape = tuple(arr_shape[ax] for ax in axes)
+    if tuple(s) == new_shape:
+        return arr
+
     no_trim = True
     ind = [slice(None, None, None)] * len(arr_shape)
     for si, ai in zip(s, axes):
-        try:
-            shp_i = arr_shape[ai]
-        except IndexError:
-            raise ValueError(f"Invalid axis {ai} specified")
-        if si < shp_i:
+        if si < arr_shape[ai]:
             no_trim = False
             ind[ai] = slice(None, si, None)
     if no_trim:
@@ -356,7 +264,6 @@ def _c2c_fftnd_impl(
     x,
     s=None,
     axes=None,
-    overwrite_x=False,
     direction=+1,
     fsc=1.0,
     out=None,
@@ -364,16 +271,11 @@ def _c2c_fftnd_impl(
     if direction not in [-1, +1]:
         raise ValueError("Direction of FFT should +1 or -1")
 
+    x = np.asarray(x)
     valid_dtypes = [np.complex64, np.complex128, np.float32, np.float64]
     # _direct_fftnd requires complex type, and full-dimensional transform
-    if isinstance(x, np.ndarray) and x.size != 0 and x.ndim > 1:
-        _direct = s is None and axes is None
-        if _direct:
-            _direct = x.ndim <= 7  # Intel MKL only supports FFT up to 7D
-        if not _direct:
-            xs, xa = _cook_nd_args(x, s, axes)
-            if _check_shapes_for_direct(xs, x.shape, xa):
-                _direct = True
+    if x.size != 0 and x.ndim > 1:
+        _direct = _check_shapes_for_direct(s, x.shape, axes)
         _direct = _direct and x.dtype in valid_dtypes
     else:
         _direct = False
@@ -381,29 +283,33 @@ def _c2c_fftnd_impl(
     if _direct:
         return _direct_fftnd(
             x,
-            overwrite_x=overwrite_x,
             direction=direction,
             fsc=fsc,
             out=out,
         )
     else:
-        if s is None and x.dtype in valid_dtypes:
-            x = np.asarray(x)
+        new_shape = tuple(x.shape[ax] for ax in axes)
+        if (
+            tuple(s) == new_shape
+            and x.dtype in valid_dtypes
+            and len(set(axes)) == len(axes)
+        ):
             if out is None:
                 res = np.empty_like(x, dtype=_output_dtype(x.dtype))
             else:
                 _validate_out_array(out, x, _output_dtype(x.dtype))
                 res = out
 
+            # MKL is capable of doing batch N-D FFT, it is not required to
+            # manually loop over the batches as done in _iter_complementary and
+            # it is the reason for bad performance mentioned in the gh-issue-#67
+            # TODO: implement a batch N-D FFT using MKL
+            # _iter_complementary performs batches of N-D FFT
             return _iter_complementary(
                 x,
                 axes,
                 _direct_fftnd,
-                {
-                    "overwrite_x": overwrite_x,
-                    "direction": direction,
-                    "fsc": fsc,
-                },
+                {"direction": direction, "fsc": fsc},
                 res,
             )
         else:
@@ -414,97 +320,110 @@ def _c2c_fftnd_impl(
                 axes=axes,
                 out=out,
                 direction=direction,
-                overwrite_x=overwrite_x,
-                scale_function=lambda n, i: fsc if i == 0 else 1.0,
+                scale_function=lambda i: fsc if i == 0 else 1.0,
             )
 
 
 def _r2c_fftnd_impl(x, s=None, axes=None, fsc=1.0, out=None):
     a = np.asarray(x)
-    no_trim = (s is None) and (axes is None)
-    s, axes = _cook_nd_args(a, s, axes)
     la = axes[-1]
     # trim array, so that rfft avoids doing unnecessary computations
-    if not no_trim:
-        a = _trim_array(a, s, axes)
+    a = _trim_array(a, s, axes)
+
+    # last axis is not included since we calculate r2c FFT separately
+    # and not in the loop
+    axes_and_s = list(zip(axes, s))[-2::-1]
+    size_changes = [axis for axis, n in axes_and_s if a.shape[axis] != n]
+    res = None if size_changes else out
+
     # r2c along last axis
-    a = _r2c_fft1d_impl(a, n=s[-1], axis=la, fsc=fsc, out=out)
+    a = _r2c_fft1d_impl(a, n=s[-1], axis=la, fsc=fsc, out=res)
+    res = a
     if len(s) > 1:
-        if not no_trim:
+        len_axes = len(axes)
+        if len(set(axes)) == len_axes and len_axes == a.ndim and len_axes > 2:
             ss = list(s)
             ss[-1] = a.shape[la]
             a = _pad_array(a, tuple(ss), axes)
-        len_axes = len(axes)
-        if len(set(axes)) == len_axes and len_axes == a.ndim and len_axes > 2:
             # a series of ND c2c FFTs along last axis
             ss, aa = _remove_axis(s, axes, -1)
-            ind = [
-                slice(None, None, 1),
-            ] * len(s)
+            ind = [slice(None, None, 1)] * len(s)
             for ii in range(a.shape[la]):
                 ind[la] = ii
                 tind = tuple(ind)
                 a_inp = a[tind]
-                res = out[tind] if out is not None else None
-                a_res = _c2c_fftnd_impl(
-                    a_inp, s=ss, axes=aa, overwrite_x=True, direction=1, out=res
-                )
-                if a_res is not a_inp:
-                    a[tind] = a_res  # copy in place
+                res = out[tind] if out is not None else a_inp
+                _ = _c2c_fftnd_impl(a_inp, s=ss, axes=aa, direction=1, out=res)
+            if out is not None:
+                a = out
         else:
+            # another size_changes check is needed if there are repeated axes
+            # of last axis, since since FFT changes the shape along last axis
+            size_changes = [
+                axis for axis, n in axes_and_s if a.shape[axis] != n
+            ]
+
             # a series of 1D c2c FFTs along all axes except last
-            for ii in range(len(axes) - 2, -1, -1):
-                a = _c2c_fft1d_impl(a, s[ii], axes[ii], overwrite_x=True)
+            for axis, n in axes_and_s:
+                if axis in size_changes:
+                    if axis == size_changes[-1]:
+                        res = out
+                    elif res is not None and n < res.shape[axis]:
+                        res = res[(slice(None),) * axis + (slice(n),)]
+                    else:
+                        res = None
+                a = _c2c_fft1d_impl(a, n, axis, out=res)
+                res = a
     return a
 
 
 def _c2r_fftnd_impl(x, s=None, axes=None, fsc=1.0, out=None):
     a = np.asarray(x)
-    no_trim = (s is None) and (axes is None)
-    s, axes = _cook_nd_args(a, s, axes, invreal=True)
     la = axes[-1]
-    if not no_trim:
-        a = _trim_array(a, s, axes)
     if len(s) > 1:
-        if not no_trim:
-            a = _pad_array(a, s, axes)
-        ovr_x = True if _datacopied(a, x) else False
         len_axes = len(axes)
         if len(set(axes)) == len_axes and len_axes == a.ndim and len_axes > 2:
+            a = _trim_array(a, s, axes)
+            a = _pad_array(a, s, axes)
             # a series of ND c2c FFTs along last axis
             # due to need to write into a, we must copy
-            if not ovr_x:
-                a = a.copy()
-                ovr_x = True
+            a = a if _datacopied(a, x) else a.copy()
             if not np.issubdtype(a.dtype, np.complexfloating):
                 # complex output will be copied to input, copy is needed
                 if a.dtype == np.float32:
                     a = a.astype(np.complex64)
                 else:
                     a = a.astype(np.complex128)
-                ovr_x = True
             ss, aa = _remove_axis(s, axes, -1)
-            ind = [
-                slice(None, None, 1),
-            ] * len(s)
+            ind = [slice(None, None, 1)] * len(s)
             for ii in range(a.shape[la]):
                 ind[la] = ii
                 tind = tuple(ind)
                 a_inp = a[tind]
                 # out has real dtype and cannot be used in intermediate steps
-                a_res = _c2c_fftnd_impl(
-                    a_inp, s=ss, axes=aa, overwrite_x=True, direction=-1
+                # ss and aa are reversed since np.fft.irfftn uses forward order
+                # but np.fft.ifftn uses reverse order see numpy-gh-28950
+                _ = _c2c_fftnd_impl(
+                    a_inp, s=ss[::-1], axes=aa[::-1], out=a_inp, direction=-1
                 )
-                if a_res is not a_inp:
-                    a[tind] = a_res  # copy in place
         else:
             # a series of 1D c2c FFTs along all axes except last
-            for ii in range(len(axes) - 1):
-                # out has real dtype and cannot be used in intermediate steps
-                a = _c2c_fft1d_impl(
-                    a, s[ii], axes[ii], overwrite_x=ovr_x, direction=-1
-                )
-                ovr_x = True
+            # forward order, see numpy-gh-28950
+            axes_and_s = list(zip(axes, s))[:-1]
+            size_changes = [
+                axis for axis, n in axes_and_s[1:] if a.shape[axis] != n
+            ]
+            # out has real dtype cannot be used for intermediate steps
+            res = None
+            for axis, n in axes_and_s:
+                if axis in size_changes:
+                    if res is not None and n < res.shape[axis]:
+                        # pylint: disable=unsubscriptable-object
+                        res = res[(slice(None),) * axis + (slice(n),)]
+                    else:
+                        res = None
+                a = _c2c_fft1d_impl(a, n, axis, out=res, direction=-1)
+                res = a
     # c2r along last axis
     a = _c2r_fft1d_impl(a, n=s[-1], axis=la, fsc=fsc, out=out)
     return a
diff --git a/mkl_fft/_mkl_fft.py b/mkl_fft/_mkl_fft.py
index c0d2756..d59be8e 100644
--- a/mkl_fft/_mkl_fft.py
+++ b/mkl_fft/_mkl_fft.py
@@ -24,6 +24,8 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import numpy as np
+
 from ._fft_utils import (
     _c2c_fftnd_impl,
     _c2r_fftnd_impl,
@@ -50,78 +52,64 @@
 ]
 
 
-def fft(x, n=None, axis=-1, norm=None, out=None, overwrite_x=False):
+# copied with modifications from:
+# https://github.com/numpy/numpy/blob/main/numpy/fft/_pocketfft.py
+def _cook_nd_args(a, s=None, axes=None, invreal=False):
+    if s is None:
+        shapeless = True
+        if axes is None:
+            s = list(a.shape)
+        else:
+            s = np.take(a.shape, axes)
+    else:
+        shapeless = False
+    s = list(s)
+    if axes is None:
+        if not shapeless:
+            raise ValueError("If s is not None, axes must not be None either.")
+        axes = list(range(-len(s), 0))
+    if len(s) != len(axes):
+        raise ValueError("Shape and axes have different lengths.")
+    if invreal and shapeless:
+        s[-1] = (a.shape[axes[-1]] - 1) * 2
+    if None in s:
+        raise ValueError("s must contain only int.")
+    # use the whole input array along axis `i` if `s[i] == -1`
+    s = [a.shape[_a] if _s == -1 else _s for _s, _a in zip(s, axes)]
+
+    # make axes positive
+    axes = [ax + a.ndim if ax < 0 else ax for ax in axes]
+    return s, axes
+
+
+def fft(x, n=None, axis=-1, norm=None, out=None):
     fsc = _compute_fwd_scale(norm, n, x.shape[axis])
-    return _c2c_fft1d_impl(
-        x,
-        n=n,
-        axis=axis,
-        out=out,
-        overwrite_x=overwrite_x,
-        direction=+1,
-        fsc=fsc,
-    )
-
-
-def ifft(x, n=None, axis=-1, norm=None, out=None, overwrite_x=False):
+    return _c2c_fft1d_impl(x, n=n, axis=axis, out=out, direction=+1, fsc=fsc)
+
+
+def ifft(x, n=None, axis=-1, norm=None, out=None):
     fsc = _compute_fwd_scale(norm, n, x.shape[axis])
-    return _c2c_fft1d_impl(
-        x,
-        n=n,
-        axis=axis,
-        out=out,
-        overwrite_x=overwrite_x,
-        direction=-1,
-        fsc=fsc,
-    )
-
-
-def fft2(x, s=None, axes=(-2, -1), norm=None, out=None, overwrite_x=False):
-    return fftn(
-        x,
-        s=s,
-        axes=axes,
-        norm=norm,
-        out=out,
-        overwrite_x=overwrite_x,
-    )
-
-
-def ifft2(x, s=None, axes=(-2, -1), norm=None, out=None, overwrite_x=False):
-    return ifftn(
-        x,
-        s=s,
-        axes=axes,
-        norm=norm,
-        out=out,
-        overwrite_x=overwrite_x,
-    )
-
-
-def fftn(x, s=None, axes=None, norm=None, out=None, overwrite_x=False):
+    return _c2c_fft1d_impl(x, n=n, axis=axis, out=out, direction=-1, fsc=fsc)
+
+
+def fft2(x, s=None, axes=(-2, -1), norm=None, out=None):
+    return fftn(x, s=s, axes=axes, norm=norm, out=out)
+
+
+def ifft2(x, s=None, axes=(-2, -1), norm=None, out=None):
+    return ifftn(x, s=s, axes=axes, norm=norm, out=out)
+
+
+def fftn(x, s=None, axes=None, norm=None, out=None):
     fsc = _compute_fwd_scale(norm, s, x.shape)
-    return _c2c_fftnd_impl(
-        x,
-        s=s,
-        axes=axes,
-        out=out,
-        overwrite_x=overwrite_x,
-        direction=+1,
-        fsc=fsc,
-    )
-
-
-def ifftn(x, s=None, axes=None, norm=None, out=None, overwrite_x=False):
+    s, axes = _cook_nd_args(x, s, axes)
+    return _c2c_fftnd_impl(x, s=s, axes=axes, out=out, direction=+1, fsc=fsc)
+
+
+def ifftn(x, s=None, axes=None, norm=None, out=None):
     fsc = _compute_fwd_scale(norm, s, x.shape)
-    return _c2c_fftnd_impl(
-        x,
-        s=s,
-        axes=axes,
-        out=out,
-        overwrite_x=overwrite_x,
-        direction=-1,
-        fsc=fsc,
-    )
+    s, axes = _cook_nd_args(x, s, axes)
+    return _c2c_fftnd_impl(x, s=s, axes=axes, out=out, direction=-1, fsc=fsc)
 
 
 def rfft(x, n=None, axis=-1, norm=None, out=None):
@@ -144,9 +132,11 @@ def irfft2(x, s=None, axes=(-2, -1), norm=None, out=None):
 
 def rfftn(x, s=None, axes=None, norm=None, out=None):
     fsc = _compute_fwd_scale(norm, s, x.shape)
+    s, axes = _cook_nd_args(x, s, axes)
     return _r2c_fftnd_impl(x, s=s, axes=axes, out=out, fsc=fsc)
 
 
 def irfftn(x, s=None, axes=None, norm=None, out=None):
     fsc = _compute_fwd_scale(norm, s, x.shape)
+    s, axes = _cook_nd_args(x, s, axes, invreal=True)
     return _c2r_fftnd_impl(x, s=s, axes=axes, out=out, fsc=fsc)
diff --git a/mkl_fft/_pydfti.pyx b/mkl_fft/_pydfti.pyx
index 20f6a94..f6bd464 100644
--- a/mkl_fft/_pydfti.pyx
+++ b/mkl_fft/_pydfti.pyx
@@ -224,13 +224,10 @@ cdef cnp.ndarray _process_arguments(
     object x,
     object n,
     object axis,
-    object overwrite_x,
-    object direction,
     long *axis_,
     long *n_,
     int *in_place,
     int *xnd,
-    int *dir_,
     int realQ,
 ):
     """
@@ -240,13 +237,6 @@ cdef cnp.ndarray _process_arguments(
     cdef long n_max = 0
     cdef cnp.ndarray x_arr "xx_arrayObject"
 
-    if direction not in [-1, +1]:
-        raise ValueError("Direction of FFT should +1 or -1")
-    else:
-        dir_[0] = -1 if direction is -1 else +1
-
-    in_place[0] = 1 if overwrite_x else 0
-
     # convert x to ndarray, ensure that strides are multiples of itemsize
     x_arr = PyArray_CheckFromAny(
         x, NULL, 0, 0,
@@ -259,8 +249,8 @@ cdef cnp.ndarray _process_arguments(
     if (<void *> x_arr) is NULL:
         raise ValueError("An input argument x is not an array-like object")
 
-    if _datacopied(x_arr, x):
-        in_place[0] = 1  # a copy was made, so we can work in place.
+    # a copy was made, so we can work in place.
+    in_place[0] = 1 if _datacopied(x_arr, x) else 0
 
     xnd[0] = cnp.PyArray_NDIM(x_arr)  # tensor-rank of the array
 
@@ -379,21 +369,13 @@ def _validate_out_array(out, x, out_dtype, axis=None, n=None):
 # float/double inputs are not cast to complex, but are effectively
 # treated as complexes with zero imaginary parts.
 # All other types are cast to complex double.
-def _c2c_fft1d_impl(
-    x,
-    n=None,
-    axis=-1,
-    overwrite_x=False,
-    direction=+1,
-    double fsc=1.0,
-    out=None,
-):
+def _c2c_fft1d_impl(x, n=None, axis=-1, direction=+1, double fsc=1.0, out=None):
     """
     Uses MKL to perform 1D FFT on the input array x along the given axis.
     """
     cdef cnp.ndarray x_arr "x_arrayObject"
     cdef cnp.ndarray f_arr "f_arrayObject"
-    cdef int xnd, n_max = 0, in_place, dir_
+    cdef int xnd, n_max = 0, in_place
     cdef long n_, axis_
     cdef int x_type, f_type, status = 0
     cdef int ALL_HARMONICS = 1
@@ -401,9 +383,10 @@ def _c2c_fft1d_impl(
     cdef bytes py_error_msg
     cdef DftiCache *_cache
 
-    x_arr = _process_arguments(x, n, axis, overwrite_x, direction,
-                               &axis_, &n_, &in_place, &xnd, &dir_, 0)
+    if direction not in [-1, +1]:
+        raise ValueError("Direction of FFT should +1 or -1")
 
+    x_arr = _process_arguments(x, n, axis, &axis_, &n_, &in_place, &xnd, 0)
     x_type = cnp.PyArray_TYPE(x_arr)
 
     if out is not None:
@@ -439,7 +422,7 @@ def _c2c_fft1d_impl(
             _cache_capsule, capsule_name
         )
         if x_type is cnp.NPY_CDOUBLE:
-            if dir_ < 0:
+            if direction < 0:
                 status = cdouble_mkl_ifft1d_in(
                     x_arr, n_, <int> axis_, fsc, _cache
                 )
@@ -448,7 +431,7 @@ def _c2c_fft1d_impl(
                     x_arr, n_, <int> axis_, fsc, _cache
                 )
         elif x_type is cnp.NPY_CFLOAT:
-            if dir_ < 0:
+            if direction < 0:
                 status = cfloat_mkl_ifft1d_in(
                     x_arr, n_, <int> axis_, fsc, _cache
                 )
@@ -497,7 +480,7 @@ def _c2c_fft1d_impl(
         )
         if f_type is cnp.NPY_CDOUBLE:
             if x_type is cnp.NPY_DOUBLE:
-                if dir_ < 0:
+                if direction < 0:
                     status = double_cdouble_mkl_ifft1d_out(
                         x_arr,
                         n_,
@@ -518,7 +501,7 @@ def _c2c_fft1d_impl(
                         _cache,
                     )
             elif x_type is cnp.NPY_CDOUBLE:
-                if dir_ < 0:
+                if direction < 0:
                     status = cdouble_cdouble_mkl_ifft1d_out(
                         x_arr, n_, <int> axis_, f_arr, fsc, _cache
                     )
@@ -528,7 +511,7 @@ def _c2c_fft1d_impl(
                     )
         else:
             if x_type is cnp.NPY_FLOAT:
-                if dir_ < 0:
+                if direction < 0:
                     status = float_cfloat_mkl_ifft1d_out(
                         x_arr,
                         n_,
@@ -549,7 +532,7 @@ def _c2c_fft1d_impl(
                         _cache,
                     )
             elif x_type is cnp.NPY_CFLOAT:
-                if dir_ < 0:
+                if direction < 0:
                     status = cfloat_cfloat_mkl_ifft1d_out(
                         x_arr, n_, <int> axis_, f_arr, fsc, _cache
                     )
@@ -571,7 +554,7 @@ def _c2c_fft1d_impl(
 
 
 def _r2c_fft1d_impl(
-    x, n=None, axis=-1, overwrite_x=False, double fsc=1.0, out=None
+    x, n=None, axis=-1, double fsc=1.0, out=None
 ):
     """
     Uses MKL to perform 1D FFT on the real input array x along the given axis,
@@ -581,17 +564,15 @@ def _r2c_fft1d_impl(
     """
     cdef cnp.ndarray x_arr "x_arrayObject"
     cdef cnp.ndarray f_arr "f_arrayObject"
-    cdef int xnd, in_place, dir_
+    cdef int xnd, in_place
     cdef long n_, axis_
     cdef int x_type, f_type, status, requirement
     cdef int HALF_HARMONICS = 0  # give only positive index harmonics
-    cdef int direction = 1  # dummy, only used for the sake of arg-processing
     cdef char * c_error_msg = NULL
     cdef bytes py_error_msg
     cdef DftiCache *_cache
 
-    x_arr = _process_arguments(x, n, axis, overwrite_x, direction,
-                               &axis_, &n_, &in_place, &xnd, &dir_, 1)
+    x_arr = _process_arguments(x, n, axis, &axis_, &n_, &in_place, &xnd, 1)
 
     x_type = cnp.PyArray_TYPE(x_arr)
 
@@ -671,7 +652,7 @@ def _r2c_fft1d_impl(
 
 
 def _c2r_fft1d_impl(
-    x, n=None, axis=-1, overwrite_x=False, double fsc=1.0, out=None
+    x, n=None, axis=-1, double fsc=1.0, out=None
 ):
     """
     Uses MKL to perform 1D FFT on the real/complex input array x along the
@@ -681,10 +662,9 @@ def _c2r_fft1d_impl(
     """
     cdef cnp.ndarray x_arr "x_arrayObject"
     cdef cnp.ndarray f_arr "f_arrayObject"
-    cdef int xnd, in_place, dir_, int_n
+    cdef int xnd, in_place, int_n
     cdef long n_, axis_
     cdef int x_type, f_type, status
-    cdef int direction = 1  # dummy, only used for the sake of arg-processing
     cdef char * c_error_msg = NULL
     cdef bytes py_error_msg
     cdef DftiCache *_cache
@@ -692,8 +672,7 @@ def _c2r_fft1d_impl(
     int_n = _is_integral(n)
     # nn gives the number elements along axis of the input that we use
     nn = (n // 2 + 1) if int_n and n > 0 else n
-    x_arr = _process_arguments(x, nn, axis, overwrite_x, direction,
-                               &axis_, &n_, &in_place, &xnd, &dir_, 0)
+    x_arr = _process_arguments(x, nn, axis, &axis_, &n_, &in_place, &xnd, 0)
     n_ = 2*(n_ - 1)
     if int_n and (n % 2 == 1):
         n_ += 1
@@ -776,20 +755,13 @@ def _c2r_fft1d_impl(
 
 
 def _direct_fftnd(
-    x, overwrite_x=False, direction=+1, double fsc=1.0, out=None
+    x, direction=+1, double fsc=1.0, out=None
 ):
     """Perform n-dimensional FFT over all axes"""
     cdef int err
     cdef cnp.ndarray x_arr "xxnd_arrayObject"
     cdef cnp.ndarray f_arr "ffnd_arrayObject"
-    cdef int dir_, in_place, x_type, f_type
-
-    if direction not in [-1, +1]:
-        raise ValueError("Direction of FFT should +1 or -1")
-    else:
-        dir_ = -1 if direction is -1 else +1
-
-    in_place = 1 if overwrite_x else 0
+    cdef int in_place, x_type, f_type
 
     # convert x to ndarray, ensure that strides are multiples of itemsize
     x_arr = PyArray_CheckFromAny(
@@ -803,8 +775,8 @@ def _direct_fftnd(
     if <void *> x_arr is NULL:
         raise ValueError("An input argument x is not an array-like object")
 
-    if _datacopied(x_arr, x):
-        in_place = 1  # a copy was made, so we can work in place.
+    # a copy was made, so we can work in place.
+    in_place = 1 if _datacopied(x_arr, x) else 0
 
     x_type = cnp.PyArray_TYPE(x_arr)
     if (
@@ -834,12 +806,12 @@ def _direct_fftnd(
 
     if in_place:
         if x_type == cnp.NPY_CDOUBLE:
-            if dir_ == 1:
+            if direction == 1:
                 err = cdouble_cdouble_mkl_fftnd_in(x_arr, fsc)
             else:
                 err = cdouble_cdouble_mkl_ifftnd_in(x_arr, fsc)
         elif x_type == cnp.NPY_CFLOAT:
-            if dir_ == 1:
+            if direction == 1:
                 err = cfloat_cfloat_mkl_fftnd_in(x_arr, fsc)
             else:
                 err = cfloat_cfloat_mkl_ifftnd_in(x_arr, fsc)
@@ -866,22 +838,22 @@ def _direct_fftnd(
                 f_arr = _allocate_result(x_arr, -1, 0, f_type)
 
         if x_type == cnp.NPY_CDOUBLE:
-            if dir_ == 1:
+            if direction == 1:
                 err = cdouble_cdouble_mkl_fftnd_out(x_arr, f_arr, fsc)
             else:
                 err = cdouble_cdouble_mkl_ifftnd_out(x_arr, f_arr, fsc)
         elif x_type == cnp.NPY_CFLOAT:
-            if dir_ == 1:
+            if direction == 1:
                 err = cfloat_cfloat_mkl_fftnd_out(x_arr, f_arr, fsc)
             else:
                 err = cfloat_cfloat_mkl_ifftnd_out(x_arr, f_arr, fsc)
         elif x_type == cnp.NPY_DOUBLE:
-            if dir_ == 1:
+            if direction == 1:
                 err = double_cdouble_mkl_fftnd_out(x_arr, f_arr, fsc)
             else:
                 err = double_cdouble_mkl_ifftnd_out(x_arr, f_arr, fsc)
         elif x_type == cnp.NPY_FLOAT:
-            if dir_ == 1:
+            if direction == 1:
                 err = float_cfloat_mkl_fftnd_out(x_arr, f_arr, fsc)
             else:
                 err = float_cfloat_mkl_ifftnd_out(x_arr, f_arr, fsc)
@@ -893,253 +865,3 @@ def _direct_fftnd(
             return out
         else:
             return f_arr
-
-
-# ========================= deprecated functions ==============================
-cdef object _rc_to_rr(cnp.ndarray rc_arr, int n, int axis, int xnd, int x_type):
-    cdef object res
-    inp = <object>rc_arr
-
-    slice_ = [slice(None, None, None)] * xnd
-    sl_0 = list(slice_)
-    sl_0[axis] = 0
-
-    sl_1 = list(slice_)
-    sl_1[axis] = 1
-    if (inp.flags["C"] and inp.strides[axis] == inp.itemsize):
-        res = inp
-        res = res.view(
-            dtype=np.single if x_type == cnp.NPY_FLOAT else np.double
-        )
-        res[tuple(sl_1)] = res[tuple(sl_0)]
-
-        slice_[axis] = slice(1, n + 1, None)
-
-        return res[tuple(slice_)]
-    else:
-        res_shape = list(inp.shape)
-        res_shape[axis] = n
-        res = np.empty(
-            tuple(res_shape),
-            dtype = np.single if x_type == cnp.NPY_FLOAT else np.double,
-        )
-
-        res[tuple(sl_0)] = inp[tuple(sl_0)].real
-        sl_dst_real = list(slice_)
-        sl_dst_real[axis] = slice(1, None, 2)
-        sl_src_real = list(slice_)
-        sl_src_real[axis] = slice(1, None, None)
-        res[tuple(sl_dst_real)] = inp[tuple(sl_src_real)].real
-        sl_dst_imag = list(slice_)
-        sl_dst_imag[axis] = slice(2, None, 2)
-        sl_src_imag = list(slice_)
-        sl_src_imag[axis] = slice(
-            1, inp.shape[axis] if (n & 1) else inp.shape[axis] - 1, None
-        )
-        res[tuple(sl_dst_imag)] = inp[tuple(sl_src_imag)].imag
-
-        return res[tuple(slice_)]
-
-
-cdef object _rr_to_rc(cnp.ndarray rr_arr, int n, int axis, int xnd, int x_type):
-
-    inp = <object> rr_arr
-
-    rc_shape = list(inp.shape)
-    rc_shape[axis] = (n // 2 + 1)
-    rc_shape = tuple(rc_shape)
-
-    rc_dtype = np.cdouble if x_type == cnp.NPY_DOUBLE else np.csingle
-    rc = np.empty(rc_shape, dtype=rc_dtype, order="C")
-
-    slice_ = [slice(None, None, None)] * xnd
-    sl_src_real = list(slice_)
-    sl_src_imag = list(slice_)
-    sl_src_real[axis] = slice(1, n, 2)
-    sl_src_imag[axis] = slice(2, n, 2)
-
-    sl_dest_real = list(slice_)
-    sl_dest_real[axis] = slice(1, None, None)
-    sl_dest_imag = list(slice_)
-    sl_dest_imag[axis] = slice(1, (n+1)//2, None)
-
-    sl_0 = list(slice_)
-    sl_0[axis] = 0
-
-    rc_real = rc.real
-    rc_imag = rc.imag
-
-    rc_real[tuple(sl_dest_real)] = inp[tuple(sl_src_real)]
-    rc_imag[tuple(sl_dest_imag)] = inp[tuple(sl_src_imag)]
-    rc_real[tuple(sl_0)] = inp[tuple(sl_0)]
-    rc_imag[tuple(sl_0)] = 0
-    if (n & 1 == 0):
-        sl_last = list(slice_)
-        sl_last[axis] = -1
-        rc_imag[tuple(sl_last)] = 0
-
-    return rc
-
-
-def _rr_fft1d_impl(x, n=None, axis=-1, overwrite_x=False, double fsc=1.0):
-    """
-    Uses MKL to perform real packed 1D FFT on the input array x
-    along the given axis.
-
-    This done by using rfft and post-processing the result.
-    Thus overwrite_x is effectively discarded.
-
-    Functionally equivalent to scipy.fftpack.rfft
-    """
-    cdef cnp.ndarray x_arr "x_arrayObject"
-    cdef cnp.ndarray f_arr "f_arrayObject"
-    cdef int xnd, in_place, dir_
-    cdef long n_, axis_
-    cdef int HALF_HARMONICS = 0  # give only positive index harmonics
-    cdef int x_type, status, f_type
-    cdef char * c_error_msg = NULL
-    cdef bytes py_error_msg
-    cdef DftiCache *_cache
-
-    x_arr = _process_arguments(x, n, axis, overwrite_x, <object>(+1),
-                               &axis_, &n_, &in_place, &xnd, &dir_, 1)
-
-    x_type = cnp.PyArray_TYPE(x_arr)
-
-    if x_type is cnp.NPY_FLOAT or x_type is cnp.NPY_DOUBLE:
-        in_place = 0
-    elif x_type is cnp.NPY_CFLOAT or x_type is cnp.NPY_CDOUBLE:
-        raise TypeError("1st argument must be a real sequence")
-    else:
-        try:
-            x_arr = <cnp.ndarray> cnp.PyArray_FROM_OTF(
-                x_arr, cnp.NPY_DOUBLE,
-                cnp.NPY_ARRAY_BEHAVED | cnp.NPY_ARRAY_ENSURECOPY
-            )
-        except:
-            raise TypeError("1st argument must be a real sequence")
-        x_type = cnp.PyArray_TYPE(x_arr)
-        in_place = 0
-
-    f_type = cnp.NPY_CFLOAT if x_type is cnp.NPY_FLOAT else cnp.NPY_CDOUBLE
-    f_arr = _allocate_result(x_arr, n_ // 2 + 1, axis_, f_type)
-
-    _cache_capsule = _tls_dfti_cache_capsule()
-    _cache = <DftiCache *>cpython.pycapsule.PyCapsule_GetPointer(
-        _cache_capsule, capsule_name
-    )
-    if x_type is cnp.NPY_DOUBLE:
-        status = double_cdouble_mkl_fft1d_out(
-            x_arr, n_, <int> axis_, f_arr, HALF_HARMONICS, fsc, _cache
-        )
-    else:
-        status = float_cfloat_mkl_fft1d_out(
-            x_arr, n_, <int> axis_, f_arr, HALF_HARMONICS, fsc, _cache
-        )
-
-    if (status):
-        c_error_msg = mkl_dfti_error(status)
-        py_error_msg = c_error_msg
-        raise ValueError("Internal error occurred: {}".format(py_error_msg))
-
-    # post-process and return
-    return _rc_to_rr(f_arr, n_, axis_, xnd, x_type)
-
-
-def _rr_ifft1d_impl(x, n=None, axis=-1, overwrite_x=False, double fsc=1.0):
-    """
-    Uses MKL to perform real packed 1D FFT on the input array x along
-    the given axis.
-
-    This done by using rfft and post-processing the result.
-    Thus overwrite_x is effectively discarded.
-
-    Functionally equivalent to scipy.fftpack.irfft
-    """
-    cdef cnp.ndarray x_arr "x_arrayObject"
-    cdef cnp.ndarray f_arr "f_arrayObject"
-    cdef int xnd, in_place, dir_
-    cdef long n_, axis_
-    cdef int x_type, rc_type, status
-    cdef char * c_error_msg = NULL
-    cdef bytes py_error_msg
-    cdef DftiCache *_cache
-
-    x_arr = _process_arguments(x, n, axis, overwrite_x, <object>(-1),
-                               &axis_, &n_, &in_place, &xnd, &dir_, 1)
-
-    x_type = cnp.PyArray_TYPE(x_arr)
-
-    if x_type is cnp.NPY_FLOAT or x_type is cnp.NPY_DOUBLE:
-        pass
-    else:
-        # we must cast the input and allocate the output,
-        # so we cast to complex double and operate in place
-        try:
-            x_arr = <cnp.ndarray> cnp.PyArray_FROM_OTF(
-                x_arr, cnp.NPY_DOUBLE,
-                cnp.NPY_ARRAY_BEHAVED | cnp.NPY_ARRAY_ENSURECOPY
-            )
-        except:
-            raise ValueError(
-                "First argument should be a real "
-                "or a complex sequence of single or double precision"
-            )
-        x_type = cnp.PyArray_TYPE(x_arr)
-        in_place = 1
-
-    # need to convert this into complex array
-    rc_obj = _rr_to_rc(x_arr, n_, axis_, xnd, x_type)
-    rc_arr = <cnp.ndarray> rc_obj
-
-    rc_type = cnp.NPY_CFLOAT if x_type is cnp.NPY_FLOAT else cnp.NPY_CDOUBLE
-    in_place = False
-    if in_place:
-        f_arr = x_arr
-    else:
-        f_arr = _allocate_result(x_arr, n_, axis_, x_type)
-
-    # call out-of-place FFT
-    if rc_type is cnp.NPY_CFLOAT:
-        _cache_capsule = _tls_dfti_cache_capsule()
-        _cache = <DftiCache *>cpython.pycapsule.PyCapsule_GetPointer(
-            _cache_capsule, capsule_name
-        )
-        status = cfloat_float_mkl_irfft_out(
-            rc_arr, n_, <int> axis_, f_arr, fsc, _cache
-        )
-    elif rc_type is cnp.NPY_CDOUBLE:
-        _cache_capsule = _tls_dfti_cache_capsule()
-        _cache = <DftiCache *>cpython.pycapsule.PyCapsule_GetPointer(
-            _cache_capsule, capsule_name
-        )
-        status = cdouble_double_mkl_irfft_out(
-            rc_arr, n_, <int> axis_, f_arr, fsc, _cache
-        )
-    else:
-        raise ValueError(
-            "Internal mkl_fft error occurred: Unrecognized rc_type"
-        )
-
-    if (status):
-        c_error_msg = mkl_dfti_error(status)
-        py_error_msg = c_error_msg
-        raise ValueError(
-            "Internal error occurred: {}".format(str(py_error_msg))
-        )
-
-    return f_arr
-
-
-def rfftpack(x, n=None, axis=-1, overwrite_x=False, fwd_scale=1.0):
-    """Packed real-valued harmonics of FFT of a real sequence x"""
-    return _rr_fft1d_impl(
-        x, n=n, axis=axis, overwrite_x=overwrite_x, fsc=fwd_scale
-    )
-
-
-def irfftpack(x, n=None, axis=-1, overwrite_x=False, fwd_scale=1.0):
-    """IFFT of a real sequence, takes packed real-valued harmonics of FFT"""
-    return _rr_ifft1d_impl(
-        x, n=n, axis=axis, overwrite_x=overwrite_x, fsc=fwd_scale
-    )
diff --git a/mkl_fft/interfaces/_numpy_fft.py b/mkl_fft/interfaces/_numpy_fft.py
index d83fe0b..268dbc3 100644
--- a/mkl_fft/interfaces/_numpy_fft.py
+++ b/mkl_fft/interfaces/_numpy_fft.py
@@ -98,6 +98,8 @@ def _cook_nd_args(a, s=None, axes=None, invreal=False):
     # use the whole input array along axis `i` if `s[i] == -1 or None`
     s = [a.shape[_a] if _s in [-1, None] else _s for _s, _a in zip(s, axes)]
 
+    # make axes positive
+    axes = [ax + a.ndim if ax < 0 else ax for ax in axes]
     return s, axes
 
 
diff --git a/mkl_fft/interfaces/_scipy_fft.py b/mkl_fft/interfaces/_scipy_fft.py
index 405fe37..1e12644 100644
--- a/mkl_fft/interfaces/_scipy_fft.py
+++ b/mkl_fft/interfaces/_scipy_fft.py
@@ -203,6 +203,15 @@ def _init_nd_shape_and_axes(x, shape, axes, invreal=False):
     return tuple(shape), list(axes)
 
 
+def _use_input_as_out(x, overwrite_x):
+    """Check if the input can be used as output."""
+    if overwrite_x and np.issubdtype(x.dtype, np.complexfloating):
+        # pass input as out to overwrite it
+        return x
+    else:
+        return None
+
+
 def _validate_input(x):
     try:
         x = _supported_array_or_not_implemented(x)
@@ -223,11 +232,10 @@ def fft(
     """
     _check_plan(plan)
     x = _validate_input(x)
+    out = _use_input_as_out(x, overwrite_x)
 
     with _Workers(workers):
-        return mkl_fft.fft(
-            x, n=n, axis=axis, overwrite_x=overwrite_x, norm=norm
-        )
+        return mkl_fft.fft(x, n=n, axis=axis, norm=norm, out=out)
 
 
 def ifft(
@@ -241,11 +249,10 @@ def ifft(
     """
     _check_plan(plan)
     x = _validate_input(x)
+    out = _use_input_as_out(x, overwrite_x)
 
     with _Workers(workers):
-        return mkl_fft.ifft(
-            x, n=n, axis=axis, overwrite_x=overwrite_x, norm=norm
-        )
+        return mkl_fft.ifft(x, n=n, axis=axis, norm=norm, out=out)
 
 
 def fft2(
@@ -320,12 +327,11 @@ def fftn(
     """
     _check_plan(plan)
     x = _validate_input(x)
+    out = _use_input_as_out(x, overwrite_x)
     s, axes = _init_nd_shape_and_axes(x, s, axes)
 
     with _Workers(workers):
-        return mkl_fft.fftn(
-            x, s=s, axes=axes, overwrite_x=overwrite_x, norm=norm
-        )
+        return mkl_fft.fftn(x, s=s, axes=axes, norm=norm, out=out)
 
 
 def ifftn(
@@ -346,12 +352,11 @@ def ifftn(
     """
     _check_plan(plan)
     x = _validate_input(x)
+    out = _use_input_as_out(x, overwrite_x)
     s, axes = _init_nd_shape_and_axes(x, s, axes)
 
     with _Workers(workers):
-        return mkl_fft.ifftn(
-            x, s=s, axes=axes, overwrite_x=overwrite_x, norm=norm
-        )
+        return mkl_fft.ifftn(x, s=s, axes=axes, norm=norm, out=out)
 
 
 def rfft(
diff --git a/mkl_fft/interfaces/_scipy_fftpack.py b/mkl_fft/interfaces/_scipy_fftpack.py
deleted file mode 100644
index a4cd27d..0000000
--- a/mkl_fft/interfaces/_scipy_fftpack.py
+++ /dev/null
@@ -1,71 +0,0 @@
-#!/usr/bin/env python
-# Copyright (c) 2025, Intel Corporation
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-#     * Redistributions of source code must retain the above copyright notice,
-#       this list of conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright
-#       notice, this list of conditions and the following disclaimer in the
-#       documentation and/or other materials provided with the distribution.
-#     * Neither the name of Intel Corporation nor the names of its contributors
-#       may be used to endorse or promote products derived from this software
-#       without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-import mkl_fft
-
-from ._float_utils import _upcast_float16_array
-
-__all__ = ["fft", "ifft", "fftn", "ifftn", "fft2", "ifft2", "rfft", "irfft"]
-
-
-def fft(a, n=None, axis=-1, overwrite_x=False):
-    x = _upcast_float16_array(a)
-    return mkl_fft.fft(x, n=n, axis=axis, overwrite_x=overwrite_x)
-
-
-def ifft(a, n=None, axis=-1, overwrite_x=False):
-    x = _upcast_float16_array(a)
-    return mkl_fft.ifft(x, n=n, axis=axis, overwrite_x=overwrite_x)
-
-
-def fftn(a, shape=None, axes=None, overwrite_x=False):
-    x = _upcast_float16_array(a)
-    return mkl_fft.fftn(x, s=shape, axes=axes, overwrite_x=overwrite_x)
-
-
-def ifftn(a, shape=None, axes=None, overwrite_x=False):
-    x = _upcast_float16_array(a)
-    return mkl_fft.ifftn(x, s=shape, axes=axes, overwrite_x=overwrite_x)
-
-
-def fft2(a, shape=None, axes=(-2, -1), overwrite_x=False):
-    x = _upcast_float16_array(a)
-    return mkl_fft.fftn(x, s=shape, axes=axes, overwrite_x=overwrite_x)
-
-
-def ifft2(a, shape=None, axes=(-2, -1), overwrite_x=False):
-    x = _upcast_float16_array(a)
-    return mkl_fft.ifftn(x, s=shape, axes=axes, overwrite_x=overwrite_x)
-
-
-def rfft(a, n=None, axis=-1, overwrite_x=False):
-    x = _upcast_float16_array(a)
-    return mkl_fft.rfftpack(x, n=n, axis=axis, overwrite_x=overwrite_x)
-
-
-def irfft(a, n=None, axis=-1, overwrite_x=False):
-    x = _upcast_float16_array(a)
-    return mkl_fft.irfftpack(x, n=n, axis=axis, overwrite_x=overwrite_x)
diff --git a/mkl_fft/tests/helper.py b/mkl_fft/tests/helper.py
index 6a59664..c0903e4 100644
--- a/mkl_fft/tests/helper.py
+++ b/mkl_fft/tests/helper.py
@@ -32,3 +32,8 @@
     np.lib.NumpyVersion(np.__version__) < "2.0.0",
     reason="Requires NumPy >= 2.0.0",
 )
+
+requires_numpy_2_1 = pytest.mark.skipif(
+    np.lib.NumpyVersion(np.__version__) < "2.1.0",
+    reason="Requires NumPy >= 2.1.0",
+)
diff --git a/mkl_fft/tests/test_fft1d.py b/mkl_fft/tests/test_fft1d.py
index fbff2bd..7a1e0fd 100644
--- a/mkl_fft/tests/test_fft1d.py
+++ b/mkl_fft/tests/test_fft1d.py
@@ -124,18 +124,18 @@ def test_vector4(self):
     def test_vector5(self):
         """fft in-place is the same as fft out-of-place"""
         x = self.xz1.copy()[::-2]
-        f1 = mkl_fft.fft(x, overwrite_x=True)
+        f1 = mkl_fft.fft(x, out=x)
         f2 = mkl_fft.fft(self.xz1[::-2])
         assert_(np.allclose(f1, f2))
 
     def test_vector6(self):
         """fft in place"""
         x = self.xz1.copy()
-        f1 = mkl_fft.fft(x, overwrite_x=True)
+        f1 = mkl_fft.fft(x, out=x)
         assert_(not _datacopied(f1, x))  # this is in-place
 
         x = self.xz1.copy()
-        f1 = mkl_fft.fft(x[::-2], overwrite_x=True)
+        f1 = mkl_fft.fft(x[::-2], out=x[::-2])
         assert_(not np.allclose(x, self.xz1))  # this is also in-place
         assert_(np.allclose(x[-2::-2], self.xz1[-2::-2]))
         assert_(np.allclose(x[-1::-2], f1))
@@ -239,7 +239,7 @@ def test_matrix3(self):
     def test_matrix4(self):
         x = self.az2.copy()
         f1 = mkl_fft.fft(x[::3, ::-1])
-        f2 = mkl_fft.fft(x[::3, ::-1], overwrite_x=True)
+        f2 = mkl_fft.fft(x[::3, ::-1], out=x[::3, ::-1])
         assert_allclose(f1, f2)
 
     def test_matrix5(self):
@@ -352,55 +352,6 @@ def test_array6(self):
         assert_allclose(y1, y2, atol=2e-15)
 
 
-class Test_mklfft_rfftpack(TestCase):
-    def setUp(self):
-        rnd.seed(1234567)
-        self.v1 = rnd.randn(16)
-        self.m2 = rnd.randn(5, 7)
-        self.t3 = rnd.randn(5, 7, 11)
-
-    def test1(self):
-        x = self.v1.copy()
-        f1 = mkl_fft.rfftpack(x)
-        f2 = mkl_fft.irfftpack(f1)
-        assert_allclose(f2, x)
-
-    def test2(self):
-        x = self.v1.copy()
-        f1 = mkl_fft.irfftpack(x)
-        f2 = mkl_fft.rfftpack(f1)
-        assert_allclose(f2, x)
-
-    def test3(self):
-        for a in range(0, 2):
-            for ovwr_x in [True, False]:
-                for dt, atol in zip([np.float32, np.float64], [2e-7, 2e-15]):
-                    x = self.m2.copy().astype(dt)
-                    f1 = mkl_fft.rfftpack(x, axis=a, overwrite_x=ovwr_x)
-                    f2 = mkl_fft.irfftpack(f1, axis=a, overwrite_x=ovwr_x)
-                    assert_allclose(
-                        f2, self.m2.astype(dt), atol=atol, err_msg=(a, ovwr_x)
-                    )
-
-    def test4(self):
-        for a in range(0, 2):
-            for ovwr_x in [True, False]:
-                for dt, atol in zip([np.float32, np.float64], [2e-7, 2e-15]):
-                    x = self.m2.copy().astype(dt)
-                    f1 = mkl_fft.irfftpack(x, axis=a, overwrite_x=ovwr_x)
-                    f2 = mkl_fft.rfftpack(f1, axis=a, overwrite_x=ovwr_x)
-                    assert_allclose(f2, self.m2.astype(dt), atol=atol)
-
-    def test5(self):
-        for a in range(0, 3):
-            for ovwr_x in [True, False]:
-                for dt, atol in zip([np.float32, np.float64], [4e-7, 4e-15]):
-                    x = self.t3.copy().astype(dt)
-                    f1 = mkl_fft.irfftpack(x, axis=a, overwrite_x=ovwr_x)
-                    f2 = mkl_fft.rfftpack(f1, axis=a, overwrite_x=ovwr_x)
-                    assert_allclose(f2, self.t3.astype(dt), atol=atol)
-
-
 @requires_numpy_2
 @pytest.mark.parametrize("axis", [0, 1, 2])
 @pytest.mark.parametrize("func", ["fft", "ifft"])
diff --git a/mkl_fft/tests/test_fftnd.py b/mkl_fft/tests/test_fftnd.py
index 11f65c6..27ec322 100644
--- a/mkl_fft/tests/test_fftnd.py
+++ b/mkl_fft/tests/test_fftnd.py
@@ -31,7 +31,7 @@
 
 import mkl_fft
 
-from .helper import requires_numpy_2
+from .helper import requires_numpy_2, requires_numpy_2_1
 
 reps_64 = (2**11) * np.finfo(np.float64).eps
 reps_32 = (2**11) * np.finfo(np.float32).eps
@@ -219,8 +219,8 @@ def test_gh109():
     b_int = np.array([[5, 7, 6, 5], [4, 6, 4, 8], [9, 3, 7, 5]], dtype=np.int64)
     b = np.asarray(b_int, dtype=np.float32)
 
-    r1 = mkl_fft.fftn(b, s=None, axes=(0,), overwrite_x=False, norm="ortho")
-    r2 = mkl_fft.fftn(b_int, s=None, axes=(0,), overwrite_x=False, norm="ortho")
+    r1 = mkl_fft.fftn(b, s=None, axes=(0,), norm="ortho")
+    r2 = mkl_fft.fftn(b_int, s=None, axes=(0,), norm="ortho")
 
     rtol, atol = _get_rtol_atol(b)
     assert_allclose(r1, r2, rtol=rtol, atol=atol)
@@ -237,8 +237,29 @@ def test_s_axes(dtype, s, axes, func):
     else:
         x = np.random.random(shape)
 
-    r1 = getattr(mkl_fft, func)(x, s=s, axes=axes)
-    r2 = getattr(np.fft, func)(x, s=s, axes=axes)
+    r1 = getattr(np.fft, func)(x, s=s, axes=axes)
+    r2 = getattr(mkl_fft, func)(x, s=s, axes=axes)
+
+    rtol, atol = _get_rtol_atol(x)
+    assert_allclose(r1, r2, rtol=rtol, atol=atol)
+
+
+@requires_numpy_2
+@pytest.mark.parametrize("dtype", [complex, float])
+@pytest.mark.parametrize("s", [(15, 24, 10), [35, 25, 15], [25, 15, 5]])
+@pytest.mark.parametrize("axes", [(0, 1, 2), (-1, -2, -3), [1, 0, 2]])
+@pytest.mark.parametrize("func", ["fftn", "ifftn", "rfftn", "irfftn"])
+def test_s_axes_out(dtype, s, axes, func):
+    shape = (30, 20, 10)
+    if dtype is complex and func != "rfftn":
+        x = np.random.random(shape) + 1j * np.random.random(shape)
+    else:
+        x = np.random.random(shape)
+
+    r1 = getattr(np.fft, func)(x, s=s, axes=axes)
+    out = np.empty_like(r1)
+    r2 = getattr(mkl_fft, func)(x, s=s, axes=axes, out=out)
+    assert r2 is out
 
     rtol, atol = _get_rtol_atol(x)
     assert_allclose(r1, r2, rtol=rtol, atol=atol)
@@ -246,7 +267,7 @@ def test_s_axes(dtype, s, axes, func):
 
 @pytest.mark.parametrize("dtype", [complex, float])
 @pytest.mark.parametrize("axes", [(2, 0, 2, 0), (0, 1, 1), (2, 0, 1, 3, 2, 1)])
-@pytest.mark.parametrize("func", ["rfftn", "irfftn"])
+@pytest.mark.parametrize("func", ["fftn", "ifftn", "rfftn", "irfftn"])
 def test_repeated_axes(dtype, axes, func):
     shape = (2, 3, 4, 5)
     if dtype is complex and func != "rfftn":
@@ -254,8 +275,27 @@ def test_repeated_axes(dtype, axes, func):
     else:
         x = np.random.random(shape)
 
-    r1 = getattr(mkl_fft, func)(x, axes=axes)
-    r2 = getattr(np.fft, func)(x, axes=axes)
+    r1 = getattr(np.fft, func)(x, axes=axes)
+    r2 = getattr(mkl_fft, func)(x, axes=axes)
+
+    rtol, atol = _get_rtol_atol(x)
+    assert_allclose(r1, r2, rtol=rtol, atol=atol)
+
+
+@requires_numpy_2_1
+@pytest.mark.parametrize("dtype", [complex, float])
+@pytest.mark.parametrize("axes", [(2, 3, 3, 2), (0, 0, 3, 3)])
+@pytest.mark.parametrize("s", [(5, 4, 3, 3), (7, 8, 10, 9)])
+@pytest.mark.parametrize("func", ["fftn", "ifftn", "rfftn", "irfftn"])
+def test_repeated_axes_with_s(dtype, axes, s, func):
+    shape = (2, 3, 4, 5)
+    if dtype is complex and func != "rfftn":
+        x = np.random.random(shape) + 1j * np.random.random(shape)
+    else:
+        x = np.random.random(shape)
+
+    r1 = getattr(np.fft, func)(x, axes=axes, s=s)
+    r2 = getattr(mkl_fft, func)(x, axes=axes, s=s)
 
     rtol, atol = _get_rtol_atol(x)
     assert_allclose(r1, r2, rtol=rtol, atol=atol)
@@ -274,4 +314,4 @@ def test_out_strided(axes, func):
     result = getattr(mkl_fft, func)(x, axes=axes, out=out)
     expected = getattr(np.fft, func)(x, axes=axes, out=out)
 
-    assert_allclose(result, expected)
+    assert_allclose(result, expected, strict=True)