fft, ifft and grads work for even size inputs

4bf5cc7f · slefrancois · 9c989bb2 · 4bf5cc7f · 4bf5cc7f · 4bf5cc7f
--- a/doc/library/gpuarray/fft.txt
+++ b/doc/library/gpuarray/fft.txt
 .. _libdoc_gpuarray_fft:

 ===================================================
-:mod:`gpuarray.fft` -- Type classes
+:mod:`gpuarray.fft` -- Fast Fourier Transforms
 ===================================================

 .. automodule:: theano.gpuarray.fft

--- a/theano/gpuarray/fft.py
+++ b/theano/gpuarray/fft.py
@@ -28,21 +28,6 @@ except (ImportError, Exception):


 class CuRFFTOp(Op):
-    """
-    Operator for the fast Fourier transform of a real-valued output on the GPU
-    using the scikits CUDA FFT through the gpuarray backend.
-
-    The input must be a real-valued float32 variable of dimensions (m, n). It
-    performs m 1-D FFTs of size n each.
-
-    The output is a GpuArray of dimensions (m, n/2+1, 2). The output contains
-    the n//2+1 non-trivial elements of the m real-valued FFTs. The real
-    and imaginary parts are stored as two float32 arrays, emulating complex64.
-    Since theano does not support complex number operations, care must be
-    taken to manually implement operators such as multiplication.
-
-    The module provides the convenience function curfft(input).
-    """

    __props__ = ()

@@ -131,31 +116,17 @@ class CuRFFTOp(Op):

    def grad(self, inputs, output_grads):
        gout, = output_grads
-        # gout = theano.printing.Print('aVANT')(gout)
-        gout = T.set_subtensor(gout[:,1:-1,:], gout[:,1:-1,:]*0.5) 
-        # gout = theano.printing.Print('apres')(gout)
+        # Divide the last dimension of the output gradients by 2, they are 
+        # double-counted by the real-IFFT due to symmetry, except the first
+        # and last elements (for even transforms) which are unique.
+        idx = [slice(None)] * (gout.ndim - 2) + [slice(1,-1)] + [slice(None)]
+        gout = T.set_subtensor(gout[idx], gout[idx]*0.5) 
        return [cuirfft_op(gout)]

 curfft_op = CuRFFTOp()


 class CuIRFFTOp(Op):
-    """
-    Operator for the inverse fast Fourier transform with real-valued output
-    on the GPU using the scikits CUDA FFT through the gpuarray backend.
-
-    The input is a variable of dimensions (m, n/2+1, 2) with
-    type float32 representing the n/2+1 non-trivial elements of m
-    real-valued Fourier transforms of initial size n. The real and imaginary
-    parts are stored as two float32 arrays, emulating complex64 given that
-    Theano does not support complex numbers.
-
-    The output is a real-valued float32 variable of dimensions (m, n)
-    giving the m inverse FFTs. *The output is NOT normalized*. You can
-    manualy divide by the size of the output array to normalize.
-
-    The module provides the convenience function cuirfft(input).
-    """

    __props__ = ()

@@ -202,7 +173,7 @@ class CuIRFFTOp(Op):
            # chop off the extra length-2 dimension for real/imag
            output_shape = list(input_shape[:-1])
            # restore full signal length
-            output_shape[-1] = (output_shape[-1] - 1) * 2
+            output_shape[-1] = (output_shape[-1] - 1) * 2 
            # if inputs[0][0][0,-1,1] != 0:
            #     output_shape[-1] += 1
            output_shape = tuple(output_shape)
@@ -234,7 +205,7 @@ class CuIRFFTOp(Op):

                fft.ifft(input_pycuda, output_pycuda, plan[0])
                # strangely enough, enabling rescaling here makes it run
-                # very, very slowly.  so do this rescaling manually
+                # very, very slowly, so do this rescaling manually
                # afterwards!

                # Sync results to ensure output contains completed computation
@@ -249,283 +220,88 @@ class CuIRFFTOp(Op):
    def grad(self, inputs, output_grads):
        gout, = output_grads
        gf = curfft_op(gout)
-        gf = T.set_subtensor(gf[:,1:-1,:], gf[:,1:-1,:]*2)
+        # Multiply the last dimension of the gradient by 2, they represent
+        # both positive and negative frequencies, except the first
+        # and last elements (for even transforms) which are unique.
+        idx = [slice(None)] * (gf.ndim - 2) + [slice(1,-1)] + [slice(None)]
+        gf = T.set_subtensor(gf[idx], gf[idx]*2)
        return [gf]

 cuirfft_op = CuIRFFTOp()

-class CuFFTOp(Op):
-    """
-    Operator for the fast Fourier transform of a real-valued output on the GPU
-    using the scikits CUDA FFT through the gpuarray backend.
-
-    The input must be a real-valued float32 variable of dimensions (m, n). It
-    performs m 1-D FFTs of size n each.
-
-    The output is a GpuArray of dimensions (m, n/2+1, 2). The output contains
-    the n//2+1 non-trivial elements of the m real-valued FFTs. The real
-    and imaginary parts are stored as two float32 arrays, emulating complex64.
-    Since theano does not support complex number operations, care must be
-    taken to manually implement operators such as multiplication.
-
-    The module provides the convenience function curfft(input).
-    """
-
-    __props__ = ()
-
-    def output_type(self, inp):
-        # add one extra dim for real/imag
-        return GpuArrayType(inp.dtype,
-                            broadcastable=[False] * (inp.type.ndim+1),
-                            context_name=inp.type.context_name)
-
-    def make_node(self, inp):
-        if not scikits_cuda_available:
-            raise RuntimeError("scikits.cuda is needed for CuFFTOp")
-
-        if not pygpu_available:
-            raise RuntimeError("pygpu is needed for CuFFTOp")
-
-        if not pycuda_available:
-            raise RuntimeError("pycuda is needed for CuFFTOp")
-        print(inp)
-
-        inp = basic_ops.gpu_contiguous(
-            basic_ops.as_gpuarray_variable(inp,
-                                           basic_ops.infer_context_name(inp)))
-        assert inp.dtype == "float32"
-
-        return theano.Apply(self, [inp], [self.output_type(inp)()])
-
-    def make_thunk(self, node, storage_map, _, _2):
-
-        inputs = [storage_map[v] for v in node.inputs]
-        outputs = [storage_map[v] for v in node.outputs]
-
-        # Initiliaze cuda context to the input's.
-        with node.inputs[0].type.context:
-            scikits.cuda.misc.init()
-
-        plan_input_shape = [None]
-        plan = [None]
-
-        def thunk():
-            input_shape = inputs[0][0].shape
-
-            # construct output shape
-            output_shape = list(input_shape)
-            # DFT of real input is symmetric, no need to store
-            # redundant coefficients
-            # output_shape[-1] = output_shape[-1] // 2 + 1
-            # extra dimension with length 2 for real/imag
-            output_shape += [2]
-            output_shape = tuple(output_shape)
-
-            z = outputs[0]
-
-            # only allocate if there is no previous allocation of the
-            # right size.
-            if z[0] is None or z[0].shape != output_shape:
-                z[0] = pygpu.zeros(output_shape, context=inputs[0][0].context,
-                                   dtype='float32')
-
-            input_pycuda = T.stack(inputs[0][0],T.zeros_like(inputs[0][0]))
-            # I thought we'd need to change the type on output_pycuda
-            # so it is complex64, but as it turns out scikits.cuda.fft
-            # doesn't really care either way and treats the array as
-            # if it is complex64 anyway.
-            output_pycuda = z[0]
-
-            with input_pycuda.context:
-                # only initialise plan if necessary
-                if plan[0] is None or plan_input_shape[0] != input_shape:
-                    plan_input_shape[0] = input_shape
-                    plan[0] = fft.Plan(input_shape[1:-1], np.complex64, np.complex64,
-                                       batch=input_shape[0])
-                # Sync GPU variables before computation
-                input_pycuda.sync()
-                output_pycuda.sync()
-
-                fft.fft(input_pycuda, output_pycuda, plan[0])
-                # Sync results to ensure output contains completed computation
-                pycuda.driver.Context.synchronize()
-
-        thunk.inputs = inputs
-        thunk.outputs = outputs
-        thunk.lazy = False
-
-        return thunk
-
-    def grad(self, inputs, output_grads):
-        gout, = output_grads
-        return [cuifft_op(gout)]
-
-cufft_op = CuFFTOp()
-
-
-class CuIFFTOp(Op):
-    """
-    Operator for the inverse fast Fourier transform with real-valued output
-    on the GPU using the scikits CUDA FFT through the gpuarray backend.
-
-    The input is a variable of dimensions (m, n/2+1, 2) with
-    type float32 representing the n/2+1 non-trivial elements of m
-    real-valued Fourier transforms of initial size n. The real and imaginary
-    parts are stored as two float32 arrays, emulating complex64 given that
-    Theano does not support complex numbers.
-
-    The output is a real-valued float32 variable of dimensions (m, n)
-    giving the m inverse FFTs. *The output is NOT normalized*. You can
-    manualy divide by the size of the output array to normalize.
-
-    The module provides the convenience function cuirfft(input).
+def curfft(inputs, norm=None):
    """
+    Performs the fast Fourier transform of a real-valued output on the GPU 
+    through the gpuarray backend.

-    __props__ = ()
-
-    def output_type(self, inp):
-        # add one extra dim for real/imag
-        return GpuArrayType(inp.dtype,
-                            broadcastable=[False] * (inp.type.ndim),
-                            context_name=inp.type.context_name)
-
-    def make_node(self, inp):
-        if not scikits_cuda_available:
-            raise RuntimeError("scikits.cuda is needed for CuIFFTOp")
-
-        if not pygpu_available:
-            raise RuntimeError("pygpu is needed for CuIFFTOp")
+    The input must be a real-valued float32 variable of dimensions (m, ..., n).
+    It performs FFTs of size (..., n) on m batches.

-        if not pycuda_available:
-            raise RuntimeError("pycuda is needed for CuIFFTOp")
-
-        inp = basic_ops.gpu_contiguous(
-            basic_ops.as_gpuarray_variable(inp,
-                                           basic_ops.infer_context_name(inp)))
-
-        assert inp.dtype == "float32"
-
-        return theano.Apply(self, [inp], [self.output_type(inp)()])
-
-    def make_thunk(self, node, storage_map, _, _2):
-
-        inputs = [storage_map[v] for v in node.inputs]
-        outputs = [storage_map[v] for v in node.outputs]
-
-        # Initiliaze cuda context to the input's.
-        with node.inputs[0].type.context:
-            scikits.cuda.misc.init()
-
-        plan_input_shape = [None]
-        plan = [None]
-
-        def thunk():
-            input_shape = inputs[0][0].shape
-
-            # construct output shape
-            # chop off the extra length-2 dimension for real/imag
-            output_shape = list(input_shape)
-            # restore full signal length
-            # output_shape[-1] = (output_shape[-1] - 1) * 2
-            output_shape = tuple(output_shape)
-
-            z = outputs[0]
-
-            # only allocate if there is no previous allocation of the
-            # right size.
-            if z[0] is None or z[0].shape != output_shape:
-                z[0] = pygpu.zeros(output_shape, context=inputs[0][0].context,
-                                   dtype='float32')
-
-            input_pycuda = inputs[0][0]
-            # input_pycuda is a float32 array with an extra dimension,
-            # but will be interpreted by scikits.cuda as a complex64
-            # array instead.
-            output_pycuda = z[0]
-
-            with input_pycuda.context:
-                # only initialise plan if necessary
-                if plan[0] is None or plan_input_shape[0] != input_shape:
-                    plan_input_shape[0] = input_shape
-                    plan[0] = fft.Plan(input_shape[1:-1],
-                                       np.complex64, np.complex64,
-                                       batch=output_shape[0])
-                # Sync GPU variables before computation
-                input_pycuda.sync()
-                output_pycuda.sync()
-
-                fft.ifft(input_pycuda, output_pycuda, plan[0])
-                # strangely enough, enabling rescaling here makes it run
-                # very, very slowly.  so do this rescaling manually
-                # afterwards!
-
-                # Sync results to ensure output contains completed computation
-                pycuda.driver.Context.synchronize()
-
-        thunk.inputs = inputs
-        thunk.outputs = outputs
-        thunk.lazy = False
-
-        return thunk
-        
-    def grad(self, inputs, output_grads):
-        gout, = output_grads
-        return [cufft_op(gout)]
-
-cuifft_op = CuIFFTOp()
-
-def curfft(inputs, norm=None):
-    """
-    Performs the real-valued input fast Fourier Transform using the
-    gpuarray backend.
+    The output is a GpuArray of dimensions (m, ..., n//2+1, 2). The second to 
+    last dimension of the output contains the n//2+1 non-trivial elements of
+    the real-valued FFTs. The real and imaginary parts are stored as two
+    float32 arrays, emulating complex64. Since theano does not support complex
+    number operations, care must be taken to manually implement operators such
+    as multiplication.

    Parameters
    ----------
    inputs
-        Array of real-valued float32 of size (m, n), containing m inputs of
-        length n.
+        Array of real-valued float32 of size (m, ..., n), containing m inputs of
+        size (..., n).
    norm : {None, 'ortho', 'no_norm'}
        Normalization of transform. Following numpy, default *None* normalizes
        only the inverse transform by n, 'ortho' yields the unitary transform
-        (:math:`1/\sqrt n` forward and back). In addition, 'no_norm' leaves
+        (:math:`1/\sqrt n` forward and inverse). In addition, 'no_norm' leaves
        the transform unnormalized.
+
    """

    cond_norm = _unitary(norm)
-    if cond_norm is None:
+    if cond_norm is None or cond_norm == "no_norm":
        return curfft_op(inputs)
    elif cond_norm == "ortho":
        return curfft_op(inputs) / T.sqrt(((inputs.shape[1:]).prod())
                                          .astype('float32'))
-    elif cond_norm == "no_norm":
-        return curfft_op(inputs)
-
+                                          

 def cuirfft(inputs, norm=None):
    """
    Performs the real-valued output inverse Fourier Transform using the
    gpuarray backend.

+    The input is a variable of dimensions (m, ..., n//2+1, 2) with
+    type float32 representing the non-trivial elements of m
+    real-valued Fourier transforms of initial size (..., n). The real and
+    imaginary parts are stored as two float32 arrays, emulating complex64
+    given that Theano does not support complex numbers.
+
+    The output is a real-valued float32 variable of dimensions (m, ..., n)
+    giving the m inverse FFTs. 
+
    Parameters
    ----------
    inputs
-        Array of float32 of size (m, n/2+1, 2), containing m inputs with n/2+1
-        non-trivial elements and real and imaginary parts stored as separate
-        arrays.
+        Array of float32 of size (m, ..., n//2+1, 2), containing m inputs 
+        with n/2+1 non-trivial elements on the last dimension and real
+        and imaginary parts stored as separate arrays.
    norm : {None, 'ortho', 'no_norm'}
        Normalization of transform. Following numpy, default *None* normalizes
        only the inverse transform by n, 'ortho' yields the unitary transform
-        (:math:`1/\sqrt n` forward and back). In addition, 'no_norm' leaves
+        (:math:`1/\sqrt n` forward and inverse). In addition, 'no_norm' leaves
        the transform unnormalized.
+        
    """

    cond_norm = _unitary(norm)
    if cond_norm is None:
-        return cuirfft_op(inputs) / (((inputs.shape[1:-1] - 1) * 2).prod()
+        return cuirfft_op(inputs) / ((inputs.shape[1:-2].prod() *
+                                    ((inputs.shape[-2] - 1) * 2))
                                     .astype('float32'))
    if cond_norm == "ortho":
-        return cuirfft_op(inputs) / T.sqrt((((inputs.shape[1:-1] - 1) * 2).prod())
-                                           .astype('float32'))
+        return cuirfft_op(inputs) / T.sqrt((inputs.shape[1:-2].prod() *
+                                    ((inputs.shape[-2] - 1) * 2))
+                                     .astype('float32'))
    if cond_norm == "no_norm":
        return cuirfft_op(inputs)


--- a/theano/gpuarray/tests/test_fft.py
+++ b/theano/gpuarray/tests/test_fft.py
@@ -25,151 +25,143 @@ if not pycuda_available:  # noqa
 import theano.gpuarray.cuda_fft

 # Transform sizes
-N = 64
+N = 16


 class TestFFT(unittest.TestCase):

-    # def test_rfft(self):
-    #     inputs_val = np.random.random((1, N)).astype('float32')
-    #     inputs = theano.shared(inputs_val)
-    # 
-    #     rfft = theano.gpuarray.fft.curfft(inputs)
-    #     f_rfft = theano.function([], rfft, mode=mode_with_gpu)
-    #     res_rfft = f_rfft()
-    #     res_rfft_comp = (np.asarray(res_rfft[:, :, 0]) +
-    #                      1j * np.asarray(res_rfft[:, :, 1]))
+    def test_rfft(self):
+        inputs_val = np.random.random((1, N, N)).astype('float32')
+        inputs = theano.shared(inputs_val)
+    
+        rfft = theano.gpuarray.fft.curfft(inputs)
+        f_rfft = theano.function([], rfft, mode=mode_with_gpu)
+        res_rfft = f_rfft()
+        res_rfft_comp = (np.asarray(res_rfft[:, :, :, 0]) +
+                         1j * np.asarray(res_rfft[:, :, :, 1]))
+    
+        rfft_ref = numpy.fft.rfftn(inputs_val, axes=(1,2))
+    
+        utt.assert_allclose(rfft_ref, res_rfft_comp, atol=1e-4, rtol=1e-4)
+    
+    def test_irfft(self):
+        inputs_val = np.random.random((1, N, N)).astype('float32')
+        inputs = theano.shared(inputs_val)
+    
+        fft = theano.gpuarray.fft.curfft(inputs)
+        f_fft = theano.function([], fft, mode=mode_with_gpu)
+        res_fft = f_fft()
+    
+        m = fft.type()
+        ifft = theano.gpuarray.fft.cuirfft(m)
+        f_ifft = theano.function([m], ifft, mode=mode_with_gpu)
+        res_ifft = f_ifft(res_fft)
+    
+        utt.assert_allclose(inputs_val, np.asarray(res_ifft))
+    
+    def test_type(self):
+        inputs_val = np.random.random((1, N)).astype('float64')
+        inputs = theano.shared(inputs_val)
+    
+        with self.assertRaises(AssertionError):
+            theano.gpuarray.fft.curfft(inputs)
+        with self.assertRaises(AssertionError):
+            theano.gpuarray.fft.cuirfft(inputs)
+    
+    def test_norm(self):
+        inputs_val = np.random.random((1, N, N)).astype('float32')
+        inputs = theano.shared(inputs_val)
+    
+        # Unitary normalization
+        rfft = theano.gpuarray.fft.curfft(inputs, norm='ortho')
+        f_rfft = theano.function([], rfft, mode=mode_with_gpu)
+        res_rfft = f_rfft()
+        res_rfft_comp = (np.asarray(res_rfft[:, :, :, 0]) +
+                         1j * np.asarray(res_rfft[:, :, :, 1]))
+    
+        rfft_ref_ortho = numpy.fft.rfftn(inputs_val, axes=(1,2), norm='ortho')
+    
+        utt.assert_allclose(rfft_ref_ortho, res_rfft_comp,
+        atol=1e-4, rtol=1e-4)
+    
+        # No normalization
+        rfft = theano.gpuarray.fft.curfft(inputs, norm='no_norm')
+        f_rfft = theano.function([], rfft, mode=mode_with_gpu)
+        res_rfft = f_rfft()
+        res_rfft_comp = (np.asarray(res_rfft[:, :, :, 0]) +
+                         1j * np.asarray(res_rfft[:, :, :, 1]))
+    
+        utt.assert_allclose(rfft_ref_ortho * np.sqrt(N*N),
+            res_rfft_comp, atol=1e-4, rtol=1e-4)
+    
+        # Inverse FFT inputs
+        inputs_val = np.random.random((1, N, N // 2 + 1, 2)).astype('float32')
+        inputs = theano.shared(inputs_val)
+        inputs_ref = inputs_val[:, :, :, 0] + 1j * inputs_val[:, :, :, 1]
+    
+        # Unitary normalization inverse FFT
+        irfft = theano.gpuarray.fft.cuirfft(inputs, norm='ortho')
+        f_irfft = theano.function([], irfft, mode=mode_with_gpu)
+        res_irfft = f_irfft()
+    
+        irfft_ref_ortho = numpy.fft.irfftn(inputs_ref, axes=(1,2), norm='ortho')
+    
+        utt.assert_allclose(irfft_ref_ortho,
+        res_irfft, atol=1e-4, rtol=1e-4)
+    
+        # No normalization inverse FFT
+        irfft = theano.gpuarray.fft.cuirfft(inputs, norm='no_norm')
+        f_irfft = theano.function([], irfft, mode=mode_with_gpu)
+        res_irfft = f_irfft()
+    
+        utt.assert_allclose(irfft_ref_ortho * np.sqrt(N*N),
+            res_irfft, atol=1e-4, rtol=1e-4)
+
+    def test_grad(self):
+        # The numerical gradient of the FFT is sensitive, must set large
+        # enough epsilon to get good accuracy.
+        eps = 1e-1
+    
+        inputs_val = np.random.random((1, N, N)).astype('float32')
+        utt.verify_grad(theano.gpuarray.fft.curfft_op, [inputs_val], eps=eps)
+    
+        inputs_val = np.random.random((1, N, N // 2 + 1, 2)).astype('float32')
+        utt.verify_grad(theano.gpuarray.fft.cuirfft_op, [inputs_val], eps=eps)
+    #     
    # 
-    #     rfft_ref = numpy.fft.rfft(inputs_val, N, 1)
+    # def test_odd(self):
+    #     M = N - 1
    # 
-    #     utt.assert_allclose(rfft_ref, res_rfft_comp, atol=1e-4, rtol=1e-4)
-
-    # def test_irfft(self):
-    #     inputs_val = np.random.random((1, N)).astype('float32')
-    #     inputs = theano.shared(inputs_val)
-    #
-    #     fft = theano.gpuarray.fft.curfft(inputs)
-    #     f_fft = theano.function([], fft, mode=mode_with_gpu)
-    #     res_fft = f_fft()
-    #
-    #     m = fft.type()
-    #     ifft = theano.gpuarray.fft.cuirfft(m)
-    #     f_ifft = theano.function([m], ifft, mode=mode_with_gpu)
-    #     res_ifft = f_ifft(res_fft)
-    #
-    #     utt.assert_allclose(inputs_val, np.asarray(res_ifft))
-    #
-    # def test_type(self):
-    #     inputs_val = np.random.random((1, N)).astype('float64')
+    #     inputs_val = np.random.random((1, M, M)).astype('float32')
    #     inputs = theano.shared(inputs_val)
-    #
-    #     with self.assertRaises(AssertionError):
-    #         theano.gpuarray.fft.curfft(inputs)
-    #     with self.assertRaises(AssertionError):
-    #         theano.gpuarray.fft.cuirfft(inputs)
-    #
-    # def test_norm(self):
-    #     inputs_val = np.random.random((1, N)).astype('float32')
-    #     inputs = theano.shared(inputs_val)
-    #
-    #     # Unitary normalization
-    #     rfft = theano.gpuarray.fft.curfft(inputs, norm='ortho')
-    #     f_rfft = theano.function([], rfft, mode=mode_with_gpu)
-    #     res_rfft = f_rfft()
-    #     res_rfft_comp = (np.asarray(res_rfft[:, :, 0]) +
-    #                      1j * np.asarray(res_rfft[:, :, 1]))
-    #
-    #     rfft_ref_ortho = numpy.fft.rfft(inputs_val, N, 1, norm='ortho')
-    #
-    #     utt.assert_allclose(rfft_ref_ortho, res_rfft_comp)
-    #
-    #     # No normalization
+    # 
    #     rfft = theano.gpuarray.fft.curfft(inputs, norm='no_norm')
    #     f_rfft = theano.function([], rfft, mode=mode_with_gpu)
    #     res_rfft = f_rfft()
-    #     res_rfft_comp = (np.asarray(res_rfft[:, :, 0]) +
-    #                      1j * np.asarray(res_rfft[:, :, 1]))
-    #
-    #     utt.assert_allclose(rfft_ref_ortho * np.sqrt(N), res_rfft_comp)
-    #
-    #     # Inverse FFT inputs
-    #     inputs_val = np.random.random((1, N // 2 + 1, 2)).astype('float32')
-    #     inputs = theano.shared(inputs_val)
-    #     inputs_ref = inputs_val[:, :, 0] + 1j * inputs_val[:, :, 1]
-    #
-    #     # Unitary normalization inverse FFT
-    #     irfft = theano.gpuarray.fft.cuirfft(inputs, norm='ortho')
-    #     f_irfft = theano.function([], irfft, mode=mode_with_gpu)
-    #     res_irfft = f_irfft()
-    #
-    #     irfft_ref_ortho = numpy.fft.irfft(inputs_ref, norm='ortho')
-    #
-    #     utt.assert_allclose(irfft_ref_ortho, res_irfft)
-    #
-    #     # No normalization inverse FFT
-    #     irfft = theano.gpuarray.fft.cuirfft(inputs, norm='no_norm')
-    #     f_irfft = theano.function([], irfft, mode=mode_with_gpu)
-    #     res_irfft = f_irfft()
-    #
-    #     utt.assert_allclose(irfft_ref_ortho * np.sqrt(N), res_irfft)
-
-    # def test_fft(self):
-    #     # inputs_val = np.random.random((1, N, N)).astype('float32')
-    #     # inputs = theano.shared(inputs_val)
-    #     #
-    #     # fft = theano.gpuarray.fft.cufft_op(inputs)
-    #     # f_fft = theano.function([], fft, mode=mode_with_gpu)
-    #     # res_fft = f_fft()
-    #     # res_fft_comp = (np.asarray(res_fft[:, :,:, 0]) +
-    #     #                  1j * np.asarray(res_fft[:, :,:, 1]))
-    #     #
-    #     # # inputs_ref = inputs_val[:,:,:,0] + 1j*inputs_val[:,:,:,1]
-    #     # fft_ref = numpy.fft.fftn(inputs_val, (N,N), axes=(1,2))
-    # 
-    #     inputs_val = np.random.random((1, N, 2)).astype('float32')
-    #     # inputs = theano.shared(inputs_val)
-    #     inputs = T.tensor3('inputs', dtype='float32')
    # 
-    #     fft = theano.gpuarray.fft.cufft_op(inputs)
-    #     f_fft = theano.function([inputs], fft, mode=mode_with_gpu)
-    #     res_fft = f_fft(inputs_val)
-    #     res_fft_comp = (np.asarray(res_fft[:, :, 0]) +
-    #                      1j * np.asarray(res_fft[:, :, 1]))
+    #     res_rfft_comp = (np.asarray(res_rfft[:, :, :, 0]) +
+    #                      1j * np.asarray(res_rfft[:, :, :, 1]))
    # 
-    #     inputs_ref = inputs_val[:,:,0] + 1j*inputs_val[:,:,1]
-    #     fft_ref = numpy.fft.fft(inputs_ref, N, 1)
+    #     rfft_ref = numpy.fft.rfftn(inputs_val, s=(M,M), axes=(1,2))#, s=(M, M), axes=(1,2))
    # 
-    #     utt.assert_allclose(fft_ref, res_fft_comp, atol=1e-4, rtol=1e-4)
-
-    # def test_ifft(self):
-    #     inputs_val = np.random.random((1, N, 2)).astype('float32')
-    #     inputs = theano.shared(inputs_val)
+    #     utt.assert_allclose(rfft_ref, res_rfft_comp, atol=1e-4, rtol=1e-4) 
    # 
-    #     fft = theano.gpuarray.fft.cufft_op(inputs)
-    #     f_fft = theano.function([], fft, mode=mode_with_gpu)
-    #     res_fft = f_fft()
-    # 
-    #     m = fft.type()
-    #     ifft = theano.gpuarray.fft.cuifft_op(m)
+    #     m = rfft.type()
+    #     ifft = theano.gpuarray.fft.cuirfft(m, norm='no_norm')
    #     f_ifft = theano.function([m], ifft, mode=mode_with_gpu)
-    #     res_ifft = f_ifft(res_fft)
+    #     res_ifft = f_ifft(res_rfft)
    # 
-    #     utt.assert_allclose(inputs_val, np.asarray(res_ifft) / N)
-
-    def test_grad(self):
-        # The numerical gradient of the FFT is sensitive, must set large
-        # enough epsilon to get good accuracy.
-        eps = 1e-1
-        
-        inputs_val = np.random.random((1, N)).astype('float32')
-        utt.verify_grad(theano.gpuarray.fft.curfft_op, [inputs_val], eps=eps)
-
-        inputs_val = np.random.random((1, N // 2 + 1, 2)).astype('float32')
-        utt.verify_grad(theano.gpuarray.fft.cuirfft_op, [inputs_val], eps=eps)
+    #     utt.assert_allclose(inputs_val, np.asarray(res_ifft)/M**2)
        
-        # M = 61
-        # inputs_val = np.random.random((1, M)).astype('float32')
-        # utt.verify_grad(theano.gpuarray.fft.curfft_op, [inputs_val], eps=eps)
+        # inputs_val = np.random.random((1, M//2+1, 2)).astype('float32')
+        # # inputs_val[0,0,1] = 0
+        # inputs = theano.shared(inputs_val)
+        # 
+        # irfft = theano.gpuarray.fft.cuirfft(inputs)
+        # f_irfft = theano.function([], irfft, mode=mode_with_gpu)
+        # res_irfft = f_irfft()
+        # 
+        # inputs_ref = inputs_val[:, :, 0] + 1j * inputs_val[:, :, 1]
+        # irfft_ref = numpy.fft.irfft(inputs_ref, n=M, axis=1)#, s=(M, M), axes=(1,2))
        # 
-        # inputs_val = np.random.random((1, M // 2 + 1, 2)).astype('float32')
-        # utt.verify_grad(theano.gpuarray.fft.cuirfft_op, [inputs_val], eps=eps)
+        # utt.assert_allclose(irfft_ref, res_irfft, atol=1e-4, rtol=1e-4)