Merge pull request #4535 from slefrancois/fft_gpuarray

Port FFTs to gpuarray backend

Merge pull request #4535 from slefrancois/fft_gpuarray
f0a9781a · abergeron · GitHub · 76afe07b · 7adcb4b9 · f0a9781a
--- a/doc/library/gpuarray/fft.txt
+++ b/doc/library/gpuarray/fft.txt
+.. _libdoc_gpuarray_fft:
+
+==============================================
+:mod:`gpuarray.fft` -- Fast Fourier Transforms
+==============================================
+
+Performs Fast Fourier Transforms (FFT) on the GPU.
+
+FFT gradients are implemented as the opposite Fourier transform of the output gradients.
+
+.. note ::
+    You must install `scikit-cuda <http://scikit-cuda.readthedocs.io/en/latest>`_
+    to compute Fourier transforms on the GPU.
+
+
+.. warning ::
+    The real and imaginary parts of the Fourier domain arrays are stored as a pair of float32
+    arrays, emulating complex64. Since theano has limited support for complex
+    number operations, care must be taken to manually implement operations such as gradients.
+
+.. automodule:: theano.gpuarray.fft
+   :members: curfft, cuirfft
+
+For example, the code below computes the real-input FFT of a box function; the result is a sinc function.
+The absolute value is plotted, since the phase oscillates due to the box function being
+shifted to the middle of the array. The Theano flag ``device=cuda{0,1...}`` must be used.
+
+.. testcode::
+
+    import numpy as np
+    import theano
+    import theano.tensor as T
+    from theano.gpuarray import fft
+
+    x = T.matrix('x', dtype='float32')
+
+    rfft = fft.curfft(x, norm='ortho')
+    f_rfft = theano.function([x], rfft)
+
+    N = 1024
+    box = np.zeros((1, N), dtype='float32')
+    box[:, N//2-10: N//2+10] = 1
+
+    out = f_rfft(box)
+    c_out = np.asarray(out[0, :, 0] + 1j*out[0, :, 1])
+    abs_out = abs(c_out)
+
+.. testoutput::
+   :hide:
+   :options: +SKIP
+
+   ...
+
+.. image:: plot_fft.png
--- a/doc/library/gpuarray/index.txt
+++ b/doc/library/gpuarray/index.txt
@@ -15,5 +15,6 @@

    op
    dnn
+    fft
    type
    extra
--- a/doc/library/gpuarray/plot_fft.png
+++ b/doc/library/gpuarray/plot_fft.png
--- a/doc/library/tensor/fft.txt
+++ b/doc/library/tensor/fft.txt
+.. _libdoc_tensor_fft:
+
+==============================================
+:mod:`tensor.fft` -- Fast Fourier Transforms
+==============================================
+
+Performs Fast Fourier Transforms (FFT).
+
+FFT gradients are implemented as the opposite Fourier transform of the output gradients.
+
+.. warning ::
+    The real and imaginary parts of the Fourier domain arrays are stored as a pair of float
+    arrays, emulating complex. Since theano has limited support for complex
+    number operations, care must be taken to manually implement operations such as gradients.
+
+.. automodule:: theano.tensor.fft
+   :members: rfft, irfft
+
+For example, the code below computes the real-input FFT of a box function;
+the result is a sinc function. The absolute value is plotted, since the phase
+oscillates due to the box function being shifted to the middle of the array.
+
+.. testcode::
+
+    import numpy as np
+    import theano
+    import theano.tensor as T
+    from theano.tensor import fft
+
+    x = T.matrix('x', dtype='float64')
+
+    rfft = fft.rfft(x, norm='ortho')
+    f_rfft = theano.function([x], rfft)
+
+    N = 1024
+    box = np.zeros((1, N), dtype='float64')
+    box[:, N//2-10: N//2+10] = 1
+
+    out = f_rfft(box)
+    c_out = np.asarray(out[0, :, 0] + 1j*out[0, :, 1])
+    abs_out = abs(c_out)
+
+.. image:: plot_fft.png
--- a/doc/library/tensor/index.txt
+++ b/doc/library/tensor/index.txt
@@ -29,3 +29,4 @@ They are grouped into the following sections:
    opt
    slinalg
    nlinalg
+    fft
--- a/doc/library/tensor/plot_fft.png
+++ b/doc/library/tensor/plot_fft.png
--- a/theano/gpuarray/__init__.py
+++ b/theano/gpuarray/__init__.py
@@ -28,7 +28,7 @@ from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
                   GpuArraySharedVariable, gpuarray_shared_constructor,
                   reg_context, get_context, ContextNotDefined)
 from .basic_ops import as_gpuarray_variable
-from . import dnn, opt, nerv, extra_ops
+from . import fft, dnn, opt, nerv, extra_ops

 def transfer(x, target):
    try:

--- a/theano/gpuarray/fft.py
+++ b/theano/gpuarray/fft.py
+from __future__ import absolute_import, print_function, division
+
+import numpy as np
+import theano
+from theano import Op
+import theano.tensor as T
+from theano.gradient import DisconnectedType
+
+from theano.gpuarray import (basic_ops, GpuArrayType)
+
+import theano.tensor.fft
+from .opt import register_opt, op_lifter
+
+# Optional GPU dependencies. Each import is attempted once and its outcome is
+# recorded in a module-level flag; the ops' make_node methods check these
+# flags and raise RuntimeError when a required package is missing.
+try:
+    import pygpu
+    pygpu_available = True
+except ImportError:
+    pygpu_available = False
+
+try:
+    import pycuda.driver
+    pycuda_available = True
+except ImportError:
+    pycuda_available = False
+
+# Any exception, not only ImportError, marks scikit-cuda as unavailable
+# (ImportError is already covered by Exception).
+# NOTE(review): presumably skcuda can fail at import time with other errors,
+# e.g. when the CUDA libraries cannot be loaded -- confirm.
+try:
+    import skcuda
+    from skcuda import fft
+    scikits_cuda_available = True
+except (ImportError, Exception):
+    scikits_cuda_available = False
+
+
+class CuRFFTOp(Op):
+    """
+    Forward FFT of a real-valued float32 input, computed on the GPU through
+    scikit-cuda.
+
+    The node takes the data and a 1-d integer vector ``s`` giving the
+    transform shape. Its output is a float32 GpuArray with one more dimension
+    than the input; that trailing dimension has length 2 and holds the real
+    and imaginary parts.
+
+    """
+
+    __props__ = ()
+
+    def output_type(self, inp):
+        """Return the output type: same dtype and context, one extra dim."""
+        # add one extra dim for real/imag
+        return GpuArrayType(inp.dtype,
+                            broadcastable=[False] * (inp.type.ndim + 1),
+                            context_name=inp.type.context_name)
+
+    def make_node(self, inp, s=None):
+        """
+        Build the Apply node for ``inp`` and the optional shape ``s``.
+
+        Raises RuntimeError if skcuda, pygpu or pycuda is unavailable, and
+        AssertionError if ``inp`` is not float32 or ``s`` is not a 1-d
+        integer vector.
+
+        """
+        # A shape parameter s can be provided as an input. For now this is used to
+        # manage odd transform sizes.
+        # Later this could be extended to handle padding and truncation,
+        # following numpy's interface. However, cuFFT expects arrays that match
+        # the shape given to the plan, so padding will have to be done in the op.
+        # The effect of padding on gradients has yet to be investigated.
+
+        if not scikits_cuda_available:
+            raise RuntimeError("skcuda is needed for CuFFTOp")
+
+        if not pygpu_available:
+            raise RuntimeError("pygpu is needed for CuFFTOp")
+
+        if not pycuda_available:
+            raise RuntimeError("pycuda is needed for CuFFTOp")
+
+        # Move the input to the GPU (if needed) and make it contiguous.
+        inp = basic_ops.gpu_contiguous(
+            basic_ops.as_gpuarray_variable(inp,
+                                           basic_ops.infer_context_name(inp)))
+
+        # If no shape is provided as input, default to input data shape.
+        if s is None:
+            s = inp.shape[1:]
+        s = T.as_tensor_variable(s)
+
+        assert inp.dtype == "float32"
+        assert s.ndim == 1
+        assert 'int' in s.dtype
+
+        return theano.Apply(self, [inp, s], [self.output_type(inp)()])
+
+    def make_thunk(self, node, storage_map, _, _2):
+        """
+        Return a non-lazy thunk that runs the forward transform.
+
+        The cuFFT plan is kept between calls and rebuilt only when the input
+        shape changes; the output buffer is reallocated only when its shape
+        changes.
+
+        """
+
+        inputs = [storage_map[v] for v in node.inputs]
+        outputs = [storage_map[v] for v in node.outputs]
+
+        # Initialize cuda context to the input's.
+        with node.inputs[0].type.context:
+            skcuda.misc.init()
+
+        # One-element lists so the nested thunk can rebind the cached values.
+        plan_input_shape = [None]
+        plan = [None]
+
+        def thunk():
+            input_shape = inputs[0][0].shape
+            s = inputs[1][0]
+
+            # Since padding is not supported, assert s matches input shape.
+            assert (input_shape[1:] == s).all()
+
+            # construct output shape
+            output_shape = [input_shape[0]] + list(s)
+            # DFT of real input is symmetric, no need to store
+            # redundant coefficients
+            output_shape[-1] = output_shape[-1] // 2 + 1
+            # extra dimension with length 2 for real/imag
+            output_shape += [2]
+            output_shape = tuple(output_shape)
+
+            z = outputs[0]
+
+            # only allocate if there is no previous allocation of the
+            # right size.
+            if z[0] is None or z[0].shape != output_shape:
+                z[0] = pygpu.zeros(output_shape, context=inputs[0][0].context,
+                                   dtype='float32')
+
+            input_pycuda = inputs[0][0]
+            # I thought we'd need to change the type on output_pycuda
+            # so it is complex64, but as it turns out skcuda.fft
+            # doesn't really care either way and treats the array as
+            # if it is complex64 anyway.
+            output_pycuda = z[0]
+
+            with input_pycuda.context:
+                # only initialise plan if necessary
+                if plan[0] is None or plan_input_shape[0] != input_shape:
+                    plan_input_shape[0] = input_shape
+                    plan[0] = fft.Plan(s, np.float32, np.complex64,
+                                       batch=input_shape[0])
+
+                # Sync GPU variables before computation
+                input_pycuda.sync()
+                output_pycuda.sync()
+
+                fft.fft(input_pycuda, output_pycuda, plan[0])
+
+                # Sync results to ensure output contains completed computation
+                pycuda.driver.Context.synchronize()
+
+        thunk.inputs = inputs
+        thunk.outputs = outputs
+        thunk.lazy = False
+
+        return thunk
+
+    def grad(self, inputs, output_grads):
+        """
+        Return the gradient with respect to the data input.
+
+        The gradient is the inverse real FFT (``cuirfft_op``) of the output
+        gradient after halving part of its second to last axis; the shape
+        input gets a disconnected gradient.
+
+        """
+        gout, = output_grads
+        s = inputs[1]
+        # Divide the last dimension of the output gradients by 2, they are
+        # double-counted by the real-IFFT due to symmetry, except the first
+        # and last elements (for even transforms) which are unique.
+        idx = [slice(None)] * (gout.ndim - 2) \
+            + [slice(1, (s[-1] // 2) + (s[-1] % 2))] + [slice(None)]
+        gout = T.set_subtensor(gout[idx], gout[idx] * 0.5)
+        return [cuirfft_op(gout, s), DisconnectedType()()]
+
+    def connection_pattern(self, node):
+        # Specify that shape input parameter has no connection to graph and gradients.
+        return [[True], [False]]
+
+curfft_op = CuRFFTOp()
+
+
+class CuIRFFTOp(Op):
+    """
+    Inverse FFT with real-valued float32 output, computed on the GPU through
+    scikit-cuda.
+
+    The node takes the data, a float32 array whose trailing dimension of
+    length 2 holds real and imaginary parts, and a 1-d vector ``s`` giving
+    the shape of the real output transform. Its output is a float32 GpuArray
+    with one dimension fewer than the input. The result is not rescaled.
+
+    """
+
+    __props__ = ()
+
+    def output_type(self, inp):
+        """Return the output type: same dtype and context, one dim fewer."""
+        # remove extra dim for real/imag
+        return GpuArrayType(inp.dtype,
+                            broadcastable=[False] * (inp.type.ndim - 1),
+                            context_name=inp.type.context_name)
+
+    def make_node(self, inp, s=None):
+        """
+        Build the Apply node for ``inp`` and the optional output shape ``s``.
+
+        Raises RuntimeError if skcuda, pygpu or pycuda is unavailable, and
+        AssertionError if ``inp`` is not float32 or ``s`` is not 1-d.
+
+        """
+        # A shape parameter is expected as an input. For now this is used to
+        # manage odd transform sizes.
+        # Later this could be extended to handle padding and truncation,
+        # following numpy's interface. However, cuFFT expects arrays that match
+        # the shape given to the plan, so padding will have to be done in the op.
+        # The effect of padding on gradients has yet to be investigated.
+
+        if not scikits_cuda_available:
+            raise RuntimeError("skcuda is needed for CuIFFTOp")
+
+        if not pygpu_available:
+            raise RuntimeError("pygpu is needed for CuIFFTOp")
+
+        if not pycuda_available:
+            raise RuntimeError("pycuda is needed for CuIFFTOp")
+
+        # Move the input to the GPU (if needed) and make it contiguous.
+        inp = basic_ops.gpu_contiguous(
+            basic_ops.as_gpuarray_variable(inp,
+                                           basic_ops.infer_context_name(inp)))
+
+        # If no shape is provided as input, calculate shape assuming even real transform.
+        if s is None:
+            s = inp.shape[1:-1]
+            s = T.set_subtensor(s[-1], (s[-1] - 1) * 2)
+        s = T.as_tensor_variable(s)
+
+        assert inp.dtype == "float32"
+        assert s.ndim == 1
+
+        return theano.Apply(self, [inp, s], [self.output_type(inp)()])
+
+    def make_thunk(self, node, storage_map, _, _2):
+        """
+        Return a non-lazy thunk that runs the inverse transform.
+
+        The cuFFT plan is kept between calls and rebuilt whenever the input
+        shape or the requested output shape ``s`` changes; the output buffer
+        is reallocated only when its shape changes.
+
+        """
+
+        inputs = [storage_map[v] for v in node.inputs]
+        outputs = [storage_map[v] for v in node.outputs]
+
+        # Initialize cuda context to the input's.
+        with node.inputs[0].type.context:
+            skcuda.misc.init()
+
+        # One-element lists so the nested thunk can rebind the cached values.
+        plan_key = [None]
+        plan = [None]
+
+        def thunk():
+            input_shape = inputs[0][0].shape
+            s = inputs[1][0]
+
+            # Since padding is not supported, assert that last dimension corresponds to
+            # input forward transform size.
+            assert (input_shape[1:-2] == s[:-1]).all()
+            assert ((input_shape[-2] - 1) * 2 + s[-1] % 2 == s[-1]).all()
+
+            # construct output shape
+            # chop off the extra length-2 dimension for real/imag
+            output_shape = [input_shape[0]] + list(s)
+            output_shape = tuple(output_shape)
+
+            z = outputs[0]
+
+            # only allocate if there is no previous allocation of the
+            # right size.
+            if z[0] is None or z[0].shape != output_shape:
+                z[0] = pygpu.zeros(output_shape, context=inputs[0][0].context,
+                                   dtype='float32')
+
+            input_pycuda = inputs[0][0]
+            # input_pycuda is a float32 array with an extra dimension,
+            # but will be interpreted by skcuda as a complex64
+            # array instead.
+            output_pycuda = z[0]
+
+            with input_pycuda.context:
+                # only initialise plan if necessary. The plan is built from
+                # s, and an even (2k) and an odd (2k+1) transform share the
+                # same input shape, so the cache key must include s as well
+                # as the input shape; otherwise a stale plan could be reused.
+                key = (input_shape, tuple(int(d) for d in s))
+                if plan[0] is None or plan_key[0] != key:
+                    plan_key[0] = key
+                    plan[0] = fft.Plan(s, np.complex64, np.float32,
+                                       batch=output_shape[0])
+
+                # Sync GPU variables before computation
+                input_pycuda.sync()
+                output_pycuda.sync()
+
+                fft.ifft(input_pycuda, output_pycuda, plan[0])
+                # strangely enough, enabling rescaling here makes it run
+                # very, very slowly, so do this rescaling manually
+                # afterwards!
+
+                # Sync results to ensure output contains completed computation
+                pycuda.driver.Context.synchronize()
+
+        thunk.inputs = inputs
+        thunk.outputs = outputs
+        thunk.lazy = False
+
+        return thunk
+
+    def grad(self, inputs, output_grads):
+        """
+        Return the gradient with respect to the data input.
+
+        The gradient is the forward real FFT (``curfft_op``) of the output
+        gradient with part of its second to last axis doubled; the shape
+        input gets a disconnected gradient.
+
+        """
+        gout, = output_grads
+        s = inputs[1]
+        gf = curfft_op(gout, s)
+        # Multiply the last dimension of the gradient by 2, they represent
+        # both positive and negative frequencies, except the first
+        # and last elements (for even transforms) which are unique.
+        idx = [slice(None)] * (gf.ndim - 2) \
+            + [slice(1, (s[-1] // 2) + (s[-1] % 2))] + [slice(None)]
+        gf = T.set_subtensor(gf[idx], gf[idx] * 2)
+        return [gf, DisconnectedType()()]
+
+    def connection_pattern(self, node):
+        # Specify that shape input parameter has no connection to graph and gradients.
+        return [[True], [False]]
+
+cuirfft_op = CuIRFFTOp()
+
+
+def curfft(inp, norm=None):
+    r"""
+    Compute the FFT of a real-valued input on the GPU.
+
+    `inp` is a real-valued float32 variable of dimensions (m, ..., n); the
+    first axis indexes m batches and an FFT of size (..., n) is taken over
+    the remaining axes.
+
+    The result is a GpuArray of dimensions (m, ..., n//2+1, 2). Its second
+    to last axis keeps only the n//2+1 non-trivial coefficients of the
+    real-input transform, and its last axis holds the real and imaginary
+    parts as a pair of float32 values.
+
+    Parameters
+    ----------
+    inp
+        Array of real-valued float32 of size (m, ..., n), containing m inputs of
+        size (..., n).
+    norm : {None, 'ortho', 'no_norm'}
+        Normalization of transform. Following numpy, default *None* normalizes
+        only the inverse transform by n, 'ortho' yields the unitary transform
+        (:math:`1/\sqrt n` forward and inverse). In addition, 'no_norm' leaves
+        the transform unnormalized.
+
+    """
+
+    transform_shape = inp.shape[1:]
+    # Only the unitary transform rescales the forward FFT.
+    if _unitary(norm) == "ortho":
+        divisor = T.sqrt(transform_shape.prod().astype('float32'))
+    else:
+        divisor = 1
+
+    return curfft_op(inp, transform_shape) / divisor
+
+
+def cuirfft(inp, norm=None, is_odd=False):
+    r"""
+    Performs the inverse fast Fourier Transform with real-valued output on the GPU.
+
+    The input is a variable of dimensions (m, ..., n//2+1, 2) with
+    type float32 representing the non-trivial elements of m
+    real-valued Fourier transforms of initial size (..., n). The real and
+    imaginary parts are stored as a pair of float32 arrays.
+
+    The output is a real-valued float32 variable of dimensions (m, ..., n)
+    giving the m inverse FFTs.
+
+    Parameters
+    ----------
+    inp
+        Array of float32 of size (m, ..., n//2+1, 2), containing m inputs
+        with n//2+1 non-trivial elements on the last dimension and real
+        and imaginary parts stored as separate arrays.
+    norm : {None, 'ortho', 'no_norm'}
+        Normalization of transform. Following numpy, default *None* normalizes
+        only the inverse transform by n, 'ortho' yields the unitary transform
+        (:math:`1/\sqrt n` forward and inverse). In addition, 'no_norm' leaves
+        the transform unnormalized.
+    is_odd : {True, False}
+        Set to True to get a real inverse transform output with an odd last dimension
+        of length (N-1)*2 + 1 for an input last dimension of length N.
+
+    Raises
+    ------
+    ValueError
+        If `is_odd` is not True or False, or if `norm` is not one of the
+        accepted values.
+
+    """
+
+    if is_odd not in (True, False):
+        raise ValueError("Invalid value %s for is_odd, must be True or False" % is_odd)
+
+    # Shape of the real output transform: the input stores only N
+    # coefficients on its last transformed axis, so recover the full length.
+    s = inp.shape[1:-1]
+    if is_odd:
+        s = T.set_subtensor(s[-1], (s[-1] - 1) * 2 + 1)
+    else:
+        s = T.set_subtensor(s[-1], (s[-1] - 1) * 2)
+
+    # The op itself does not rescale, so apply the requested normalization.
+    cond_norm = _unitary(norm)
+    scaling = 1
+    if cond_norm is None:
+        scaling = s.prod().astype('float32')
+    elif cond_norm == "ortho":
+        scaling = T.sqrt(s.prod().astype('float32'))
+
+    return cuirfft_op(inp, s) / scaling
+
+
+def _unitary(norm):
+    """
+    Validate the `norm` argument and return it unchanged.
+
+    Raises ValueError unless `norm` is None, 'ortho' or 'no_norm'.
+
+    """
+    if norm not in (None, "ortho", "no_norm"):
+        # The message must spell the accepted value exactly ('no_norm').
+        raise ValueError("Invalid value %s for norm, must be None, 'ortho' or "
+                         "'no_norm'" % norm)
+    return norm
+
+# Register the graph optimizations only when scikit-cuda imported
+# successfully. Each one is declared for a CPU op from theano.tensor.fft
+# (RFFTOp / IRFFTOp) and returns the matching GPU op instance.
+# NOTE(review): the exact lifting semantics are defined by op_lifter and
+# register_opt in .opt, which are not visible here.
+if scikits_cuda_available:
+    @register_opt('fast_compile')
+    @op_lifter([theano.tensor.fft.RFFTOp])
+    def local_curfft_op(node, context_name):
+        return curfft_op
+
+    @register_opt('fast_compile')
+    @op_lifter([theano.tensor.fft.IRFFTOp])
+    def local_cuirfft_op(node, context_name):
+        return cuirfft_op
--- a/theano/gpuarray/tests/test_fft.py
+++ b/theano/gpuarray/tests/test_fft.py
+from __future__ import absolute_import, print_function, division
+import unittest
+import numpy as np
+
+import theano
+import theano.tensor as T
+from theano.tests import unittest_tools as utt
+
+import theano.gpuarray.fft
+import numpy.fft
+
+from .config import mode_with_gpu
+
+# Skip tests if pygpu is not available.
+from nose.plugins.skip import SkipTest
+from theano.gpuarray.fft import pygpu_available, scikits_cuda_available, pycuda_available
+if not pygpu_available:  # noqa
+    raise SkipTest('Optional package pygpu not available')
+if not scikits_cuda_available:  # noqa
+    raise SkipTest('Optional package scikits.cuda not available')
+if not pycuda_available:  # noqa
+    raise SkipTest('Optional package pycuda not available')
+
+# Transform sizes
+N = 32
+
+
+# Tests for the GPU real FFT ops. Results are checked against numpy.fft and
+# gradients are checked numerically with utt.verify_grad.
+class TestFFT(unittest.TestCase):
+
+    def test_1Dfft(self):
+        # 1-d forward transform, round trip through the inverse, and gradients.
+        inputs_val = np.random.random((1, N)).astype('float32')
+
+        x = T.matrix('x', dtype='float32')
+        rfft = theano.gpuarray.fft.curfft(x)
+        f_rfft = theano.function([x], rfft, mode=mode_with_gpu)
+        res_rfft = f_rfft(inputs_val)
+        res_rfft_comp = (np.asarray(res_rfft[:, :, 0]) +
+                         1j * np.asarray(res_rfft[:, :, 1]))
+
+        rfft_ref = numpy.fft.rfft(inputs_val, axis=1)
+
+        utt.assert_allclose(rfft_ref, res_rfft_comp)
+
+        m = rfft.type()
+        irfft = theano.gpuarray.fft.cuirfft(m)
+        f_irfft = theano.function([m], irfft, mode=mode_with_gpu)
+        res_irfft = f_irfft(res_rfft)
+
+        utt.assert_allclose(inputs_val, np.asarray(res_irfft))
+
+        # The numerical gradient of the FFT is sensitive, must set large
+        # enough epsilon to get good accuracy.
+        eps = 1e-1
+
+        def f_rfft(inp):
+            return theano.gpuarray.fft.curfft(inp)
+        inputs_val = np.random.random((1, N)).astype('float32')
+        utt.verify_grad(f_rfft, [inputs_val], eps=eps)
+
+        def f_irfft(inp):
+            return theano.gpuarray.fft.cuirfft(inp)
+        inputs_val = np.random.random((1, N // 2 + 1, 2)).astype('float32')
+        utt.verify_grad(f_irfft, [inputs_val], eps=eps)
+
+    def test_rfft(self):
+        # 2-d forward transform against numpy.fft.rfftn.
+        inputs_val = np.random.random((1, N, N)).astype('float32')
+        inputs = theano.shared(inputs_val)
+
+        rfft = theano.gpuarray.fft.curfft(inputs)
+        f_rfft = theano.function([], rfft, mode=mode_with_gpu)
+        res_rfft = f_rfft()
+        res_rfft_comp = (np.asarray(res_rfft[:, :, :, 0]) +
+                         1j * np.asarray(res_rfft[:, :, :, 1]))
+
+        rfft_ref = numpy.fft.rfftn(inputs_val, axes=(1, 2))
+
+        utt.assert_allclose(rfft_ref, res_rfft_comp, atol=1e-4, rtol=1e-4)
+
+    def test_irfft(self):
+        # Round trip through forward and inverse transforms, then the inverse
+        # transform alone against numpy.fft.irfftn.
+        inputs_val = np.random.random((1, N, N)).astype('float32')
+        inputs = theano.shared(inputs_val)
+
+        fft = theano.gpuarray.fft.curfft(inputs)
+        f_fft = theano.function([], fft, mode=mode_with_gpu)
+        res_fft = f_fft()
+
+        m = fft.type()
+        ifft = theano.gpuarray.fft.cuirfft(m)
+        f_ifft = theano.function([m], ifft, mode=mode_with_gpu)
+        res_ifft = f_ifft(res_fft)
+
+        utt.assert_allclose(inputs_val, np.asarray(res_ifft))
+
+        inputs_val = numpy.random.random((1, N, N, 2)).astype('float32')
+        inputs = theano.shared(inputs_val)
+
+        irfft = theano.gpuarray.fft.cuirfft(inputs)
+        # Compile with the GPU mode like every other function in this class.
+        f_irfft = theano.function([], irfft, mode=mode_with_gpu)
+        res_irfft = f_irfft()
+        inputs_ref = inputs_val[..., 0] + inputs_val[..., 1] * 1j
+
+        irfft_ref = np.fft.irfftn(inputs_ref, axes=(1, 2))
+
+        utt.assert_allclose(irfft_ref, res_irfft, atol=1e-4, rtol=1e-4)
+
+    def test_type(self):
+        # Only float32 inputs are accepted; float64 must be rejected.
+        inputs_val = np.random.random((1, N)).astype('float64')
+        inputs = theano.shared(inputs_val)
+
+        with self.assertRaises(AssertionError):
+            theano.gpuarray.fft.curfft(inputs)
+        with self.assertRaises(AssertionError):
+            theano.gpuarray.fft.cuirfft(inputs)
+
+    def test_norm(self):
+        # 'ortho' and 'no_norm' normalizations, forward and inverse.
+        inputs_val = np.random.random((1, N, N)).astype('float32')
+        inputs = theano.shared(inputs_val)
+
+        # Unitary normalization
+        rfft = theano.gpuarray.fft.curfft(inputs, norm='ortho')
+        f_rfft = theano.function([], rfft, mode=mode_with_gpu)
+        res_rfft = f_rfft()
+        res_rfft_comp = (np.asarray(res_rfft[:, :, :, 0]) +
+                         1j * np.asarray(res_rfft[:, :, :, 1]))
+
+        rfft_ref = numpy.fft.rfftn(inputs_val, axes=(1, 2))
+
+        utt.assert_allclose(rfft_ref / N, res_rfft_comp, atol=1e-4, rtol=1e-4)
+
+        # No normalization
+        rfft = theano.gpuarray.fft.curfft(inputs, norm='no_norm')
+        f_rfft = theano.function([], rfft, mode=mode_with_gpu)
+        res_rfft = f_rfft()
+        res_rfft_comp = (np.asarray(res_rfft[:, :, :, 0]) +
+                         1j * np.asarray(res_rfft[:, :, :, 1]))
+
+        utt.assert_allclose(rfft_ref, res_rfft_comp, atol=1e-4, rtol=1e-4)
+
+        # Inverse FFT inputs
+        inputs_val = np.random.random((1, N, N // 2 + 1, 2)).astype('float32')
+        inputs = theano.shared(inputs_val)
+        inputs_ref = inputs_val[:, :, :, 0] + 1j * inputs_val[:, :, :, 1]
+
+        # Unitary normalization inverse FFT
+        irfft = theano.gpuarray.fft.cuirfft(inputs, norm='ortho')
+        f_irfft = theano.function([], irfft, mode=mode_with_gpu)
+        res_irfft = f_irfft()
+
+        irfft_ref = numpy.fft.irfftn(inputs_ref, axes=(1, 2))
+
+        utt.assert_allclose(irfft_ref * N, res_irfft, atol=1e-4, rtol=1e-4)
+
+        # No normalization inverse FFT
+        irfft = theano.gpuarray.fft.cuirfft(inputs, norm='no_norm')
+        f_irfft = theano.function([], irfft, mode=mode_with_gpu)
+        res_irfft = f_irfft()
+
+        utt.assert_allclose(irfft_ref * N**2, res_irfft, atol=1e-4, rtol=1e-4)
+
+    def test_grad(self):
+        # The numerical gradient of the FFT is sensitive, must set large
+        # enough epsilon to get good accuracy.
+        eps = 1e-1
+
+        def f_rfft(inp):
+            return theano.gpuarray.fft.curfft(inp)
+        inputs_val = np.random.random((1, N, N)).astype('float32')
+        utt.verify_grad(f_rfft, [inputs_val], eps=eps)
+
+        def f_irfft(inp):
+            return theano.gpuarray.fft.cuirfft(inp)
+        inputs_val = np.random.random((1, N, N // 2 + 1, 2)).astype('float32')
+        utt.verify_grad(f_irfft, [inputs_val], eps=eps)
+
+        def f_rfft(inp):
+            return theano.gpuarray.fft.curfft(inp, norm='ortho')
+        inputs_val = np.random.random((1, N, N)).astype('float32')
+        utt.verify_grad(f_rfft, [inputs_val], eps=eps)
+
+        def f_irfft(inp):
+            return theano.gpuarray.fft.cuirfft(inp, norm='no_norm')
+        inputs_val = np.random.random((1, N, N // 2 + 1, 2)).astype('float32')
+        utt.verify_grad(f_irfft, [inputs_val], eps=eps)
+
+    def test_odd(self):
+        # Same checks as above for an odd transform size.
+        M = N - 1
+
+        inputs_val = np.random.random((1, M, M)).astype('float32')
+        inputs = theano.shared(inputs_val)
+
+        rfft = theano.gpuarray.fft.curfft(inputs)
+        f_rfft = theano.function([], rfft, mode=mode_with_gpu)
+        res_rfft = f_rfft()
+
+        res_rfft_comp = (np.asarray(res_rfft[:, :, :, 0]) +
+                         1j * np.asarray(res_rfft[:, :, :, 1]))
+
+        rfft_ref = numpy.fft.rfftn(inputs_val, s=(M, M), axes=(1, 2))
+
+        utt.assert_allclose(rfft_ref, res_rfft_comp, atol=1e-4, rtol=1e-4)
+
+        m = rfft.type()
+        ifft = theano.gpuarray.fft.cuirfft(m, is_odd=True)
+        f_ifft = theano.function([m], ifft, mode=mode_with_gpu)
+        res_ifft = f_ifft(res_rfft)
+
+        utt.assert_allclose(inputs_val, np.asarray(res_ifft))
+
+        inputs_val = np.random.random((1, M, M // 2 + 1, 2)).astype('float32')
+        inputs = theano.shared(inputs_val)
+
+        irfft = theano.gpuarray.fft.cuirfft(inputs, norm='ortho', is_odd=True)
+        f_irfft = theano.function([], irfft, mode=mode_with_gpu)
+        res_irfft = f_irfft()
+
+        inputs_ref = inputs_val[:, :, :, 0] + 1j * inputs_val[:, :, :, 1]
+        irfft_ref = numpy.fft.irfftn(
+            inputs_ref, s=(M, M), axes=(1, 2), norm='ortho')
+
+        utt.assert_allclose(irfft_ref, res_irfft, atol=1e-4, rtol=1e-4)
+
+        # The numerical gradient of the FFT is sensitive, must set large
+        # enough epsilon to get good accuracy.
+        eps = 1e-1
+
+        def f_rfft(inp):
+            return theano.gpuarray.fft.curfft(inp)
+        inputs_val = np.random.random((1, M, M)).astype('float32')
+        utt.verify_grad(f_rfft, [inputs_val], eps=eps)
+
+        def f_irfft(inp):
+            return theano.gpuarray.fft.cuirfft(inp, is_odd=True)
+        inputs_val = np.random.random((1, M, M // 2 + 1, 2)).astype('float32')
+        utt.verify_grad(f_irfft, [inputs_val], eps=eps)
+
+        def f_rfft(inp):
+            return theano.gpuarray.fft.curfft(inp, norm='ortho')
+        inputs_val = np.random.random((1, M, M)).astype('float32')
+        utt.verify_grad(f_rfft, [inputs_val], eps=eps)
+
+        def f_irfft(inp):
+            return theano.gpuarray.fft.cuirfft(inp, norm='no_norm', is_odd=True)
+        inputs_val = np.random.random((1, M, M // 2 + 1, 2)).astype('float32')
+        utt.verify_grad(f_irfft, [inputs_val], eps=eps)
+
+    def test_params(self):
+        # Invalid norm / is_odd arguments must raise ValueError.
+        inputs_val = numpy.random.random((1, N)).astype('float32')
+        inputs = theano.shared(inputs_val)
+
+        self.assertRaises(ValueError, theano.gpuarray.fft.curfft, inputs, norm=123)
+
+        inputs_val = numpy.random.random((1, N // 2 + 1, 2)).astype('float32')
+        inputs = theano.shared(inputs_val)
+
+        self.assertRaises(ValueError, theano.gpuarray.fft.cuirfft, inputs, norm=123)
+        self.assertRaises(ValueError, theano.gpuarray.fft.cuirfft, inputs, is_odd=123)
--- a/theano/sandbox/fourier.py
+++ b/theano/sandbox/fourier.py
@@ -12,6 +12,14 @@ from six.moves import xrange
 from theano import tensor
 from theano.gof import Op, Apply, generic

+# Warn at import time: this module will soon be deprecated (see message).
+import warnings
+
+message = ("The module theano.sandbox.fourier will soon be deprecated."
+           " Please use theano.tensor.fft, which supports gradients and "
+           "automatic optimization transfers to the GPU ops.")
+warnings.warn(message)
+

 class GradTodo(Op):
    # TODO : need description for class

--- a/theano/tensor/fft.py
+++ b/theano/tensor/fft.py
+from __future__ import absolute_import, print_function, division
+import numpy as np
+from theano import gof
+import theano.tensor as T
+from theano.gradient import DisconnectedType
+
+
+class RFFTOp(gof.Op):
+    """
+    Real-input fast Fourier transform computed with numpy.
+
+    The op takes a real-valued tensor whose first dimension indexes batches
+    and an integer vector giving the lengths of the transformed axes. The
+    output has one extra trailing dimension of length two storing the real
+    and imaginary parts of the transform. No normalization is applied.
+
+    """
+
+    __props__ = ()
+
+    def output_type(self, inp):
+        # add extra dim for real/imag
+        return T.TensorType(inp.dtype,
+                            broadcastable=[False] * (inp.type.ndim + 1))
+
+    def make_node(self, a, s=None):
+        """
+        Build the Apply node for input `a` and transform shape `s`.
+
+        When `s` is None it defaults to ``a.shape[1:]``, i.e. every
+        dimension but the first (batch) one is transformed at full length.
+
+        Raises TypeError if `a` has fewer than two dimensions or if `s`
+        does not have an integer dtype.
+
+        """
+        a = T.as_tensor_variable(a)
+        if a.ndim < 2:
+            # The check rejects ndim < 2, so the requirement is ndim >= 2.
+            raise TypeError('%s: input must have dimension >= 2, with first '
+                            'dimension batches' % self.__class__.__name__)
+
+        if s is None:
+            s = a.shape[1:]
+            s = T.as_tensor_variable(s)
+        else:
+            s = T.as_tensor_variable(s)
+            if (not s.dtype.startswith('int')) and \
+               (not s.dtype.startswith('uint')):
+                raise TypeError('%s: length of the transformed axis must be'
+                                ' of type integer' % self.__class__.__name__)
+        return gof.Apply(self, [a, s], [self.output_type(a)()])
+
+    def perform(self, node, inputs, output_storage):
+        a = inputs[0]
+        s = inputs[1]
+
+        A = np.fft.rfftn(a, s=tuple(s))
+        # Format output with one extra trailing dimension of length two
+        # holding the real and imaginary parts.
+        out = np.zeros(A.shape + (2,), dtype=a.dtype)
+        out[..., 0], out[..., 1] = np.real(A), np.imag(A)
+        output_storage[0][0] = out
+
+    def grad(self, inputs, output_grads):
+        # The gradient w.r.t. the data input is the unnormalized inverse
+        # real FFT of the rescaled output gradient; the shape input gets a
+        # disconnected gradient.
+        gout, = output_grads
+        s = inputs[1]
+        # Divide the last dimension of the output gradients by 2, they are
+        # double-counted by the real-IFFT due to symmetry, except the first
+        # and last elements (for even transforms) which are unique.
+        idx = [slice(None)] * (gout.ndim - 2) \
+            + [slice(1, (s[-1] // 2) + (s[-1] % 2))] + [slice(None)]
+        gout = T.set_subtensor(gout[idx], gout[idx] * 0.5)
+        return [irfft_op(gout, s), DisconnectedType()()]
+
+    def connection_pattern(self, node):
+        # Specify that shape input parameter has no connection to graph and gradients.
+        return [[True], [False]]
+
+rfft_op = RFFTOp()
+
+
+class IRFFTOp(gof.Op):
+    """
+    Inverse real fast Fourier transform computed with numpy, unnormalized.
+
+    The op takes a tensor whose first dimension indexes batches and whose
+    last dimension stores the real and imaginary parts, together with an
+    integer vector giving the output lengths along the transformed axes.
+    The output drops the trailing real/imag dimension.
+
+    """
+
+    __props__ = ()
+
+    def output_type(self, inp):
+        # The trailing real/imag dimension disappears from the output.
+        out_ndim = inp.type.ndim - 1
+        return T.TensorType(inp.dtype, broadcastable=[False] * out_ndim)
+
+    def make_node(self, a, s=None):
+        a = T.as_tensor_variable(a)
+        if a.ndim < 3:
+            prefix = ('%s: input must have dimension >= 3,  with ' %
+                      self.__class__.__name__)
+            raise TypeError(
+                prefix + 'first dimension batches and last real/imag parts')
+
+        if s is not None:
+            s = T.as_tensor_variable(s)
+            has_int_dtype = (s.dtype.startswith('int') or
+                             s.dtype.startswith('uint'))
+            if not has_int_dtype:
+                raise TypeError('%s: length of the transformed axis must be'
+                                ' of type integer' % self.__class__.__name__)
+        else:
+            # Default shape: take the input shape without the batch and
+            # real/imag dimensions, and map a last dimension of length L to
+            # an (even) output length of (L - 1) * 2.
+            s = a.shape[1:-1]
+            s = T.set_subtensor(s[-1], (s[-1] - 1) * 2)
+            s = T.as_tensor_variable(s)
+        return gof.Apply(self, [a, s], [self.output_type(a)()])
+
+    def perform(self, node, inputs, output_storage):
+        freq, shape = inputs
+
+        # Rebuild the complex spectrum from the (real, imag) pairs.
+        spectrum = freq[..., 0] + 1j * freq[..., 1]
+        result = np.fft.irfftn(spectrum, s=tuple(shape))
+        # Undo numpy's default normalization, then cast back to the input
+        # dtype (numpy outputs float64 by default).
+        result = result * shape.prod()
+        output_storage[0][0] = result.astype(freq.dtype)
+
+    def grad(self, inputs, output_grads):
+        shape = inputs[1]
+        gout, = output_grads
+        freq_grad = rfft_op(gout, shape)
+        # Entries 1 .. of the second to last dimension stand for both a
+        # positive and a negative frequency, so their gradient is doubled;
+        # the first and (for even transforms) last entries are unique.
+        last_len = shape[-1]
+        doubled = [slice(None)] * (freq_grad.ndim - 2)
+        doubled.append(slice(1, (last_len // 2) + (last_len % 2)))
+        doubled.append(slice(None))
+        freq_grad = T.set_subtensor(freq_grad[doubled],
+                                    freq_grad[doubled] * 2)
+        return [freq_grad, DisconnectedType()()]
+
+    def connection_pattern(self, node):
+        # Only the data input is connected to the output; the shape input
+        # has no connection to graph and gradients.
+        return [[True], [False]]
+
+irfft_op = IRFFTOp()
+
+
+def rfft(inp, norm=None):
+    r"""
+    Performs the fast Fourier transform of a real-valued input.
+
+    The input must be a real-valued variable of dimensions (m, ..., n).
+    It performs FFTs of size (..., n) on m batches.
+
+    The output is a tensor of dimensions (m, ..., n//2+1, 2). The second to
+    last dimension of the output contains the n//2+1 non-trivial elements of
+    the real-valued FFTs. The real and imaginary parts are stored as a pair of
+    float arrays.
+
+    Parameters
+    ----------
+    inp
+        Array of floats of size (m, ..., n), containing m inputs of
+        size (..., n).
+    norm : {None, 'ortho', 'no_norm'}
+        Normalization of transform. Following numpy, default *None* normalizes
+        only the inverse transform by n, 'ortho' yields the unitary transform
+        (:math:`1/\sqrt n` forward and inverse). In addition, 'no_norm' leaves
+        the transform unnormalized.
+
+    Raises
+    ------
+    ValueError
+        If `norm` is not one of the accepted values.
+
+    """
+
+    # Transform every dimension except the first (batch) one.
+    s = inp.shape[1:]
+    cond_norm = _unitary(norm)
+    scaling = 1
+    # Only 'ortho' rescales the forward transform; None and 'no_norm' both
+    # leave it unnormalized.
+    if cond_norm == "ortho":
+        scaling = T.sqrt(s.prod().astype(inp.dtype))
+
+    return rfft_op(inp, s) / scaling
+
+
+def irfft(inp, norm=None, is_odd=False):
+    r"""
+    Performs the inverse fast Fourier Transform with real-valued output.
+
+    The input is a variable of dimensions (m, ..., n//2+1, 2)
+    representing the non-trivial elements of m real-valued Fourier transforms
+    of initial size (..., n). The real and imaginary parts are stored as a
+    pair of float arrays.
+
+    The output is a real-valued variable of dimensions (m, ..., n)
+    giving the m inverse FFTs.
+
+    Parameters
+    ----------
+    inp
+        Array of size (m, ..., n//2+1, 2), containing m inputs
+        with n//2+1 non-trivial elements on the last dimension and real
+        and imaginary parts stored as separate real arrays.
+    norm : {None, 'ortho', 'no_norm'}
+        Normalization of transform. Following numpy, default *None* normalizes
+        only the inverse transform by n, 'ortho' yields the unitary transform
+        (:math:`1/\sqrt n` forward and inverse). In addition, 'no_norm' leaves
+        the transform unnormalized.
+    is_odd : {True, False}
+        Set to True to get a real inverse transform output with an odd last dimension
+        of length (N-1)*2 + 1 for an input last dimension of length N.
+
+    Raises
+    ------
+    ValueError
+        If `is_odd` is not a boolean or `norm` is not one of the accepted
+        values.
+
+    """
+
+    if is_odd not in (True, False):
+        # Wrap in a tuple so that tuple-valued arguments format correctly.
+        raise ValueError("Invalid value %s for is_odd, must be True or False"
+                         % (is_odd,))
+
+    # Output shape: drop the batch and real/imag dimensions, then recover
+    # the length of the last transformed axis from its n//2+1 elements.
+    s = inp.shape[1:-1]
+    if is_odd:
+        s = T.set_subtensor(s[-1], (s[-1] - 1) * 2 + 1)
+    else:
+        s = T.set_subtensor(s[-1], (s[-1] - 1) * 2)
+
+    cond_norm = _unitary(norm)
+    scaling = 1
+    # Numpy's default normalization is 1/N on the inverse transform.
+    if cond_norm is None:
+        scaling = s.prod().astype(inp.dtype)
+    elif cond_norm == "ortho":
+        scaling = T.sqrt(s.prod().astype(inp.dtype))
+
+    return irfft_op(inp, s) / scaling
+
+
+def _unitary(norm):
+    """
+    Validate the `norm` argument and return it unchanged.
+
+    Raises ValueError unless `norm` is None, 'ortho' or 'no_norm'.
+
+    """
+    if norm not in (None, "ortho", "no_norm"):
+        # Wrap in a tuple so that tuple-valued arguments format correctly
+        # instead of raising a TypeError from the % operator.
+        raise ValueError("Invalid value %s for norm, must be None, 'ortho' or "
+                         "'no_norm'" % (norm,))
+    return norm
--- a/theano/tensor/fourier.py
+++ b/theano/tensor/fourier.py
@@ -6,6 +6,10 @@ from theano import gof, tensor

 class Fourier(gof.Op):
    """
+    WARNING: for officially supported FFTs, use theano.tensor.fft, which
+    provides real-input FFTs. Gradients are supported, as well as optimization
+    transfers to GPU ops.
+
    An instance of this class returns a finite fourier transform calcutated
    along one dimension of an input array.


--- a/theano/tensor/tests/test_fft.py
+++ b/theano/tensor/tests/test_fft.py
+from __future__ import absolute_import, print_function, division
+import numpy
+import unittest
+
+import theano
+from theano import tensor as T
+from theano.tests import unittest_tools as utt
+from theano.tensor import fft
+
+N = 16
+
+
+class TestFFT(unittest.TestCase):
+
+    def test_rfft_float(self):
+        # Test that numpy's default float64 output is cast to theano input
+        # type, by checking the gradients of both transforms on floatX data.
+        floatX = theano.config.floatX
+        step = 1e-1
+
+        real_pt = numpy.random.random((1, N)).astype(floatX)
+        utt.verify_grad(lambda inp: fft.rfft(inp), [real_pt], eps=step)
+
+        freq_pt = numpy.random.random((1, N // 2 + 1, 2)).astype(floatX)
+        utt.verify_grad(lambda inp: fft.irfft(inp), [freq_pt], eps=step)
+
+    def test_1Drfft(self):
+        # Check the 1D forward transform against numpy, the round trip
+        # through the inverse transform, and the gradients of both.
+        inputs_val = numpy.random.random((1, N)).astype(theano.config.floatX)
+
+        x = T.matrix('x')
+        rfft = fft.rfft(x)
+        f_rfft = theano.function([x], rfft)
+        res_rfft = f_rfft(inputs_val)
+        # Rebuild the complex result from the trailing (real, imag) pairs.
+        res_rfft_comp = (numpy.asarray(res_rfft[:, :, 0]) +
+                         1j * numpy.asarray(res_rfft[:, :, 1]))
+
+        rfft_ref = numpy.fft.rfft(inputs_val, axis=1)
+
+        utt.assert_allclose(rfft_ref, res_rfft_comp)
+
+        # Fresh symbolic input with the same type as the forward output.
+        m = rfft.type()
+        irfft = fft.irfft(m)
+        f_irfft = theano.function([m], irfft)
+        res_irfft = f_irfft(res_rfft)
+
+        utt.assert_allclose(inputs_val, numpy.asarray(res_irfft))
+
+        # The numerical gradient of the FFT is sensitive, must set large
+        # enough epsilon to get good accuracy.
+        eps = 1e-1
+
+        def f_rfft(inp):
+            return fft.rfft(inp)
+        inputs_val = numpy.random.random((1, N)).astype(theano.config.floatX)
+        utt.verify_grad(f_rfft, [inputs_val], eps=eps)
+
+        def f_irfft(inp):
+            return fft.irfft(inp)
+        inputs_val = numpy.random.random((1, N // 2 + 1, 2)).astype(theano.config.floatX)
+        utt.verify_grad(f_irfft, [inputs_val], eps=eps)
+
+    def test_rfft(self):
+        # Compare the transform of a (1, N, N) shared variable with
+        # numpy.fft.rfftn taken over the two non-batch axes.
+        data = numpy.random.random((1, N, N)).astype(theano.config.floatX)
+        shared_data = theano.shared(data)
+
+        compute = theano.function([], fft.rfft(shared_data))
+        out = compute()
+        # Rebuild the complex result from the trailing (real, imag) pairs.
+        out_real = numpy.asarray(out[:, :, :, 0])
+        out_imag = numpy.asarray(out[:, :, :, 1])
+        out_complex = out_real + 1j * out_imag
+
+        expected = numpy.fft.rfftn(data, axes=(1, 2))
+
+        utt.assert_allclose(expected, out_complex, atol=1e-4, rtol=1e-4)
+
+    def test_irfft(self):
+        floatX = theano.config.floatX
+
+        # Round trip: the inverse transform of the forward transform must
+        # give back the original data.
+        data = numpy.random.random((1, N, N)).astype(floatX)
+        forward = fft.rfft(theano.shared(data))
+        spectrum = theano.function([], forward)()
+
+        freq_in = forward.type()
+        invert = theano.function([freq_in], fft.irfft(freq_in))
+        recovered = invert(spectrum)
+
+        utt.assert_allclose(data, numpy.asarray(recovered))
+
+        # Direct comparison with numpy on random (real, imag) pairs.
+        freq_val = numpy.random.random((1, N, N, 2)).astype(floatX)
+        freq_var = theano.shared(freq_val)
+
+        result = theano.function([], fft.irfft(freq_var))()
+        freq_complex = freq_val[..., 0] + freq_val[..., 1] * 1j
+
+        expected = numpy.fft.irfftn(freq_complex, axes=(1, 2))
+
+        utt.assert_allclose(expected, result, atol=1e-4, rtol=1e-4)
+
+    def test_norm_rfft(self):
+        # Check the 'ortho' and 'no_norm' normalizations of both transforms
+        # against rescaled numpy results.
+        floatX = theano.config.floatX
+
+        real_val = numpy.random.random((1, N, N)).astype(floatX)
+        real_var = theano.shared(real_val)
+        fwd_ref = numpy.fft.rfftn(real_val, axes=(1, 2))
+
+        # Unitary normalization: numpy's forward result divided by N.
+        res = theano.function([], fft.rfft(real_var, norm='ortho'))()
+        res_comp = (numpy.asarray(res[:, :, :, 0]) +
+                    1j * numpy.asarray(res[:, :, :, 1]))
+        utt.assert_allclose(fwd_ref / N, res_comp, atol=1e-4, rtol=1e-4)
+
+        # No normalization: identical to numpy's forward result.
+        res = theano.function([], fft.rfft(real_var, norm='no_norm'))()
+        res_comp = (numpy.asarray(res[:, :, :, 0]) +
+                    1j * numpy.asarray(res[:, :, :, 1]))
+        utt.assert_allclose(fwd_ref, res_comp, atol=1e-4, rtol=1e-4)
+
+        # Inverse FFT inputs
+        freq_val = numpy.random.random((1, N, N // 2 + 1, 2)).astype(floatX)
+        freq_var = theano.shared(freq_val)
+        freq_complex = freq_val[..., 0] + 1j * freq_val[..., 1]
+        inv_ref = numpy.fft.irfftn(freq_complex, axes=(1, 2))
+
+        # Unitary normalization inverse FFT: numpy's result times N.
+        res_inv = theano.function([], fft.irfft(freq_var, norm='ortho'))()
+        utt.assert_allclose(inv_ref * N, res_inv, atol=1e-4, rtol=1e-4)
+
+        # No normalization inverse FFT: numpy's result times N**2.
+        res_inv = theano.function([], fft.irfft(freq_var, norm='no_norm'))()
+        utt.assert_allclose(inv_ref * N**2, res_inv, atol=1e-4, rtol=1e-4)
+
+    def test_params(self):
+        # Invalid `norm` / `is_odd` arguments must be rejected with a
+        # ValueError by both transforms.
+        floatX = theano.config.floatX
+
+        real_var = theano.shared(
+            numpy.random.random((1, N)).astype(floatX))
+
+        self.assertRaises(ValueError, fft.rfft, real_var, norm=123)
+
+        freq_var = theano.shared(
+            numpy.random.random((1, N // 2 + 1, 2)).astype(floatX))
+
+        for bad_kwargs in ({'norm': 123}, {'is_odd': 123}):
+            self.assertRaises(ValueError, fft.irfft, freq_var, **bad_kwargs)
+
+    def test_grad_rfft(self):
+        # The numerical gradient of the FFT is sensitive, must set large
+        # enough epsilon to get good accuracy.
+        eps = 1e-1
+
+        def f_rfft(inp):
+            return fft.rfft(inp)
+        inputs_val = numpy.random.random((1, N, N)).astype(theano.config.floatX)
+        utt.verify_grad(f_rfft, [inputs_val], eps=eps)
+
+        def f_irfft(inp):
+            return fft.irfft(inp)
+        inputs_val = numpy.random.random((1, N, N // 2 + 1, 2)).astype(theano.config.floatX)
+        utt.verify_grad(f_irfft, [inputs_val], eps=eps)
+
+        def f_rfft(inp):
+            return fft.rfft(inp, norm='ortho')
+        inputs_val = numpy.random.random((1, N, N)).astype(theano.config.floatX)
+        utt.verify_grad(f_rfft, [inputs_val], eps=eps)
+
+        def f_irfft(inp):
+            return fft.irfft(inp, norm='no_norm')
+        inputs_val = numpy.random.random((1, N, N // 2 + 1, 2)).astype(theano.config.floatX)
+        utt.verify_grad(f_irfft, [inputs_val], eps=eps)