final fft ops take shape as input, interface passes input array shape + odd correction

0a889321 · slefrancois · 29271d0a · 0a889321 · 0a889321
--- a/theano/gpuarray/fft.py
+++ b/theano/gpuarray/fft.py
@@ -38,7 +38,14 @@ class CuRFFTOp(Op):
                            broadcastable=[False] * (inp.type.ndim + 1),
                            context_name=inp.type.context_name)

-    def make_node(self, inp):
+    def make_node(self, inp, s=None):
+        # A shape parameter s can be provided as an input. For now this is used to
+        # manage odd transform sizes.
+        # Later this could be extended to handle padding and truncation,
+        # following numpy's interface. However, cuFFT expects arrays that match
+        # the shape given to the plan, so padding will have to be done in the op.
+        # The effect of padding on gradients has yet to be investigated.
+
        if not scikits_cuda_available:
            raise RuntimeError("scikits.cuda is needed for CuFFTOp")

@@ -52,9 +59,16 @@ class CuRFFTOp(Op):
            basic_ops.as_gpuarray_variable(inp,
                                           basic_ops.infer_context_name(inp)))

+        # If no shape is provided as input, default to input data shape.
+        if s is None:
+            s = inp.shape[1:]
+        s = T.as_tensor_variable(s)
+
        assert inp.dtype == "float32"
+        assert s.ndim == 1
+        assert 'int' in s.dtype

-        return theano.Apply(self, [inp], [self.output_type(inp)()])
+        return theano.Apply(self, [inp, s], [self.output_type(inp)()])

    def make_thunk(self, node, storage_map, _, _2):

@@ -70,9 +84,13 @@ class CuRFFTOp(Op):

        def thunk():
            input_shape = inputs[0][0].shape
+            s = inputs[1][0]
+
+            # Since padding is not supported, assert s matches input shape.
+            assert (input_shape[1:] == s).all()

            # construct output shape
-            output_shape = list(input_shape)
+            output_shape = [input_shape[0]] + list(s)
            # DFT of real input is symmetric, no need to store
            # redundant coefficients
            output_shape[-1] = output_shape[-1] // 2 + 1
@@ -99,13 +117,15 @@ class CuRFFTOp(Op):
                # only initialise plan if necessary
                if plan[0] is None or plan_input_shape[0] != input_shape:
                    plan_input_shape[0] = input_shape
-                    plan[0] = fft.Plan(input_shape[1:], np.float32, np.complex64,
+                    plan[0] = fft.Plan(s, np.float32, np.complex64,
                                       batch=input_shape[0])
+
                # Sync GPU variables before computation
                input_pycuda.sync()
                output_pycuda.sync()

                fft.fft(input_pycuda, output_pycuda, plan[0])
+
                # Sync results to ensure output contains completed computation
                pycuda.driver.Context.synchronize()

@@ -117,15 +137,18 @@ class CuRFFTOp(Op):

    def grad(self, inputs, output_grads):
        gout, = output_grads
-        s = inputs[0].shape[1:]
-        is_odd = s[-1] % 2
-        # Divide the last dimension of the output gradients by 2, they are 
+        s = inputs[1]
+        # Divide the last dimension of the output gradients by 2, they are
        # double-counted by the real-IFFT due to symmetry, except the first
        # and last elements (for even transforms) which are unique.
        idx = [slice(None)] * (gout.ndim - 2) \
-                + [slice(1, (s[-1] // 2) + is_odd)] + [slice(None)]
-        gout = T.set_subtensor(gout[idx], gout[idx]*0.5) 
-        return [cuirfft_op(gout, is_odd)]
+            + [slice(1, (s[-1] // 2) + (s[-1] % 2))] + [slice(None)]
+        gout = T.set_subtensor(gout[idx], gout[idx] * 0.5)
+        return [cuirfft_op(gout, s), DisconnectedType()()]
+
+    def connection_pattern(self, node):
+        # Specify that shape input parameter has no connection to graph and gradients.
+        return [[True], [False]]

 curfft_op = CuRFFTOp()

@@ -135,12 +158,19 @@ class CuIRFFTOp(Op):
    __props__ = ()

    def output_type(self, inp):
-        # add one extra dim for real/imag
+        # remove extra dim for real/imag
        return GpuArrayType(inp.dtype,
                            broadcastable=[False] * (inp.type.ndim - 1),
                            context_name=inp.type.context_name)

-    def make_node(self, inp, is_odd):
+    def make_node(self, inp, s=None):
+        # A shape parameter is expected as an input. For now this is used to
+        # manage odd transform sizes.
+        # Later this could be extended to handle padding and truncation,
+        # following numpy's interface. However, cuFFT expects arrays that match
+        # the shape given to the plan, so padding will have to be done in the op.
+        # The effect of padding on gradients has yet to be investigated.
+
        if not scikits_cuda_available:
            raise RuntimeError("scikits.cuda is needed for CuIFFTOp")

@@ -153,12 +183,17 @@ class CuIRFFTOp(Op):
        inp = basic_ops.gpu_contiguous(
            basic_ops.as_gpuarray_variable(inp,
                                           basic_ops.infer_context_name(inp)))
-        is_odd = T.as_tensor_variable(is_odd)
+
+        # If no shape is provided as input, calculate shape assuming even real transform.
+        if s is None:
+            s = inp.shape[1:-1]
+            s = T.set_subtensor(s[-1], (s[-1] - 1) * 2)
+        s = T.as_tensor_variable(s)

        assert inp.dtype == "float32"
-        assert 'int' in is_odd.dtype
+        assert s.ndim == 1

-        return theano.Apply(self, [inp, is_odd], [self.output_type(inp)()])
+        return theano.Apply(self, [inp, s], [self.output_type(inp)()])

    def make_thunk(self, node, storage_map, _, _2):

@@ -174,16 +209,18 @@ class CuIRFFTOp(Op):

        def thunk():
            input_shape = inputs[0][0].shape
-            is_odd = inputs[1][0]
-            assert is_odd in (0, 1)
-            
+            s = inputs[1][0]
+
+            # Since padding is not supported, assert that last dimension corresponds to
+            # input forward transform size.
+            assert (input_shape[1:-2] == s[:-1]).all()
+            assert ((input_shape[-2] - 1) * 2 + s[-1] % 2 == s[-1]).all()
+
            # construct output shape
            # chop off the extra length-2 dimension for real/imag
-            output_shape = list(input_shape[:-1])
-            # restore full signal length
-            output_shape[-1] = (output_shape[-1] - 1) * 2 + is_odd
+            output_shape = [input_shape[0]] + list(s)
            output_shape = tuple(output_shape)
-            
+
            z = outputs[0]

            # only allocate if there is no previous allocation of the
@@ -202,8 +239,9 @@ class CuIRFFTOp(Op):
                # only initialise plan if necessary
                if plan[0] is None or plan_input_shape[0] != input_shape:
                    plan_input_shape[0] = input_shape
-                    plan[0] = fft.Plan(output_shape[1:], np.complex64, np.float32,
+                    plan[0] = fft.Plan(s, np.complex64, np.float32,
                                       batch=output_shape[0])
+
                # Sync GPU variables before computation
                input_pycuda.sync()
                output_pycuda.sync()
@@ -221,33 +259,35 @@ class CuIRFFTOp(Op):
        thunk.lazy = False

        return thunk
-        
+
    def grad(self, inputs, output_grads):
        gout, = output_grads
-        s = gout.shape
-        gf = curfft_op(gout)
+        s = inputs[1]
+        gf = curfft_op(gout, s)
        # Multiply the last dimension of the gradient by 2, they represent
        # both positive and negative frequencies, except the first
        # and last elements (for even transforms) which are unique.
        idx = [slice(None)] * (gf.ndim - 2) \
-                + [slice(1, (s[-1] // 2) + (s[-1] % 2))] + [slice(None)]
-        gf = T.set_subtensor(gf[idx], gf[idx]*2)
+            + [slice(1, (s[-1] // 2) + (s[-1] % 2))] + [slice(None)]
+        gf = T.set_subtensor(gf[idx], gf[idx] * 2)
        return [gf, DisconnectedType()()]

    def connection_pattern(self, node):
-        return [[True],[False]]
+        # Specify that shape input parameter has no connection to graph and gradients.
+        return [[True], [False]]

 cuirfft_op = CuIRFFTOp()

+
 def curfft(inp, norm=None):
    """
-    Performs the fast Fourier transform of a real-valued output on the GPU 
+    Performs the fast Fourier transform of a real-valued input on the GPU
    through the gpuarray backend.

    The input must be a real-valued float32 variable of dimensions (m, ..., n).
    It performs FFTs of size (..., n) on m batches.

-    The output is a GpuArray of dimensions (m, ..., n//2+1, 2). The second to 
+    The output is a GpuArray of dimensions (m, ..., n//2+1, 2). The second to
    last dimension of the output contains the n//2+1 non-trivial elements of
    the real-valued FFTs. The real and imaginary parts are stored as two
    float32 arrays, emulating complex64. Since theano does not support complex
@@ -269,14 +309,14 @@ def curfft(inp, norm=None):

    s = inp.shape[1:]
    cond_norm = _unitary(norm)
-    if cond_norm is None or cond_norm == "no_norm":
-        scaling = 1
-    elif cond_norm == "ortho":
+    scaling = 1
+    if cond_norm == "ortho":
        scaling = T.sqrt(s.prod().astype('float32'))
-    
-    return curfft_op(inp) / scaling                                  

-def cuirfft(inp, norm=None, is_odd=0):
+    return curfft_op(inp, s) / scaling
+
+
+def cuirfft(inp, norm=None, is_odd=False):
    """
    Performs the real-valued output inverse Fourier Transform using the
    gpuarray backend.
@@ -288,37 +328,42 @@ def cuirfft(inp, norm=None, is_odd=0):
    given that Theano does not support complex numbers.

    The output is a real-valued float32 variable of dimensions (m, ..., n)
-    giving the m inverse FFTs. 
+    giving the m inverse FFTs.

    Parameters
    ----------
    inp
-        Array of float32 of size (m, ..., n//2+1, 2), containing m inputs 
-        with n/2+1 non-trivial elements on the last dimension and real
+        Array of float32 of size (m, ..., n//2+1, 2), containing m inputs
+        with n//2+1 non-trivial elements on the last dimension and real
        and imaginary parts stored as separate arrays.
    norm : {None, 'ortho', 'no_norm'}
        Normalization of transform. Following numpy, default *None* normalizes
        only the inverse transform by n, 'ortho' yields the unitary transform
        (:math:`1/\sqrt n` forward and inverse). In addition, 'no_norm' leaves
        the transform unnormalized.
-        
+    is_odd : {True, False}
+        Set to True to get a real inverse transform output with an odd last dimension
+        of length (N-1)*2 + 1 for an input last dimension of length N.
    """

-    if is_odd != 0:
-        is_odd = 1
-    
+    if is_odd not in (True, False):
+        raise ValueError("Invalid value %s for id_odd, must be True or False" % is_odd)
+
    s = inp.shape[1:-1]
-    s = T.set_subtensor(s[-1], (s[-1] - 1) * 2 + is_odd)
+    if is_odd:
+        s = T.set_subtensor(s[-1], (s[-1] - 1) * 2 + 1)
+    else:
+        s = T.set_subtensor(s[-1], (s[-1] - 1) * 2)

    cond_norm = _unitary(norm)
+    scaling = 1
    if cond_norm is None:
        scaling = s.prod().astype('float32')
-    if cond_norm == "ortho":
+    elif cond_norm == "ortho":
        scaling = T.sqrt(s.prod().astype('float32'))
-    if cond_norm == "no_norm":
-        scaling = 1

-    return cuirfft_op(inp, is_odd) / scaling
+    return cuirfft_op(inp, s) / scaling
+

 def _unitary(norm):
    if norm not in (None, "ortho", "no_norm"):

--- a/theano/gpuarray/tests/test_fft.py
+++ b/theano/gpuarray/tests/test_fft.py
@@ -25,14 +25,13 @@ if not pycuda_available:  # noqa
 import theano.gpuarray.cuda_fft

 # Transform sizes
-N = 16
+N = 64


 class TestFFT(unittest.TestCase):

    def test_1Dfft(self):
        inputs_val = np.random.random((1, N)).astype('float32')
-        inputs = theano.shared(inputs_val)

        x = T.matrix('x', dtype='float32')
        rfft = theano.gpuarray.fft.curfft(x)
@@ -40,200 +39,215 @@ class TestFFT(unittest.TestCase):
        res_rfft = f_rfft(inputs_val)
        res_rfft_comp = (np.asarray(res_rfft[:, :, 0]) +
                         1j * np.asarray(res_rfft[:, :, 1]))
-    
+
        rfft_ref = numpy.fft.rfft(inputs_val, axis=1)
-    
-        utt.assert_allclose(rfft_ref, res_rfft_comp)        
-        
+
+        utt.assert_allclose(rfft_ref, res_rfft_comp)
+
        m = rfft.type()
        irfft = theano.gpuarray.fft.cuirfft(m)
        f_irfft = theano.function([m], irfft, mode=mode_with_gpu)
        res_irfft = f_irfft(res_rfft)
-        
+
        utt.assert_allclose(inputs_val, np.asarray(res_irfft))
-        
+
        # The numerical gradient of the FFT is sensitive, must set large
        # enough epsilon to get good accuracy.
        eps = 1e-1
-        
+
        def f_rfft(inp):
            return theano.gpuarray.fft.curfft(inp)
        inputs_val = np.random.random((1, N)).astype('float32')
        utt.verify_grad(f_rfft, [inputs_val], eps=eps)
-        
+
        def f_irfft(inp):
            return theano.gpuarray.fft.cuirfft(inp)
-        inputs_val = np.random.random((1, N//2+1, 2)).astype('float32')
+        inputs_val = np.random.random((1, N // 2 + 1, 2)).astype('float32')
        utt.verify_grad(f_irfft, [inputs_val], eps=eps)
-    
+
    def test_rfft(self):
        inputs_val = np.random.random((1, N, N)).astype('float32')
        inputs = theano.shared(inputs_val)
-    
+
        rfft = theano.gpuarray.fft.curfft(inputs)
        f_rfft = theano.function([], rfft, mode=mode_with_gpu)
        res_rfft = f_rfft()
        res_rfft_comp = (np.asarray(res_rfft[:, :, :, 0]) +
                         1j * np.asarray(res_rfft[:, :, :, 1]))
-    
-        rfft_ref = numpy.fft.rfftn(inputs_val, axes=(1,2))
-    
+
+        rfft_ref = numpy.fft.rfftn(inputs_val, axes=(1, 2))
+
        utt.assert_allclose(rfft_ref, res_rfft_comp, atol=1e-4, rtol=1e-4)
-    
+
    def test_irfft(self):
        inputs_val = np.random.random((1, N, N)).astype('float32')
        inputs = theano.shared(inputs_val)
-    
+
        fft = theano.gpuarray.fft.curfft(inputs)
        f_fft = theano.function([], fft, mode=mode_with_gpu)
        res_fft = f_fft()
-    
+
        m = fft.type()
        ifft = theano.gpuarray.fft.cuirfft(m)
        f_ifft = theano.function([m], ifft, mode=mode_with_gpu)
        res_ifft = f_ifft(res_fft)
-    
+
        utt.assert_allclose(inputs_val, np.asarray(res_ifft))
-    
+
    def test_type(self):
        inputs_val = np.random.random((1, N)).astype('float64')
        inputs = theano.shared(inputs_val)
-    
+
        with self.assertRaises(AssertionError):
            theano.gpuarray.fft.curfft(inputs)
        with self.assertRaises(AssertionError):
            theano.gpuarray.fft.cuirfft(inputs)
-    
+
    def test_norm(self):
        inputs_val = np.random.random((1, N, N)).astype('float32')
        inputs = theano.shared(inputs_val)
-    
+
        # Unitary normalization
        rfft = theano.gpuarray.fft.curfft(inputs, norm='ortho')
        f_rfft = theano.function([], rfft, mode=mode_with_gpu)
        res_rfft = f_rfft()
        res_rfft_comp = (np.asarray(res_rfft[:, :, :, 0]) +
                         1j * np.asarray(res_rfft[:, :, :, 1]))
-    
-        rfft_ref_ortho = numpy.fft.rfftn(inputs_val, axes=(1,2), norm='ortho')
-    
+
+        rfft_ref_ortho = numpy.fft.rfftn(inputs_val, axes=(1, 2), norm='ortho')
+
        utt.assert_allclose(rfft_ref_ortho, res_rfft_comp,
-        atol=1e-4, rtol=1e-4)
-    
+                            atol=1e-4, rtol=1e-4)
+
        # No normalization
        rfft = theano.gpuarray.fft.curfft(inputs, norm='no_norm')
        f_rfft = theano.function([], rfft, mode=mode_with_gpu)
        res_rfft = f_rfft()
        res_rfft_comp = (np.asarray(res_rfft[:, :, :, 0]) +
                         1j * np.asarray(res_rfft[:, :, :, 1]))
-    
-        utt.assert_allclose(rfft_ref_ortho * np.sqrt(N*N),
-            res_rfft_comp, atol=1e-4, rtol=1e-4)
-    
+
+        utt.assert_allclose(rfft_ref_ortho * np.sqrt(N * N),
+                            res_rfft_comp, atol=1e-4, rtol=1e-4)
+
        # Inverse FFT inputs
        inputs_val = np.random.random((1, N, N // 2 + 1, 2)).astype('float32')
        inputs = theano.shared(inputs_val)
        inputs_ref = inputs_val[:, :, :, 0] + 1j * inputs_val[:, :, :, 1]
-    
+
        # Unitary normalization inverse FFT
        irfft = theano.gpuarray.fft.cuirfft(inputs, norm='ortho')
        f_irfft = theano.function([], irfft, mode=mode_with_gpu)
        res_irfft = f_irfft()
-    
-        irfft_ref_ortho = numpy.fft.irfftn(inputs_ref, axes=(1,2), norm='ortho')
-    
+
+        irfft_ref_ortho = numpy.fft.irfftn(
+            inputs_ref, axes=(1, 2), norm='ortho')
+
        utt.assert_allclose(irfft_ref_ortho,
-        res_irfft, atol=1e-4, rtol=1e-4)
-    
+                            res_irfft, atol=1e-4, rtol=1e-4)
+
        # No normalization inverse FFT
        irfft = theano.gpuarray.fft.cuirfft(inputs, norm='no_norm')
        f_irfft = theano.function([], irfft, mode=mode_with_gpu)
        res_irfft = f_irfft()
-    
-        utt.assert_allclose(irfft_ref_ortho * np.sqrt(N*N),
-            res_irfft, atol=1e-4, rtol=1e-4)
-    
+
+        utt.assert_allclose(irfft_ref_ortho * np.sqrt(N * N),
+                            res_irfft, atol=1e-4, rtol=1e-4)
+
    def test_grad(self):
        # The numerical gradient of the FFT is sensitive, must set large
        # enough epsilon to get good accuracy.
        eps = 1e-1
-    
+
        def f_rfft(inp):
            return theano.gpuarray.fft.curfft(inp)
        inputs_val = np.random.random((1, N, N)).astype('float32')
        utt.verify_grad(f_rfft, [inputs_val], eps=eps)
-    
+
        def f_irfft(inp):
            return theano.gpuarray.fft.cuirfft(inp)
        inputs_val = np.random.random((1, N, N // 2 + 1, 2)).astype('float32')
        utt.verify_grad(f_irfft, [inputs_val], eps=eps)
-    
+
        def f_rfft(inp):
            return theano.gpuarray.fft.curfft(inp, norm='ortho')
        inputs_val = np.random.random((1, N, N)).astype('float32')
        utt.verify_grad(f_rfft, [inputs_val], eps=eps)
-    
+
        def f_irfft(inp):
            return theano.gpuarray.fft.cuirfft(inp, norm='no_norm')
        inputs_val = np.random.random((1, N, N // 2 + 1, 2)).astype('float32')
        utt.verify_grad(f_irfft, [inputs_val], eps=eps)
-    
+
    def test_odd(self):
        M = N - 1
-    
+
        inputs_val = np.random.random((1, M, M)).astype('float32')
        inputs = theano.shared(inputs_val)
-    
+
        rfft = theano.gpuarray.fft.curfft(inputs)
        f_rfft = theano.function([], rfft, mode=mode_with_gpu)
        res_rfft = f_rfft()
-    
+
        res_rfft_comp = (np.asarray(res_rfft[:, :, :, 0]) +
                         1j * np.asarray(res_rfft[:, :, :, 1]))
-    
-        rfft_ref = numpy.fft.rfftn(inputs_val, s=(M,M), axes=(1,2))#, s=(M, M), axes=(1,2))
-    
-        utt.assert_allclose(rfft_ref, res_rfft_comp, atol=1e-4, rtol=1e-4) 
-    
+
+        rfft_ref = numpy.fft.rfftn(inputs_val, s=(M, M), axes=(1, 2))
+
+        utt.assert_allclose(rfft_ref, res_rfft_comp, atol=1e-4, rtol=1e-4)
+
        m = rfft.type()
        ifft = theano.gpuarray.fft.cuirfft(m, is_odd=True)
        f_ifft = theano.function([m], ifft, mode=mode_with_gpu)
        res_ifft = f_ifft(res_rfft)
-    
+
        utt.assert_allclose(inputs_val, np.asarray(res_ifft))
-        
-        inputs_val = np.random.random((1, M, M//2+1, 2)).astype('float32')
+
+        inputs_val = np.random.random((1, M, M // 2 + 1, 2)).astype('float32')
        inputs = theano.shared(inputs_val)
-        
+
        irfft = theano.gpuarray.fft.cuirfft(inputs, norm='ortho', is_odd=True)
        f_irfft = theano.function([], irfft, mode=mode_with_gpu)
        res_irfft = f_irfft()
-        
+
        inputs_ref = inputs_val[:, :, :, 0] + 1j * inputs_val[:, :, :, 1]
-        irfft_ref = numpy.fft.irfftn(inputs_ref, s=(M, M), axes=(1,2), norm='ortho')
-        
+        irfft_ref = numpy.fft.irfftn(
+            inputs_ref, s=(M, M), axes=(1, 2), norm='ortho')
+
        utt.assert_allclose(irfft_ref, res_irfft, atol=1e-4, rtol=1e-4)
-        
+
        # The numerical gradient of the FFT is sensitive, must set large
        # enough epsilon to get good accuracy.
        eps = 1e-1
-    
+
        def f_rfft(inp):
            return theano.gpuarray.fft.curfft(inp)
        inputs_val = np.random.random((1, M, M)).astype('float32')
        utt.verify_grad(f_rfft, [inputs_val], eps=eps)
-    
+
        def f_irfft(inp):
            return theano.gpuarray.fft.cuirfft(inp, is_odd=True)
        inputs_val = np.random.random((1, M, M // 2 + 1, 2)).astype('float32')
        utt.verify_grad(f_irfft, [inputs_val], eps=eps)
-        
+
        def f_rfft(inp):
            return theano.gpuarray.fft.curfft(inp, norm='ortho')
        inputs_val = np.random.random((1, M, M)).astype('float32')
        utt.verify_grad(f_rfft, [inputs_val], eps=eps)
-    
+
        def f_irfft(inp):
            return theano.gpuarray.fft.cuirfft(inp, norm='no_norm', is_odd=True)
        inputs_val = np.random.random((1, M, M // 2 + 1, 2)).astype('float32')
        utt.verify_grad(f_irfft, [inputs_val], eps=eps)
+
+    def test_params(self):
+        inputs_val = np.random.random((1, N)).astype('float32')
+        inputs = theano.shared(inputs_val)
+        with self.assertRaises(ValueError):
+            theano.gpuarray.fft.curfft(inputs, norm=123)
+
+        inputs_val = np.random.random((1, N // 2 + 1, 2)).astype('float32')
+        inputs = theano.shared(inputs_val)
+        with self.assertRaises(ValueError):
+            theano.gpuarray.fft.cuirfft(inputs, norm=123)
+        with self.assertRaises(ValueError):
+            theano.gpuarray.fft.cuirfft(inputs, is_odd=123)