Merge pull request #2000 from ballasn/3dfftconv

3dfftconv

Merge pull request #2000 from ballasn/3dfftconv
93be9cb8 · abergeron · 9b3ea9e0 · 642f9ece · 93be9cb8 · 93be9cb8
--- a/doc/library/tensor/nnet/conv.txt
+++ b/doc/library/tensor/nnet/conv.txt
@@ -28,13 +28,24 @@ TODO: Give examples for how to use these things! They are pretty complicated.
    - :func:`signal.conv2d <theano.tensor.signal.conv.conv2d>`.
    - :func:`nnet.conv2d <theano.tensor.nnet.conv.conv2d>`.
    - :func:`conv2d_fft <theano.sandbox.cuda.fftconv.conv2d_fft>`
-      This is a GPU-only version of conv2d that uses an FFT transform
+      This is a GPU-only version of nnet.conv2d that uses an FFT transform
-      to perform the work.  You can enable it by setting
+      to perform the work. conv2d_fft should not be used directly as it
+      does not implement a grad function. Instead, you should use
+      nnet.conv2d and enable the fft optimization by setting
      'THEANO_FLAGS=optimizer_including=conv_fft_valid:conv_fft_full'
      in your environement.  This is not enabled by default because it
      has some restrictions on input and uses more memory.  Also note
      that it requires CUDA >= 5.0, scikits.cuda >= 0.5.0 and PyCUDA to run.
-    - :func:`conv3D <theano.tensor.nnet.Conv3D.conv3D>`. Doesn't work on the GPU.
+    - :func:`conv3D <theano.tensor.nnet.Conv3D.conv3D>`
+      3D Convolution. Doesn't work on the GPU.
+    - :func:`conv3d_fft <theano.sandbox.cuda.fftconv.conv3d_fft>`
+      GPU-only version of conv3D using FFT transform. conv3d_fft should
+      not be called directly as it does not implement a grad function.
+      You can enable it by setting THEANO_FLAGS to
+      'optimizer_including=conv3d_fft:convgrad3d_fft:convtransp3d_fft'
+      This is not enabled by default because it has some restrictions on
+      input and uses more memory. Also note that it requires CUDA >= 5.0,
+      scikits.cuda >= 0.5.0 and PyCUDA to run.
    - :func:`conv3d2d <theano.tensor.nnet.conv3d2d.conv3d>`
      Another conv3d implementation that uses the conv2d with data reshaping.
      It is faster in some cases than conv3d, specifically on the GPU.

--- a/theano/sandbox/cuda/fftconv.py
+++ b/theano/sandbox/cuda/fftconv.py
@@ -5,6 +5,7 @@ import theano
 import theano.tensor as T
 from theano.sandbox.cuda import cuda_available, GpuOp
+from theano.ifelse import ifelse
 if cuda_available:
    from theano.sandbox.cuda import (basic_ops, CudaNdarrayType,
@@ -509,3 +510,156 @@ def conv2d_fft(input, filters, image_shape=None, filter_shape=None,
    # output should now be the result of a batched valid convolution
    # of the input with the filters.
    return basic_ops.as_cuda_ndarray_variable(output)
+def conv3d_fft(input, filters, image_shape=None, filter_shape=None,
+               border_mode='valid', pad_last_dim=False):
+    """
+    Perform a convolution through fft.
+    Only supports input whose shape is even on the last dimension.
+    All other dimensions can be anything and the filters can
+    have an even or odd last dimension.
+    The semantics associated with the last three dimensions
+    are not important as long as they are in the same order between
+    the inputs and the filters. For example, when the convolution
+    is done on a sequence of images, they could be either
+    (duration, height, width) or (height, width, duration).
+    If you must use input which has an odd width, you can either pad
+    it or use the `pad_last_dim` argument which will do it for you and
+    take care to strip the padding before returning. pad_last_dim checks
+    that the last dimension is odd before the actual padding.
+    On valid mode the filters must be smaller than the input.
+    input: (b, ic, i0, i1, i2)
+    filters: (oc, ic, f0, f1, f2)
+    border_mode: 'valid' or 'full'
+    pad_last_dim: If True, pad the last dimension of the input when it
+                  is odd, to turn it from odd to even.  Will strip the
+                  padding before returning the result.
+    """
+    # use symbolic shapes to compute shape info at runtime if not specified
+    if image_shape is None:
+        image_shape = input.shape
+    if filter_shape is None:
+        filter_shape = filters.shape
+    # batch size, input channels, input dim 0, input dim 1, input dim 2
+    b, ic, i0, i1, i2 = image_shape
+    # output channels, input channels, filter dim 0, filter dim 1, filter dim 2
+    oc, ic_, f0, f1, f2 = filter_shape
+    # Check that the last dimension is odd
+    is_odd = T.eq(T.mod(input.shape[4], 2), 1)
+    # pad filters/image to output shape
+    if border_mode == 'valid':
+        o0 = i0
+        o1 = i1
+        o2 = i2
+        input_padded = input
+        if pad_last_dim:
+            o2 = ifelse(is_odd, o2 + 1, o2)
+            input_padded = T.zeros((b, ic, o0, o1, o2), dtype='float32')
+            input_padded = T.set_subtensor(input_padded[:, :, :i0, :i1, :i2],
+                                           input)
+        filters_padded = T.zeros((oc, ic, o0, o1, o2), dtype='float32')
+        filters_padded = T.set_subtensor(filters_padded[:, :, :f0, :f1, :f2],
+                                         filters)
+    elif border_mode == 'full':
+        # In this particular case, the values of (o0, o1, o2) represent
+        # the dimensions of the work buffer more than the actual dimensions
+        # of the desired output.
+        o0 = i0 + 2 * (f0 - 1)
+        o1 = i1 + 2 * (f1 - 1)
+        o2 = i2 + 2 * (f2 - 1)
+        if pad_last_dim:
+            o2 = ifelse(is_odd, o2 + 1, o2)
+        # We line up the filters and the images in a way
+        # such that the filters are tightly placed against the
+        # top-left of the array, and the images intersect with
+        # them on one pixel. The top-left pixel of the images
+        # is the bottom-right pixel of the filters when we
+        # do the layout here.
+        filters_padded = T.zeros((oc, ic, o0, o1, o2), dtype='float32')
+        filters_padded = T.set_subtensor(filters_padded[:, :, :f0, :f1, :f2],
+                                         filters)
+        input_padded = T.zeros((b, ic, o0, o1, o2), dtype='float32')
+        input_padded = T.set_subtensor(input_padded[:, :, (f0 - 1):(f0 - 1 + i0), (f1 - 1):(f1 - 1 + i1), (f2 - 1):(f2 - 1 + i2)],
+                                       input)
+    else:
+        raise ValueError('invalid mode')
+    # reshape for FFT
+    input_flat = input_padded.reshape((b * ic, o0, o1, o2))
+    filters_flat = filters_padded.reshape((oc * ic, o0, o1, o2))
+    # perform FFT
+    input_fft_flat = cufft(input_flat)  # (b * ic, o0, o1, o2//2 + 1, 2)
+    filters_fft_flat = cufft(filters_flat)  # (oc * ic, o0, o1, o2//2 + 1, 2)
+    # Unfold ic dimension.
+    # We have to collapse two dimensions together
+    # in order to reuse the same `mult_and_reduce`.
+    # This explains the o0 * o1 instead of just keeping
+    # the two dimensions intact.
+    input_fft_v_shape = (b, ic, o0 * o1, o2 // 2 + 1, 2)
+    filters_fft_v_shape = (oc, ic, o0 * o1, o2 // 2 + 1, 2)
+    input_fft_v = input_fft_flat.reshape(input_fft_v_shape)
+    filters_fft_v = filters_fft_flat.reshape(filters_fft_v_shape)
+    # (b, oc, o0 * o1, o2//2 + 1, 2)
+    output_fft_s = mult_and_reduce(input_fft_v, filters_fft_v,
+                                   input_shape=input_fft_v_shape,
+                                   filter_shape=filters_fft_v_shape)
+    #output_fft_s = input_fft_v
+    # reshape for IFFT
+    output_fft_flat = output_fft_s.reshape((b * oc, o0, o1, o2 // 2 + 1, 2))
+    # perform IFFT
+    output_flat = cuifft(output_fft_flat)  # (b * oc, o0, o1, o2)
+    # reshape
+    output_circ = output_flat.reshape((b, oc, o0, o1, o2))  # circular!
+    # Now we extract the region of interest.
+    # We just cut it out from the output_circ
+    # array that was used for the computation.
+    # We do not need to handle pad_last_dim in a
+    # special way because we specify explicitly here
+    # how many values are expected.
+    if border_mode == 'valid':
+        output = output_circ[:, :, (f0-1):(f0-1 + i0-f0+1), (f1-1):(f1-1 + i1-f1+1), (f2-1):(f2-1 + i2-f2+1)]
+    elif border_mode == 'full':
+        output = output_circ[:, :, (f0-1):(f0-1 + i0+f0-1), (f1-1):(f1-1 + i1+f1-1), (f2-1):(f2-1 + i2+f2-1)]
+    else:
+        raise ValueError('invalid mode')
+    #output = output_circ[:, :, :, :, :]
+    # Rescale manually. This is just a factor that comes in during the
+    # trip through FFT and inverse FFT.
+    output = (1.0 / T.cast(o0 * o1 * o2, 'float32')) * output
+    # output should now be the result of a batched convolution
+    # (valid or full, per border_mode) of the input with the filters.
+    return basic_ops.as_cuda_ndarray_variable(output)
--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -1256,6 +1256,87 @@ def local_conv_fft_full(node):
 gpu_optimizer.register("conv_fft_valid", local_conv_fft_valid)
 gpu_optimizer.register("conv_fft_full", local_conv_fft_full)
+from theano.tensor.nnet.Conv3D import Conv3D
+@local_optimizer([Conv3D])
+def local_conv3d_fft(node):
+    try:
+        stride_x = tensor.get_scalar_constant_value(node.inputs[3][0])
+        stride_y = tensor.get_scalar_constant_value(node.inputs[3][1])
+        stride_z = tensor.get_scalar_constant_value(node.inputs[3][2])
+    except tensor.NotScalarConstantError:
+        return False
+    if (isinstance(node.op, Conv3D) and
+        (stride_x, stride_y, stride_z) == (1, 1, 1)):
+        # we import conv3d_fft locally to avoid pycuda warnings
+        from theano.sandbox.cuda.fftconv import conv3d_fft
+        # Shuffle inputs signal from (b, 0, 1, t, c) to (b, c, 0, 1, t)
+        x = node.inputs[0]
+        x = gpu_from_host(x.dimshuffle(0, 4, 1, 2, 3))
+        # Shuffle filters from (oc, 0, 1, t, ic) to (oc, ic, 0, 1, t)
+        f = node.inputs[1]
+        f = gpu_from_host(f.dimshuffle(0, 4, 1, 2, 3))
+        # filter flip
+        f = f[:,:,::-1,::-1,::-1]
+        rval = conv3d_fft(x, f, border_mode='valid', pad_last_dim=True)
+        # Shuffle from (b, oc, 0, 1, t) to (b, 0, 1, t, oc)
+        return [rval.dimshuffle(0, 2, 3, 4, 1) + node.inputs[2]]
+gpu_optimizer.register("conv3d_fft", local_conv3d_fft)
+from theano.tensor.nnet.ConvGrad3D import ConvGrad3D
+@local_optimizer([ConvGrad3D])
+def local_convgrad3d_fft(node):
+    try:
+        stride_x = tensor.get_scalar_constant_value(node.inputs[1][0])
+        stride_y = tensor.get_scalar_constant_value(node.inputs[1][1])
+        stride_z = tensor.get_scalar_constant_value(node.inputs[1][2])
+    except tensor.NotScalarConstantError:
+        return False
+    if (isinstance(node.op, ConvGrad3D) and
+        (stride_x, stride_y, stride_z) == (1, 1, 1)):
+        # we import conv3d_fft locally to avoid pycuda warnings
+        from theano.sandbox.cuda.fftconv import conv3d_fft
+        # Shuffle inputs signal from (b, 0, 1, t, ic) to (ic, b, 0, 1, t)
+        x = node.inputs[0]
+        x = x.dimshuffle(4, 0, 1, 2, 3)
+        # Shuffle dCdH from (b, 0, 1, t, oc) to (oc, b, 0, 1, t)
+        f = node.inputs[3]
+        f = f.dimshuffle(4, 0, 1, 2, 3)
+        # filter flip
+        f = f[:,:,::-1,::-1,::-1]
+        rval = conv3d_fft(x, f, border_mode='valid', pad_last_dim=True)
+        # Shuffle from (ic, oc, 0, 1, t) to (oc, 0, 1, t, ic)
+        return [rval.dimshuffle(1, 2, 3, 4, 0)]
+gpu_optimizer.register("convgrad3d_fft", local_convgrad3d_fft)
+from theano.tensor.nnet.ConvTransp3D import ConvTransp3D
+@local_optimizer([ConvTransp3D])
+def local_convtransp3d_fft(node):
+    try:
+        stride_x = tensor.get_scalar_constant_value(node.inputs[2][0])
+        stride_y = tensor.get_scalar_constant_value(node.inputs[2][1])
+        stride_z = tensor.get_scalar_constant_value(node.inputs[2][2])
+    except tensor.NotScalarConstantError:
+        return False
+    if (isinstance(node.op, ConvTransp3D) and
+        (stride_x, stride_y, stride_z) == (1, 1, 1)):
+        # we import conv3d_fft locally to avoid pycuda warnings
+        from theano.sandbox.cuda.fftconv import conv3d_fft
+        # Shuffle filters from (oc, 0, 1, t, ic) to (ic, oc, 0, 1, t)
+        x = node.inputs[0]
+        x = x.dimshuffle(4, 0, 1, 2, 3)
+        # Shuffle dCdH from (b, 0, 1, t, oc) to (b, oc, 0, 1, t)
+        f = node.inputs[3]
+        f = f.dimshuffle(0, 4, 1, 2, 3)
+        rval = conv3d_fft(f, x, border_mode='full', pad_last_dim=True)
+        # Shuffle from (b, ic, 0, 1, t) to (b, 0, 1, t, ic)
+        return [rval.dimshuffle(0, 2, 3, 4, 1) + node.inputs[1]]
+gpu_optimizer.register("convtransp3d_fft", local_convtransp3d_fft)
 import theano.tensor.signal.downsample as downsample

--- a/theano/sandbox/cuda/tests/test_fftconv.py
+++ b/theano/sandbox/cuda/tests/test_fftconv.py
@@ -118,3 +118,166 @@ class TestConv2dFFT(unittest.TestCase):
        res_fft = f_fft()
        utt.assert_allclose(res_ref, res_fft)
+class TestConv3dFFT(unittest.TestCase):
+    def run_conv_valid(self, inputs_shape, filters_shape, pad=False):
+        inputs_val = numpy.random.random(inputs_shape).astype('float32')
+        filters_val = numpy.random.random(filters_shape).astype('float32')
+        inputs = shared(inputs_val)
+        filters = shared(filters_val)
+        bias = shared(numpy.zeros(filters_shape[0]).astype('float32'))
+        # Flip filter as conv3D computes correlation
+        filters_flip = filters[:,::-1,::-1,::-1,:]
+        #filters_flip = filters
+        conv_ref = theano.tensor.nnet.conv3D(V=inputs, W=filters_flip,
+                                             b=bias, d=(1,1,1))
+        conv_fft = theano.sandbox.cuda.fftconv.conv3d_fft(inputs.dimshuffle(0, 4, 1, 2, 3),
+                                                          filters.dimshuffle(0, 4, 1, 2, 3),
+                                                          border_mode = "valid",
+                                                          pad_last_dim = pad)
+        conv_fft = conv_fft.dimshuffle(0, 2, 3, 4, 1)
+        f_ref = theano.function([], conv_ref)
+        f_fft = theano.function([], conv_fft, mode=mode_with_gpu)
+        res_ref = f_ref()
+        res_fft = f_fft()
+        utt.assert_allclose(res_ref, res_fft,  rtol=1e-05, atol=1e-05)
+    def run_conv_full(self, inputs_shape, filters_shape, pad=False):
+        inputs_val = numpy.random.random(inputs_shape).astype('float32')
+        filters_val = numpy.random.random(filters_shape).astype('float32')
+        inputs = shared(inputs_val)
+        filters = shared(filters_val)
+        bias = shared(numpy.zeros(filters_shape[4]).astype('float32'))
+        conv_ref = theano.tensor.nnet.convTransp3D(W=filters, b=bias, d=(1,1,1),
+                                                   H=inputs)
+        filters = filters.dimshuffle(4, 0, 1, 2, 3)
+        inputs = inputs.dimshuffle(0, 4, 1, 2, 3)
+        conv_fft = theano.sandbox.cuda.fftconv.conv3d_fft(inputs, filters,
+                                                          border_mode = "full",
+                                                          pad_last_dim = pad)
+        conv_fft = conv_fft.dimshuffle(0, 2, 3, 4, 1)
+        f_ref = theano.function([], conv_ref)
+        f_fft = theano.function([], conv_fft, mode=mode_with_gpu)
+        res_ref = f_ref()
+        res_fft = f_fft()
+        utt.assert_allclose(res_ref, res_fft,  rtol=1e-04, atol=1e-04)
+    def test_valid(self):
+        self.run_conv_valid(inputs_shape=(16, 20, 32, 16, 1),
+                            filters_shape=(10, 6, 12, 4, 1),
+                            pad=True)
+        self.run_conv_valid(inputs_shape=(16, 20, 32, 15, 1),
+                            filters_shape=(10, 6, 12, 4, 1),
+                            pad=True)
+    def test_full(self):
+        self.run_conv_full(inputs_shape=(16, 15, 21, 16, 10),
+                           filters_shape=(10, 6, 12, 4, 1),
+                           pad=True)
+        self.run_conv_full(inputs_shape=(16, 15, 21, 12, 10),
+                           filters_shape=(10, 6, 12, 4, 1),
+                           pad=True)
+    def test_opt_conv3d_fft(self):
+        inputs_shape = (16, 20, 32, 16, 1)
+        filters_shape = (10, 6, 12, 4, 1)
+        inputs_val = numpy.random.random(inputs_shape).astype('float32')
+        filters_val = numpy.random.random(filters_shape).astype('float32')
+        inputs = shared(inputs_val)
+        filters = shared(filters_val)
+        bias = shared(numpy.zeros(filters_shape[0]).astype('float32'))
+        conv = theano.tensor.nnet.conv3D(V=inputs, W=filters,
+                                         b=bias, d=(1,1,1))
+        mode = mode_with_gpu.including('conv3d_fft')
+        f_ref = theano.function([], conv)
+        f_fft = theano.function([], conv, mode=mode)
+        # make sure we inserted the fft trickery
+        topo = f_fft.maker.fgraph.toposort()
+        assert sum(isinstance(n.op, theano.sandbox.cuda.fftconv.CuFFTOp)
+                   for n in topo) == 2
+        res_ref = f_ref()
+        res_fft = f_fft()
+        utt.assert_allclose(res_ref, res_fft)
+    def test_opt_convgrad3d_fft(self):
+        inputs_shape = (16, 20, 32, 16, 1)
+        filters_shape = (10, 6, 12, 4, 1)
+        dCdH_shape = (16, 15, 21, 13, 10)
+        inputs_val = numpy.random.random(inputs_shape).astype('float32')
+        dCdH_val = numpy.random.random(dCdH_shape).astype('float32')
+        inputs = shared(inputs_val)
+        dCdH = shared(dCdH_val)
+        conv = theano.tensor.nnet.convGrad3D(V=inputs, dCdH=dCdH,
+                                             WShape=filters_shape,
+                                             d=(1,1,1))
+        mode = mode_with_gpu.including('convgrad3d_fft')
+        f_ref = theano.function([], conv)
+        f_fft = theano.function([], conv, mode=mode)
+        # make sure we inserted the fft trickery
+        topo = f_fft.maker.fgraph.toposort()
+        assert sum(isinstance(n.op, theano.sandbox.cuda.fftconv.CuFFTOp)
+                   for n in topo) == 2
+        res_ref = f_ref()
+        res_fft = f_fft()
+        utt.assert_allclose(res_ref, res_fft,  rtol=1e-04, atol=1e-04)
+    def test_opt_convtransp3d_fft(self):
+        inputs_shape = (16, 15, 21, 12, 10)
+        filters_shape = (10, 6, 12, 4, 1)
+        inputs_val = numpy.random.random(inputs_shape).astype('float32')
+        filters_val = numpy.random.random(filters_shape).astype('float32')
+        bias = shared(numpy.zeros(filters_shape[4]).astype('float32'))
+        inputs = shared(inputs_val)
+        filters = shared(filters_val)
+        conv = theano.tensor.nnet.convTransp3D(W=filters, b=bias, d=(1,1,1),
+                                               H=inputs)
+        mode = mode_with_gpu.including('convtransp3d_fft')
+        f_ref = theano.function([], conv)
+        f_fft = theano.function([], conv, mode=mode)
+        # make sure we inserted the fft trickery
+        topo = f_fft.maker.fgraph.toposort()
+        assert sum(isinstance(n.op, theano.sandbox.cuda.fftconv.CuFFTOp)
+                   for n in topo) == 2
+        res_ref = f_ref()
+        res_fft = f_fft()
+        utt.assert_allclose(res_ref, res_fft, rtol=1e-04, atol=1e-04)