提交 4eff8975 authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #1904 from f0k/fftconv-enhancements

FFT convolution: make optimization support uneven image width
......@@ -32,7 +32,8 @@ TODO: Give examples for how to use these things! They are pretty complicated.
to perform the work. You can enable it by setting
'THEANO_FLAGS=optimizer_including=conv_fft_valid:conv_fft_full'
in your environment. This is not enabled by default because it
has some restrictions on input and uses more memory.
has some restrictions on input and uses more memory. Also note
that it requires CUDA >= 5.0, scikits.cuda >= 0.5.0 and PyCUDA to run.
- :func:`conv3D <theano.tensor.nnet.Conv3D.conv3D>`. Doesn't work on the GPU.
- :func:`conv3d2d <theano.tensor.nnet.conv3d2d.conv3d>`
Another conv3d implementation that uses the conv2d with data reshaping.
......
......@@ -662,7 +662,7 @@ class GpuConv(GpuOp):
def c_compile_args(self):
    """Return extra compiler arguments for the generated CUDA kernel.

    The kernel width is passed through ``-DTHEANO_KERN_WID`` so the C
    code can size/unroll its inner loop at compile time.  Falls back to
    0 when the kernel shape, or its width entry, is unknown.
    """
    nb = 0
    # Guard both against an unknown kernel shape and against a known
    # shape whose width entry is None (partially-symbolic shapes).
    if (self.kshp is not None) and (self.kshp[1] is not None):
        nb = self.kshp[1]
    return ['-DTHEANO_KERN_WID=' + str(nb)]  # ,'-g','-G']
......
......@@ -1119,14 +1119,32 @@ def local_gpu_conv(node):
return [out]
def _gpu_conv_to_fftconv(node):
    """Build a conv2d_fft graph equivalent to the given GpuConv node.

    Shared helper used by both local_conv_fft_valid and
    local_conv_fft_full.
    """
    # Imported here, not at module level, so that merely loading this
    # module does not trigger pycuda warnings.
    from theano.sandbox.cuda.fftconv import conv2d_fft

    kwargs = {'border_mode': node.op.border_mode}
    imshp = node.op.imshp
    odd_width = (imshp is not None and
                 imshp[-1] is not None and
                 imshp[-1] % 2 == 1)
    if odd_width:
        # Request padding when the last image dimension is known and odd.
        kwargs['pad_last_dim'] = True
    # TODO: If the user supplied the full nonsymbolic image_shape and
    # filter_shape in conv2d(), we could pass it on to conv2d_fft(). However,
    # information on batch size and channel counts is currently discarded
    # when a ConvOp is replaced by a GpuConv, so this would need more changes.
    #if (node.op.imshp is not None) and (None not in node.op.imshp):
    #    kwargs['image_shape'] = (bsize, inchannels) + node.op.imshp
    #if (node.op.kshp is not None) and (None not in node.op.kshp):
    #    kwargs['filter_shape'] = (outchannels, inchannels) + node.op.kshp
    return conv2d_fft(node.inputs[0], node.inputs[1], **kwargs)
@local_optimizer([GpuConv])
def local_conv_fft_valid(node):
    """Replace a valid-mode, unstrided GpuConv by an FFT convolution.

    Registered as the opt-in 'conv_fft_valid' GPU optimization.  Returns
    a one-element replacement list, or None (implicitly) when the node
    does not match.
    """
    if (isinstance(node.op, GpuConv) and
        node.op.border_mode == 'valid' and
        node.op.subsample == (1, 1)):
        # Delegate to the shared helper, which also enables pad_last_dim
        # for images whose last dimension is known and odd.
        return [_gpu_conv_to_fftconv(node)]
@local_optimizer([GpuConv])
def local_conv_fft_full(node):
    """Replace a full-mode, unstrided GpuConv by an FFT convolution.

    Registered as the opt-in 'conv_fft_full' GPU optimization.  Returns
    a one-element replacement list, or None (implicitly) when the node
    does not match.
    """
    if (isinstance(node.op, GpuConv) and
        node.op.border_mode == 'full' and
        node.op.subsample == (1, 1)):
        # Delegate to the shared helper, which also enables pad_last_dim
        # for images whose last dimension is known and odd.
        return [_gpu_conv_to_fftconv(node)]
# Register the FFT-convolution rewrites.  They are opt-in: enabled via
# THEANO_FLAGS=optimizer_including=conv_fft_valid:conv_fft_full.
gpu_optimizer.register("conv_fft_valid", local_conv_fft_valid)
gpu_optimizer.register("conv_fft_full", local_conv_fft_full)
......
......@@ -11,6 +11,7 @@ if cuda_ndarray.cuda_available == False:
raise SkipTest('Optional package cuda disabled')
from theano.sandbox.cuda import float32_shared_constructor as shared
import theano.sandbox.cuda.fftconv
# In FAST_COMPILE mode, build the GPU test mode from FAST_RUN instead.
# NOTE(review): an `else` branch defining mode_with_gpu for other modes
# presumably follows — it is outside this diff fragment; confirm upstream.
if theano.config.mode == 'FAST_COMPILE':
    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论