Merge pull request #1870 from abergeron/cuda_fftconv

Cuda fftconv

Merge pull request #1870 from abergeron/cuda_fftconv
75550055 · Frédéric Bastien · 5f219fca · 4cf06d2b · 75550055 · 75550055
--- a/doc/library/tensor/nnet/conv.txt
+++ b/doc/library/tensor/nnet/conv.txt
@@ -27,6 +27,12 @@ TODO: Give examples for how to use these things! They are pretty complicated.
 - Conv implemented
    - :func:`signal.conv2d <theano.tensor.signal.conv.conv2d>`.
    - :func:`nnet.conv2d <theano.tensor.nnet.conv.conv2d>`.
+    - :func:`conv2d_fft <theano.sandbox.cuda.fftconv.conv2d_fft>`
+      This is a GPU-only version of conv2d that uses an FFT
+      to perform the work.  You can enable it by setting
+      'THEANO_FLAGS=optimizer_including=conv_fft_valid:conv_fft_full'
+      in your environment.  This is not enabled by default because it
+      has some restrictions on input and uses more memory.
    - :func:`conv3D <theano.tensor.nnet.Conv3D.conv3D>`. Doesn't work on the GPU.
    - :func:`conv3d2d <theano.tensor.nnet.conv3d2d.conv3d>`
      Another conv3d implementation that uses the conv2d with data reshaping.

--- a/theano/misc/pycuda_utils.py
+++ b/theano/misc/pycuda_utils.py
 import numpy
 import pycuda.gpuarray

-import theano.sandbox.cuda as cuda
+from theano.sandbox import cuda
 if cuda.cuda_available == False:
    raise ImportError('Optional theano package cuda disabled')


--- a/theano/sandbox/cuda/__init__.py
+++ b/theano/sandbox/cuda/__init__.py
@@ -33,7 +33,6 @@ AddConfigVar('cublas.lib',
        """Name of the cuda blas library for the linker.""",
        StrParam('cublas'))

-
 #is_nvcc_available called here to initialize global vars in
 #nvcc_compiler module
 nvcc_compiler.is_nvcc_available()

--- a/theano/sandbox/cuda/fftconv.py
+++ b/theano/sandbox/cuda/fftconv.py
--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -40,6 +40,7 @@ from theano.sandbox.cuda.elemwise import SupportCodeError
 from theano.scalar.basic_scipy import Erfinv
 from theano.sandbox.cuda.elemwise import erfinv_gpu
 from theano.sandbox.cuda.var import CudaNdarrayConstant
+from theano.sandbox.cuda.fftconv import conv2d_fft
 from theano.scan_module import scan_utils, scan_op, scan_opt
 from theano.tensor.blas import _is_real_vector, _is_real_matrix
 linalg = None
@@ -1118,8 +1119,27 @@ def local_gpu_conv(node):
            # differently then the gpu ConvOp
            return [out]

-import theano.tensor.signal.downsample as downsample

+@local_optimizer([GpuConv])
+def local_conv_fft_valid(node):
+    if (isinstance(node.op, GpuConv) and
+        node.op.border_mode == 'valid' and
+        node.op.subsample == (1, 1)):
+        return [conv2d_fft(node.inputs[0], node.inputs[1])]
+
+
+@local_optimizer([GpuConv])
+def local_conv_fft_full(node):
+    if (isinstance(node.op, GpuConv) and
+        node.op.border_mode == 'full' and
+        node.op.subsample == (1, 1)):
+        return [conv2d_fft(node.inputs[0], node.inputs[1], border_mode='full')]
+
+gpu_optimizer.register("conv_fft_valid", local_conv_fft_valid)
+gpu_optimizer.register("conv_fft_full", local_conv_fft_full)
+
+
+import theano.tensor.signal.downsample as downsample

 @register_opt()
 @local_optimizer([downsample.DownsampleFactorMax])

--- a/theano/sandbox/cuda/tests/test_fftconv.py
+++ b/theano/sandbox/cuda/tests/test_fftconv.py
+import unittest
+import numpy
+
+import theano
+from theano.tests import unittest_tools as utt
+
+# Skip tests if cuda_ndarray is not available.
+from nose.plugins.skip import SkipTest
+import theano.sandbox.cuda as cuda_ndarray
+if cuda_ndarray.cuda_available == False:
+    raise SkipTest('Optional package cuda disabled')
+
+from theano.sandbox.cuda import float32_shared_constructor as shared
+
+if theano.config.mode == 'FAST_COMPILE':
+    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
+else:
+    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
+
+
+class TestConv2dFFT(unittest.TestCase):
+    def run_conv(self, inputs_shape, filters_shape, pad=False, **other_args):
+        inputs_val = numpy.random.random(inputs_shape).astype('float32')
+        filters_val = numpy.random.random(filters_shape).astype('float32')
+
+        inputs = shared(inputs_val)
+        filters = shared(filters_val)
+
+        conv_ref = theano.tensor.nnet.conv.conv2d(inputs, filters,
+                                                  **other_args)
+        conv_fft = theano.sandbox.cuda.fftconv.conv2d_fft(inputs, filters,
+                                                          pad_last_dim=pad,
+                                                          **other_args)
+
+        f_ref = theano.function([], conv_ref)
+        f_fft = theano.function([], conv_fft, mode=mode_with_gpu)
+
+        res_ref = f_ref()
+        res_fft = f_fft()
+
+        utt.assert_allclose(res_ref, res_fft)
+
+    def test_valid(self):
+        self.run_conv(inputs_shape=(5, 3, 7, 6),
+                      filters_shape=(2, 3, 3, 3),
+                      border_mode='valid')
+        self.run_conv(inputs_shape=(5, 3, 7, 7),
+                      filters_shape=(2, 3, 3, 3),
+                      border_mode='valid', pad=True)
+
+    def test_full(self):
+        self.run_conv(inputs_shape=(5, 3, 7, 6),
+                      filters_shape=(2, 3, 3, 3),
+                      border_mode='full')
+        self.run_conv(inputs_shape=(5, 3, 7, 7),
+                      filters_shape=(2, 3, 3, 3),
+                      border_mode='full', pad=True)
+
+    def test_opt_valid(self):
+        inputs_shape = (5, 3, 7, 6)
+        filters_shape = (2, 3, 3, 3)
+
+        inputs_val = numpy.random.random(inputs_shape).astype('float32')
+        filters_val = numpy.random.random(filters_shape).astype('float32')
+
+        inputs = shared(inputs_val)
+        filters = shared(filters_val)
+
+        conv = theano.tensor.nnet.conv.conv2d(inputs, filters)
+
+        mode = mode_with_gpu.including('conv_fft_valid')
+
+        f_ref = theano.function([], conv)
+        f_fft = theano.function([], conv, mode=mode)
+
+        # make sure we inserted the fft trickery
+        topo = f_fft.maker.fgraph.toposort()
+        assert sum(isinstance(n.op, theano.sandbox.cuda.fftconv.CuFFTOp)
+                   for n in topo) == 2
+
+
+        res_ref = f_ref()
+        res_fft = f_fft()
+
+        utt.assert_allclose(res_ref, res_fft)
+
+    def test_opt_full(self):
+        inputs_shape = (5, 3, 7, 6)
+        filters_shape = (2, 3, 3, 3)
+
+        inputs_val = numpy.random.random(inputs_shape).astype('float32')
+        filters_val = numpy.random.random(filters_shape).astype('float32')
+
+        inputs = shared(inputs_val)
+        filters = shared(filters_val)
+
+        conv = theano.tensor.nnet.conv.conv2d(inputs, filters,
+                                              border_mode='full')
+
+        mode = mode_with_gpu.including('conv_fft_full')
+
+        f_ref = theano.function([], conv)
+        f_fft = theano.function([], conv, mode=mode)
+
+        # make sure we inserted the fft trickery
+        topo = f_fft.maker.fgraph.toposort()
+        assert sum(isinstance(n.op, theano.sandbox.cuda.fftconv.CuFFTOp)
+                   for n in topo) == 2
+
+        res_ref = f_ref()
+        res_fft = f_fft()
+
+        utt.assert_allclose(res_ref, res_fft)
--- a/theano/tensor/nnet/tests/test_conv.py
+++ b/theano/tensor/nnet/tests/test_conv.py
@@ -12,9 +12,11 @@ from theano.tensor.basic import _allclose, NotScalarConstantError


 class TestConv2D(utt.InferShapeTester):
+    mode = None
+    dtype = 'float64'

    def setUp(self):
-        super (TestConv2D, self).setUp()
+        super(TestConv2D, self).setUp()
        self.input = T.dtensor4('input')
        self.input.name = 'default_V'
        self.filters = T.dtensor4('filters')
@@ -67,11 +69,11 @@ class TestConv2D(utt.InferShapeTester):

        output = sym_conv2d(input, filters)
        output.name = 'conv2d(%s,%s)' % (input.name, filters.name)
-        theano_conv = theano.function([input, filters], output)
+        theano_conv = theano.function([input, filters], output, mode=self.mode)

        # initialize input and compute result
-        image_data = numpy.random.random(N_image_shape)
-        filter_data = numpy.random.random(N_filter_shape)
+        image_data = numpy.random.random(N_image_shape).astype(self.dtype)
+        filter_data = numpy.random.random(N_filter_shape).astype(self.dtype)
        try:
            theano_output = theano_conv(image_data, filter_data)
        except ValueError: