Refactored GpuCorrMM into separate ops for the forward pass (GpuCorrMM) and the two backward passes (GpuCorrMM_gradInputs, GpuCorrMM_gradWeights)

a725adf3 · f0k · e76a29d9 · a725adf3 · a725adf3 · a725adf3
--- a/theano/sandbox/cuda/blas.py
+++ b/theano/sandbox/cuda/blas.py
--- a/theano/sandbox/cuda/conv_gemm.cu
+++ b/theano/sandbox/cuda/conv_gemm.cu
--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -25,7 +25,8 @@ from theano.sandbox.cuda.basic_ops import (
    GpuIncSubtensor, gpu_alloc, GpuAlloc, gpu_shape)
 from theano.sandbox.cuda.type import CudaNdarrayType
 from theano.sandbox.cuda.blas import (gpu_dot22, gpu_dot22scalar,
-        gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv, GpuCorrMM)
+        gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv,
+        GpuCorrMM, GpuCorrMM_gradInputs, GpuCorrMM_gradWeights)
 from theano.sandbox.cuda.blas import gpu_gemv_inplace
 from theano.sandbox.cuda.blas import gpu_gemv_no_inplace
 from theano.sandbox.cuda.blas import gpu_ger_inplace
@@ -1354,19 +1355,23 @@ def local_conv_gemm(node):
        border_mode = node.op.border_mode
        subsample = node.op.subsample
        pad = (0,0)
-        if (border_mode == 'full') and ((subsample != (1,1)) or (pad != (0,0))):
+        if (border_mode == 'full') and (subsample != (1,1)):
            # need to simulate this via a padded valid convolution
            pad = 'auto'
            border_mode = 'valid'
        if (border_mode == 'valid'):
            # need to flip the kernel for valid convolution
-            kern = gpu_contiguous(kern[:, :, ::-1, ::-1])
+            kern = kern[:, :, ::-1, ::-1]
+            # call GpuCorrMM
+            # TODO: call GpuCorrMM_gradWeights instead if appropriate
+            return [GpuCorrMM('valid', subsample, pad)(
+                    gpu_contiguous(img), gpu_contiguous(kern))]
        elif (border_mode == 'full'):
-            # need to bring kernel into correct memory layout for full convolution
+            # need to dimshuffle the kernel for full convolution
-            kern = gpu_contiguous(kern.dimshuffle(1, 0, 2, 3)).dimshuffle(1, 0, 2, 3)
+            kern = kern.dimshuffle(1, 0, 2, 3)
-        # need C-contiguous inputs
+            # call GpuCorrMM_gradInputs
-        img = gpu_contiguous(img)
+            return [GpuCorrMM_gradInputs('valid', subsample, pad)(
-        return [GpuCorrMM(border_mode, subsample, pad)(img, kern)]
+                    gpu_contiguous(kern), gpu_contiguous(img))]
 gpu_optimizer.register("conv_gemm", local_conv_gemm)

--- a/theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
+++ b/theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
@@ -186,7 +186,7 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
        f = theano.function([i, k], op, mode=theano_mode)
        if cls is not None:
            assert any([isinstance(node.op, cls)
-                        for node in f.maker.fgraph.toposort()]), f.maker.fgraph.toposort()
+                        for node in f.maker.fgraph.toposort()]), "Cannot find class %r in %r" % (cls, f.maker.fgraph.toposort())
        gpuval = f(img, kern)
        t2 = time.time()
        for i in range(nb_iter):
@@ -284,7 +284,7 @@ def exec_conv(version, shapes, verbose, random, mode,
                        cls=cls)
            except Exception, e:
                print ver, id, (ishape, kshape, subshape, istride, kstride)
-                print e
+                print "Exception", type(e), e
                pass
            if not ret:
                failed_version.add(ver)
@@ -634,7 +634,7 @@ def test_valid(conv_gemm=False):
    if conv_gemm:
        # Test the GpuCorrMM version
        mode = theano_mode.including("conv_gemm")
-        cls = cuda.blas.GpuCorrMM
+        cls = cuda.blas.BaseGpuCorrMM
        # dummy version; not used by GpuCorrMM so one version is enough
        version = [-1]
        # Add tests with strided inputs by still square images and filters.
@@ -713,7 +713,7 @@ def test_full(conv_gemm=False):
    if conv_gemm:
        # Test the GpuCorrMM version
        mode = theano_mode.including("conv_gemm")
-        cls = cuda.blas.GpuCorrMM
+        cls = cuda.blas.BaseGpuCorrMM
        # dummy version; not used by GpuCorrMM so one version is enough
        version = [-1]
    else:
@@ -753,7 +753,7 @@ def test_subsample(conv_gemm=False):
    if conv_gemm:
        # Test the GpuCorrMM version
        mode = theano_mode.including("conv_gemm")
-        cls = cuda.blas.GpuCorrMM
+        cls = cuda.blas.BaseGpuCorrMM
        # dummy version; not used by GpuCorrMM so one version is enough
        version_valid = version_full = [-1]
    else: