提交 691be8f5 authored 作者: Gijs van Tulder's avatar Gijs van Tulder

Add GpuCorrMM and GpuCorr3dMM to gpuarray backend.

上级 146ef971
差异被折叠。
差异被折叠。
差异被折叠。
...@@ -1877,8 +1877,6 @@ def dnn_batch_normalization_test(inputs, gamma, beta, mean, var, ...@@ -1877,8 +1877,6 @@ def dnn_batch_normalization_test(inputs, gamma, beta, mean, var,
return result return result
@register_opt2([AbstractConv2d, AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs], 'fast_compile', 'conv_dnn', 'cudnn')
def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs): def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
if (not isinstance(op, (AbstractConv2d, if (not isinstance(op, (AbstractConv2d,
AbstractConv2d_gradWeights, AbstractConv2d_gradWeights,
...@@ -1922,8 +1920,6 @@ def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs): ...@@ -1922,8 +1920,6 @@ def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
return [rval] return [rval]
@register_opt2([AbstractConv3d, AbstractConv3d_gradWeights,
AbstractConv3d_gradInputs], 'fast_compile', 'conv_dnn', 'cudnn')
def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs): def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs):
if (not isinstance(op, (AbstractConv3d, if (not isinstance(op, (AbstractConv3d,
AbstractConv3d_gradWeights, AbstractConv3d_gradWeights,
...@@ -1967,7 +1963,6 @@ def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs): ...@@ -1967,7 +1963,6 @@ def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs):
return [rval] return [rval]
@register_opt('fast_compile', 'conv_dnn', 'cudnn')
@local_optimizer([AbstractConv2d, AbstractConv3d]) @local_optimizer([AbstractConv2d, AbstractConv3d])
def local_abstractconv_cudnn(node): def local_abstractconv_cudnn(node):
ctx = infer_context_name(*node.inputs) ctx = infer_context_name(*node.inputs)
...@@ -1979,7 +1974,6 @@ def local_abstractconv_cudnn(node): ...@@ -1979,7 +1974,6 @@ def local_abstractconv_cudnn(node):
return local_abstractconv3d_cudnn_graph(node.op, ctx, node.inputs, node.outputs) return local_abstractconv3d_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
@register_opt('fast_compile', 'conv_dnn', 'cudnn')
@local_optimizer([AbstractConv2d_gradWeights, AbstractConv3d_gradWeights]) @local_optimizer([AbstractConv2d_gradWeights, AbstractConv3d_gradWeights])
def local_abstractconv_gw_cudnn(node): def local_abstractconv_gw_cudnn(node):
ctx = infer_context_name(*node.inputs) ctx = infer_context_name(*node.inputs)
...@@ -1991,7 +1985,6 @@ def local_abstractconv_gw_cudnn(node): ...@@ -1991,7 +1985,6 @@ def local_abstractconv_gw_cudnn(node):
return local_abstractconv3d_cudnn_graph(node.op, ctx, node.inputs, node.outputs) return local_abstractconv3d_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
@register_opt('fast_compile', 'conv_dnn', 'cudnn')
@local_optimizer([AbstractConv2d_gradInputs, AbstractConv3d_gradInputs]) @local_optimizer([AbstractConv2d_gradInputs, AbstractConv3d_gradInputs])
def local_abstractconv_gi_cudnn(node): def local_abstractconv_gi_cudnn(node):
ctx = infer_context_name(*node.inputs) ctx = infer_context_name(*node.inputs)
......
差异被折叠。
...@@ -7,6 +7,9 @@ import numpy ...@@ -7,6 +7,9 @@ import numpy
from theano.tensor.nnet.tests import test_abstract_conv from theano.tensor.nnet.tests import test_abstract_conv
from ..type import GpuArrayType, gpuarray_shared_constructor, get_context from ..type import GpuArrayType, gpuarray_shared_constructor, get_context
from ..dnn import dnn_available, GpuDnnConv, GpuDnnConvGradW, GpuDnnConvGradI from ..dnn import dnn_available, GpuDnnConv, GpuDnnConvGradW, GpuDnnConvGradI
from ..blas import (
GpuCorrMM, GpuCorrMM_gradWeights, GpuCorrMM_gradInputs,
GpuCorr3dMM, GpuCorr3dMM_gradWeights, GpuCorr3dMM_gradInputs)
from .config import mode_with_gpu, test_ctx_name from .config import mode_with_gpu, test_ctx_name
from pygpu import gpuarray from pygpu import gpuarray
...@@ -80,6 +83,72 @@ class TestDnnConv3d(test_abstract_conv.BaseTestConv3d): ...@@ -80,6 +83,72 @@ class TestDnnConv3d(test_abstract_conv.BaseTestConv3d):
filter_flip=flip, target_op=GpuDnnConvGradI) filter_flip=flip, target_op=GpuDnnConvGradI)
class TestCorrMMConv2d(test_abstract_conv.BaseTestConv2d):
    """Run the abstract-conv 2d test battery against the GpuCorrMM ops."""

    @classmethod
    def setup_class(cls):
        test_abstract_conv.BaseTestConv2d.setup_class()
        cls.shared = staticmethod(gpuarray_shared_constructor)
        # Exclude cudnn so the corrMM optimizations are the ones applied.
        cls.mode = mode_with_gpu.excluding('cudnn')

    def tcase(self, i, f, s, b, flip, provide_shape, fd=(1, 1)):
        """Check forward, grad-weights and grad-inputs for one configuration."""
        out_shape = self.get_output_shape(i, f, s, b, fd)
        # Keyword arguments shared by all three sub-checks.
        common = dict(inputs_shape=i, filters_shape=f,
                      verify_grad=True, mode=self.mode,
                      provide_shape=provide_shape, border_mode=b,
                      filter_flip=flip, filter_dilation=fd)
        self.run_fwd(subsample=s,
                     target_op=(GpuCorrMM,
                                GpuCorrMM_gradWeights,
                                GpuCorrMM_gradInputs),
                     **common)
        self.run_gradweight(output_shape=out_shape, subsample=s,
                            target_op=GpuCorrMM_gradWeights,
                            **common)
        self.run_gradinput(output_shape=out_shape, subsample=s,
                           target_op=GpuCorrMM_gradInputs,
                           **common)
class TestCorrMMConv3d(test_abstract_conv.BaseTestConv3d):
    """Run the abstract-conv 3d test battery against the GpuCorr3dMM ops."""

    @classmethod
    def setup_class(cls):
        test_abstract_conv.BaseTestConv3d.setup_class()
        cls.shared = staticmethod(gpuarray_shared_constructor)
        # Exclude cudnn so the corrMM optimizations are the ones applied.
        cls.mode = mode_with_gpu.excluding('cudnn')

    def tcase(self, i, f, s, b, flip, provide_shape, fd=(1, 1, 1)):
        """Check forward, grad-weights and grad-inputs for one configuration."""
        out_shape = self.get_output_shape(i, f, s, b, fd)
        # Keyword arguments shared by all three sub-checks.
        common = dict(inputs_shape=i, filters_shape=f,
                      verify_grad=True, mode=self.mode,
                      provide_shape=provide_shape, border_mode=b,
                      filter_flip=flip, filter_dilation=fd)
        self.run_fwd(subsample=s,
                     target_op=(GpuCorr3dMM,
                                GpuCorr3dMM_gradWeights,
                                GpuCorr3dMM_gradInputs),
                     **common)
        self.run_gradweight(output_shape=out_shape, subsample=s,
                            target_op=GpuCorr3dMM_gradWeights,
                            **common)
        self.run_gradinput(output_shape=out_shape, subsample=s,
                           target_op=GpuCorr3dMM_gradInputs,
                           **common)
class TestDnnConvTypes(test_abstract_conv.TestConvTypes): class TestDnnConvTypes(test_abstract_conv.TestConvTypes):
def setUp(self): def setUp(self):
self.input = gpu_ftensor4() self.input = gpu_ftensor4()
......
from __future__ import absolute_import, print_function, division
import unittest
import numpy
import theano
from theano.tests import unittest_tools as utt
from theano.tensor.nnet.corr import CorrMM, CorrMM_gradWeights, CorrMM_gradInputs
from ..type import gpuarray_shared_constructor
from ..blas import GpuCorrMM, GpuCorrMM_gradWeights, GpuCorrMM_gradInputs
from .config import mode_with_gpu, mode_without_gpu
class TestCorrMM(unittest.TestCase):
    """Compare GpuCorrMM and its gradient ops against the CPU CorrMM
    reference implementation for various shapes, border modes, subsampling
    and filter-dilation settings.

    Shape arguments are given in (batch, rows, cols, channels) order and
    transposed internally to the (batch, channels, rows, cols) layout that
    the ops expect.
    """

    def run_conv_valid(self, inputs_shape, filters_shape,
                       border_mode='valid',
                       filter_dilation=(1, 1),
                       subsample=(1, 1),
                       verify_grad=False):
        """Run one forward convolution on GPU and CPU and compare outputs.

        If ``verify_grad`` is True, also numerically check the gradient of
        the GPU op.
        """
        inputs_shape = [inputs_shape[i] for i in (0, 3, 1, 2)]
        filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)]

        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        filters_val = numpy.random.random(filters_shape).astype('float32')

        inputs = gpuarray_shared_constructor(inputs_val)
        filters = gpuarray_shared_constructor(filters_val)

        # CPU reference implementation.
        conv_ref = CorrMM(border_mode=border_mode,
                          filter_dilation=filter_dilation,
                          subsample=subsample)(inputs, filters)
        f_ref = theano.function([], conv_ref, mode=mode_without_gpu)
        # GPU implementation under test.
        conv = GpuCorrMM(border_mode=border_mode,
                         filter_dilation=filter_dilation,
                         subsample=subsample)(inputs, filters)
        f = theano.function([], conv, mode=mode_with_gpu)

        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res)
        if verify_grad:
            utt.verify_grad(GpuCorrMM(border_mode=border_mode,
                                      filter_dilation=filter_dilation,
                                      subsample=subsample),
                            [inputs_val, filters_val])

    def test_valid(self):
        # One case per distinct subsampling setting; the original list
        # repeated (2, 2) and (3, 3) twice, which only duplicated work.
        for subsample in [(1, 1), (2, 2), (3, 3), (3, 2), (1, 2)]:
            self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
                                filters_shape=(10, 6, 12, 1),
                                subsample=subsample)

    def test_border_mode(self):
        # Cover the symbolic modes and a few explicit padding tuples.
        for border_mode in ['valid', 'half', 'full', (0, 0), (1, 2), (3, 2)]:
            self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
                                filters_shape=(10, 6, 12, 1),
                                border_mode=border_mode)

    def test_filter_dilation(self):
        inputs_shape = [16, 20, 12, 1]
        filters_shape = [10, 6, 5, 1]
        for filter_dilation in [(2, 1), (1, 2)]:
            for border_mode in ['valid', 'half', 'full']:
                self.run_conv_valid(inputs_shape=inputs_shape,
                                    filters_shape=filters_shape,
                                    filter_dilation=filter_dilation,
                                    border_mode=border_mode)

    def test_verify_gradients(self):
        # Use a small example to keep the numeric gradient check fast.
        inputs_shape = [2, 7, 9, 1]
        filters_shape = [1, 3, 3, 1]
        for filter_dilation in [(2, 1), (1, 2)]:
            for border_mode in ['valid', 'half', 'full', (2, 1)]:
                self.run_conv_valid(inputs_shape=inputs_shape,
                                    filters_shape=filters_shape,
                                    filter_dilation=filter_dilation,
                                    border_mode=border_mode,
                                    verify_grad=True)

    def run_gradweight(self, inputs_shape, filters_shape, dCdH_shape,
                       subsample=(1, 1)):
        """Compare the gradient w.r.t. the weights on GPU and CPU."""
        inputs_shape = [inputs_shape[i] for i in (0, 3, 1, 2)]
        filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)]
        dCdH_shape = [dCdH_shape[i] for i in (0, 3, 1, 2)]

        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        dCdH_val = numpy.random.random(dCdH_shape).astype('float32')

        inputs = gpuarray_shared_constructor(inputs_val)
        dCdH = gpuarray_shared_constructor(dCdH_val)

        if subsample == (1, 1):
            conv_ref = CorrMM_gradWeights(subsample=subsample)(
                inputs, dCdH)
            conv_gemm = GpuCorrMM_gradWeights(subsample=subsample)(
                inputs, dCdH)
        else:
            # With subsampling the filter shape is ambiguous, so it must be
            # supplied explicitly.  Only allocate the shared variable on
            # this branch; the original built it unconditionally.
            shape = gpuarray_shared_constructor(
                numpy.array(filters_shape[2:]))
            conv_ref = CorrMM_gradWeights(subsample=subsample)(
                inputs, dCdH, shape=shape)
            conv_gemm = GpuCorrMM_gradWeights(subsample=subsample)(
                inputs, dCdH, shape=shape)
        f_ref = theano.function([], conv_ref, mode=mode_without_gpu)
        f = theano.function([], conv_gemm, mode=mode_with_gpu)

        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res)

    def test_gradweight(self):
        self.run_gradweight(inputs_shape=(16, 10, 12, 1),
                            filters_shape=(10, 6, 12, 1),
                            dCdH_shape=(16, 5, 1, 10),
                            subsample=(1, 1))
        self.run_gradweight(inputs_shape=(16, 20, 10, 1),
                            filters_shape=(10, 6, 4, 1),
                            dCdH_shape=(16, 8, 4, 10),
                            subsample=(2, 2))
        self.run_gradweight(inputs_shape=(16, 20, 10, 1),
                            filters_shape=(10, 6, 3, 1),
                            dCdH_shape=(16, 5, 3, 10),
                            subsample=(3, 3))
        self.run_gradweight(inputs_shape=(16, 20, 12, 1),
                            filters_shape=(10, 6, 12, 1),
                            dCdH_shape=(16, 8, 1, 10),
                            subsample=(2, 1))

    def run_gradinput(self, inputs_shape, filters_shape,
                      subsample=(1, 1)):
        """Compare the gradient w.r.t. the inputs on GPU and CPU."""
        inputs_shape = [inputs_shape[i] for i in (0, 3, 1, 2)]
        filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)]

        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        filters_val = numpy.random.random(filters_shape).astype('float32')

        inputs = gpuarray_shared_constructor(inputs_val)
        filters = gpuarray_shared_constructor(filters_val)

        if subsample == (1, 1):
            conv_ref = CorrMM_gradInputs(subsample=subsample)(
                kern=filters, topgrad=inputs)
            conv_gemm = GpuCorrMM_gradInputs(subsample=subsample)(
                kern=filters, topgrad=inputs)
        else:
            # With subsampling the bottom (input) shape is ambiguous, so
            # reconstruct it and supply it explicitly.  Only allocate the
            # shared variable on this branch; the original built it
            # unconditionally.
            bottom_height = ((inputs_shape[2] - 1) * subsample[0] +
                             filters_shape[2])
            bottom_width = ((inputs_shape[3] - 1) * subsample[1] +
                            filters_shape[3])
            bottom_shape = gpuarray_shared_constructor(
                numpy.array([bottom_height, bottom_width]))
            conv_ref = CorrMM_gradInputs(subsample=subsample)(
                kern=filters, topgrad=inputs, shape=bottom_shape)
            conv_gemm = GpuCorrMM_gradInputs(subsample=subsample)(
                kern=filters, topgrad=inputs, shape=bottom_shape)
        f_ref = theano.function([], conv_ref, mode=mode_without_gpu)
        f = theano.function([], conv_gemm, mode=mode_with_gpu)

        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res)

    def test_gradinput(self):
        self.run_gradinput(inputs_shape=(16, 15, 12, 10),
                           filters_shape=(10, 6, 12, 1))
        self.run_gradinput(inputs_shape=(16, 15, 12, 10),
                           filters_shape=(10, 6, 12, 1),
                           subsample=(2, 2))
        self.run_gradinput(inputs_shape=(16, 15, 12, 10),
                           filters_shape=(10, 6, 12, 1),
                           subsample=(3, 3))
        self.run_gradinput(inputs_shape=(16, 15, 12, 10),
                           filters_shape=(10, 6, 12, 1),
                           subsample=(3, 1))
from __future__ import absolute_import, print_function, division
import unittest
import numpy
import theano
from theano.tests import unittest_tools as utt
from theano.tensor.nnet.corr3d import Corr3dMM, Corr3dMM_gradWeights, Corr3dMM_gradInputs
from ..type import gpuarray_shared_constructor
from ..blas import GpuCorr3dMM, GpuCorr3dMM_gradWeights, GpuCorr3dMM_gradInputs
from .config import mode_with_gpu, mode_without_gpu
class TestCorr3dMM(unittest.TestCase):
    """Compare GpuCorr3dMM and its gradient ops against the CPU Corr3dMM
    reference implementation for various shapes, border modes, subsampling
    and filter-dilation settings.

    Shape arguments are given in (batch, d1, d2, d3, channels) order and
    transposed internally to the (batch, channels, d1, d2, d3) layout that
    the ops expect.
    """

    def run_conv_valid(self, inputs_shape, filters_shape,
                       border_mode='valid',
                       filter_dilation=(1, 1, 1),
                       subsample=(1, 1, 1),
                       verify_grad=False):
        """Run one forward 3d convolution on GPU and CPU and compare outputs.

        If ``verify_grad`` is True, also numerically check the gradient of
        the GPU op.
        """
        inputs_shape = [inputs_shape[i] for i in (0, 4, 1, 2, 3)]
        filters_shape = [filters_shape[i] for i in (0, 4, 1, 2, 3)]

        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        filters_val = numpy.random.random(filters_shape).astype('float32')

        inputs = gpuarray_shared_constructor(inputs_val)
        filters = gpuarray_shared_constructor(filters_val)

        # CPU reference implementation.
        conv_ref = Corr3dMM(border_mode=border_mode,
                            filter_dilation=filter_dilation,
                            subsample=subsample)(inputs, filters)
        f_ref = theano.function([], conv_ref, mode=mode_without_gpu)
        # GPU implementation under test.
        conv = GpuCorr3dMM(border_mode=border_mode,
                           filter_dilation=filter_dilation,
                           subsample=subsample)(inputs, filters)
        f = theano.function([], conv, mode=mode_with_gpu)

        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res)
        if verify_grad:
            utt.verify_grad(GpuCorr3dMM(border_mode=border_mode,
                                        filter_dilation=filter_dilation,
                                        subsample=subsample),
                            [inputs_val, filters_val])

    def test_valid(self):
        self.run_conv_valid(inputs_shape=(16, 20, 12, 16, 1),
                            filters_shape=(10, 6, 12, 4, 1))
        # One case per distinct subsampling setting; the original list
        # repeated (2, 2, 2) and (3, 3, 3) twice, which only duplicated
        # work.
        for subsample in [(2, 2, 2), (3, 3, 3), (3, 2, 1), (1, 2, 3)]:
            self.run_conv_valid(inputs_shape=(16, 20, 12, 15, 1),
                                filters_shape=(10, 6, 12, 4, 1),
                                subsample=subsample)

    def test_border_mode(self):
        # Cover the symbolic modes and a few explicit padding tuples.
        for border_mode in ['valid', 'half', 'full',
                            (0, 0, 0), (1, 2, 3), (3, 2, 1)]:
            self.run_conv_valid(inputs_shape=(16, 20, 12, 15, 1),
                                filters_shape=(10, 6, 12, 4, 1),
                                border_mode=border_mode)

    def test_filter_dilation(self):
        inputs_shape = [16, 20, 12, 15, 1]
        filters_shape = [10, 6, 5, 4, 1]
        for filter_dilation in [(2, 1, 1), (1, 2, 1), (1, 1, 2)]:
            for border_mode in ['valid', 'half', 'full']:
                self.run_conv_valid(inputs_shape=inputs_shape,
                                    filters_shape=filters_shape,
                                    filter_dilation=filter_dilation,
                                    border_mode=border_mode)

    def test_verify_gradients(self):
        # Use a small example to keep the numeric gradient check fast.
        inputs_shape = [2, 7, 9, 6, 1]
        filters_shape = [1, 3, 3, 2, 1]
        for filter_dilation in [(2, 1, 1), (1, 2, 1), (1, 1, 2)]:
            for border_mode in ['valid', 'half', 'full', (2, 1, 3)]:
                self.run_conv_valid(inputs_shape=inputs_shape,
                                    filters_shape=filters_shape,
                                    filter_dilation=filter_dilation,
                                    border_mode=border_mode,
                                    verify_grad=True)

    def run_gradweight(self, inputs_shape, filters_shape, dCdH_shape,
                       subsample=(1, 1, 1)):
        """Compare the gradient w.r.t. the weights on GPU and CPU."""
        inputs_shape = [inputs_shape[i] for i in (0, 4, 1, 2, 3)]
        filters_shape = [filters_shape[i] for i in (0, 4, 1, 2, 3)]
        dCdH_shape = [dCdH_shape[i] for i in (0, 4, 1, 2, 3)]

        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        dCdH_val = numpy.random.random(dCdH_shape).astype('float32')

        inputs = gpuarray_shared_constructor(inputs_val)
        dCdH = gpuarray_shared_constructor(dCdH_val)

        if subsample == (1, 1, 1):
            conv_ref = Corr3dMM_gradWeights(subsample=subsample)(
                inputs, dCdH)
            conv_gemm = GpuCorr3dMM_gradWeights(subsample=subsample)(
                inputs, dCdH)
        else:
            # With subsampling the filter shape is ambiguous, so it must be
            # supplied explicitly.  Only allocate the shared variable on
            # this branch; the original built it unconditionally.
            shape = gpuarray_shared_constructor(
                numpy.array(filters_shape[2:]))
            conv_ref = Corr3dMM_gradWeights(subsample=subsample)(
                inputs, dCdH, shape=shape)
            conv_gemm = GpuCorr3dMM_gradWeights(subsample=subsample)(
                inputs, dCdH, shape=shape)
        f_ref = theano.function([], conv_ref, mode=mode_without_gpu)
        f = theano.function([], conv_gemm, mode=mode_with_gpu)

        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res)

    def test_gradweight(self):
        self.run_gradweight(inputs_shape=(16, 10, 12, 16, 1),
                            filters_shape=(10, 6, 12, 4, 1),
                            dCdH_shape=(16, 5, 1, 13, 10),
                            subsample=(1, 1, 1))
        self.run_gradweight(inputs_shape=(16, 20, 10, 16, 1),
                            filters_shape=(10, 6, 4, 4, 1),
                            dCdH_shape=(16, 8, 4, 7, 10),
                            subsample=(2, 2, 2))
        self.run_gradweight(inputs_shape=(16, 20, 10, 16, 1),
                            filters_shape=(10, 6, 3, 4, 1),
                            dCdH_shape=(16, 5, 3, 5, 10),
                            subsample=(3, 3, 3))
        self.run_gradweight(inputs_shape=(16, 20, 12, 16, 1),
                            filters_shape=(10, 6, 12, 4, 1),
                            dCdH_shape=(16, 8, 1, 5, 10),
                            subsample=(2, 1, 3))

    def run_gradinput(self, inputs_shape, filters_shape,
                      subsample=(1, 1, 1)):
        """Compare the gradient w.r.t. the inputs on GPU and CPU."""
        inputs_shape = [inputs_shape[i] for i in (0, 4, 1, 2, 3)]
        filters_shape = [filters_shape[i] for i in (0, 4, 1, 2, 3)]

        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        filters_val = numpy.random.random(filters_shape).astype('float32')

        inputs = gpuarray_shared_constructor(inputs_val)
        filters = gpuarray_shared_constructor(filters_val)

        if subsample == (1, 1, 1):
            conv_ref = Corr3dMM_gradInputs(subsample=subsample)(
                kern=filters, topgrad=inputs)
            conv_gemm = GpuCorr3dMM_gradInputs(subsample=subsample)(
                kern=filters, topgrad=inputs)
        else:
            # With subsampling the bottom (input) shape is ambiguous, so
            # reconstruct it and supply it explicitly.  Only allocate the
            # shared variable on this branch; the original built it
            # unconditionally.
            bottom_height = ((inputs_shape[2] - 1) * subsample[0] +
                             filters_shape[2])
            bottom_width = ((inputs_shape[3] - 1) * subsample[1] +
                            filters_shape[3])
            bottom_depth = ((inputs_shape[4] - 1) * subsample[2] +
                            filters_shape[4])
            bottom_shape = gpuarray_shared_constructor(
                numpy.array([bottom_height, bottom_width, bottom_depth]))
            conv_ref = Corr3dMM_gradInputs(subsample=subsample)(
                kern=filters, topgrad=inputs, shape=bottom_shape)
            conv_gemm = GpuCorr3dMM_gradInputs(subsample=subsample)(
                kern=filters, topgrad=inputs, shape=bottom_shape)
        f_ref = theano.function([], conv_ref, mode=mode_without_gpu)
        f = theano.function([], conv_gemm, mode=mode_with_gpu)

        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res)

    def test_gradinput(self):
        self.run_gradinput(inputs_shape=(16, 15, 12, 12, 10),
                           filters_shape=(10, 6, 12, 4, 1))
        self.run_gradinput(inputs_shape=(16, 15, 12, 12, 10),
                           filters_shape=(10, 6, 12, 4, 1),
                           subsample=(2, 2, 2))
        self.run_gradinput(inputs_shape=(16, 15, 12, 12, 10),
                           filters_shape=(10, 6, 12, 4, 1),
                           subsample=(3, 3, 3))
        self.run_gradinput(inputs_shape=(16, 15, 12, 12, 10),
                           filters_shape=(10, 6, 12, 4, 1),
                           subsample=(3, 1, 2))
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论