Merge pull request #6286 from vikramnitin9/grouped_unshared

Implement Unshared Convolution

Merge pull request #6286 from vikramnitin9/grouped_unshared
9592125c · Frédéric Bastien · GitHub · c32b0db8 · 121f96d6 · 9592125c
--- a/theano/gpuarray/blas.py
+++ b/theano/gpuarray/blas.py
--- a/theano/gpuarray/c_code/corr_gemm.c
+++ b/theano/gpuarray/c_code/corr_gemm.c
--- a/theano/gpuarray/dnn.py
+++ b/theano/gpuarray/dnn.py
@@ -3035,6 +3035,9 @@ def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
    if version(raises=False) < 6000 and op.filter_dilation != (1, 1):
        return None
+    if op.unshared:
+        return None
    inp1 = inputs[0]
    inp2 = inputs[1]
@@ -3129,6 +3132,8 @@ def local_abstractconv_cudnn(node):
    ctx = infer_context_name(*node.inputs)
    if not isinstance(node.inputs[0].type, GpuArrayType):
        return
+    if node.op.unshared:
+        return None
    if isinstance(node.op, AbstractConv2d):
        return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
    elif isinstance(node.op, AbstractConv3d):
@@ -3143,6 +3148,8 @@ def local_abstractconv_cudnn_alt(node):
    if version(raises=False) < 6000 and node.op.filter_dilation != (1, 1):
        return None
+    if node.op.unshared:
+        return None
    inp1 = node.inputs[0]
    inp2 = node.inputs[1]
@@ -3349,6 +3356,8 @@ def local_abstractconv_gw_cudnn(node):
    ctx = infer_context_name(*node.inputs)
    if not isinstance(node.inputs[0].type, GpuArrayType):
        return
+    if node.op.unshared:
+        return None
    if isinstance(node.op, AbstractConv2d_gradWeights):
        return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
    elif isinstance(node.op, AbstractConv3d_gradWeights):
@@ -3360,6 +3369,8 @@ def local_abstractconv_gi_cudnn(node):
    ctx = infer_context_name(*node.inputs)
    if not isinstance(node.inputs[0].type, GpuArrayType):
        return
+    if node.op.unshared:
+        return None
    if isinstance(node.op, AbstractConv2d_gradInputs):
        return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
    elif isinstance(node.op, AbstractConv3d_gradInputs):

--- a/theano/gpuarray/opt.py
+++ b/theano/gpuarray/opt.py
@@ -1595,12 +1595,17 @@ def local_abstractconv_gemm(node):
    border_mode = node.op.border_mode
    subsample = node.op.subsample
    filter_dilation = node.op.filter_dilation
+    num_groups = node.op.num_groups
+    unshared = node.op.unshared
-    if ((border_mode == 'full') and (subsample == (1, 1)) and node.op.num_groups == 1):
+    flip = (slice(None),) * (kern.ndim - 2) + \
+        (slice(None, None, -1),) * 2
+    kern_axes = (1, 0) + tuple(i for i in range(2, kern.ndim))
+    if ((border_mode == 'full') and (subsample == (1, 1)) and num_groups == 1 and not unshared):
        if not node.op.filter_flip:
-            kern = kern[:, :, ::-1, ::-1]
+            kern = kern[flip]
        # need to dimshuffle the kernel for full convolution
-        kern = kern.dimshuffle(1, 0, 2, 3)
+        kern = kern.dimshuffle(kern_axes)
        # call GpuCorrMM_gradInputs
        rval = GpuCorrMM_gradInputs('valid',
                                    subsample,
@@ -1609,13 +1614,14 @@ def local_abstractconv_gemm(node):
    else:
        # need to flip the kernel if necessary
        if node.op.filter_flip:
-            kern = kern[:, :, ::-1, ::-1]
+            kern = kern[flip]
        # By default use GpuCorrMM
        rval = GpuCorrMM(border_mode,
                         subsample,
                         filter_dilation,
-                         node.op.num_groups)(gpu_contiguous(img),
+                         num_groups,
-                                             gpu_contiguous(kern))
+                         unshared)(gpu_contiguous(img),
+                                   gpu_contiguous(kern))
        # call GpuCorrMM_gradWeights if good
        # (the latter is faster if batchsize * kernelHeight * kernelWidth
@@ -1628,11 +1634,12 @@ def local_abstractconv_gemm(node):
                (node.op.kshp is not None) and
                (None not in node.op.kshp) and
                border_mode != "half" and
-                node.op.num_groups == 1):
+                num_groups == 1 and
+                not unshared):
            # we know the kernel and output size
-            prod1 = node.op.kshp[0] * node.op.kshp[1]
+            prod1 = node.op.kshp[0] * node.op.kshp[-3]
            prod2 = ((node.op.imshp[-2] - node.op.kshp[0] + 1) *
-                     (node.op.imshp[-1] - node.op.kshp[1] + 1))
+                     (node.op.imshp[-1] - node.op.kshp[-3] + 1))
            if (None not in node.op.imshp[:1]):
                # we also know batchsize and input channels
                prod1 *= node.op.imshp[0]
@@ -1666,13 +1673,19 @@ def local_abstractconv_gemm_def(node):
    border_mode = node.op.border_mode
    subsample = node.op.subsample
    filter_dilation = node.op.filter_dilation
+    num_groups = node.op.num_groups
+    unshared = node.op.unshared
    if node.op.filter_flip:
-        kern = kern[:, :, ::-1, ::-1]
+        flip = (slice(None),) * (kern.ndim - 2) + \
+            (slice(None, None, -1),) * 2
+        kern = kern[flip]
    rval = GpuCorrMM(border_mode,
                     subsample,
                     filter_dilation,
-                     node.op.num_groups)(gpu_contiguous(img),
+                     num_groups,
-                                         gpu_contiguous(kern))
+                     unshared)(gpu_contiguous(img),
+                               gpu_contiguous(kern))
    return [rval]
@@ -1690,8 +1703,9 @@ def local_abstractconv_gemm_alt(node):
    subsample = node.op.subsample
    filter_dilation = node.op.filter_dilation
    num_groups = node.op.num_groups
+    unshared = node.op.unshared
-    if border_mode == 'full' and subsample == (1, 1) and num_groups == 1:
+    if border_mode == 'full' and subsample == (1, 1) and num_groups == 1 and not unshared:
        if not node.op.filter_flip:
            kern = kern[:, :, ::-1, ::-1]
@@ -1702,7 +1716,7 @@ def local_abstractconv_gemm_alt(node):
            gpu_contiguous(kern), gpu_contiguous(img))
    elif (border_mode == 'valid' and subsample == (1, 1) and filter_dilation == (1, 1) and
-          num_groups == 1):
+          num_groups == 1 and not unshared):
        if node.op.filter_flip:
            kern = kern[:, :, ::-1, ::-1]
@@ -1896,10 +1910,13 @@ def local_abstractconv_gradweights_gemm(node):
    rval = GpuCorrMM_gradWeights(border_mode=node.op.border_mode,
                                 subsample=node.op.subsample,
                                 filter_dilation=node.op.filter_dilation,
-                                 num_groups=node.op.num_groups)(
+                                 num_groups=node.op.num_groups,
+                                 unshared=node.op.unshared)(
        gpu_contiguous(img), gpu_contiguous(topgrad), shape)
+    flip = (slice(None),) * (rval.ndim - 2) + \
+        (slice(None, None, -1),) * 2
    if node.op.filter_flip:
-        rval = rval[:, :, ::-1, ::-1]
+        rval = rval[flip]
    rval = tensor.patternbroadcast(rval, node.outputs[0].broadcastable)
    rval = as_gpuarray_variable(rval, context_name=ctx)
    return [rval]
@@ -1918,9 +1935,10 @@ def local_abstractconv_gemm_gradweights_alt(node):
    subsample = node.op.subsample
    filter_dilation = node.op.filter_dilation
    num_groups = node.op.num_groups
+    unshared = node.op.unshared
    if(border_mode == 'valid' and subsample == (1, 1) and filter_dilation == (1, 1) and
-       num_groups == 1):
+       num_groups == 1 and not unshared):
        rval = GpuCorrMM(border_mode,
                         subsample,
                         filter_dilation)(
@@ -2001,12 +2019,15 @@ def local_abstractconv_gradinputs_gemm(node):
        return None
    if node.op.filter_flip:
-        kern = kern[:, :, ::-1, ::-1]
+        flip = (slice(None),) * (kern.ndim - 2) + \
+            (slice(None, None, -1),) * 2
+        kern = kern[flip]
    rval = GpuCorrMM_gradInputs(border_mode=node.op.border_mode,
                                subsample=node.op.subsample,
                                filter_dilation=node.op.filter_dilation,
-                                num_groups=node.op.num_groups)(
+                                num_groups=node.op.num_groups,
+                                unshared=node.op.unshared)(
        gpu_contiguous(kern), gpu_contiguous(topgrad), shape)
    return [rval]
@@ -2023,8 +2044,9 @@ def local_abstractconv_gradinputs_gemm_alt(node):
    subsample = node.op.subsample
    filter_dilation = node.op.filter_dilation
    num_groups = node.op.num_groups
+    unshared = node.op.unshared
-    if border_mode == 'valid' and subsample == (1, 1) and num_groups == 1:
+    if border_mode == 'valid' and subsample == (1, 1) and num_groups == 1 and not unshared:
        if not node.op.filter_flip:
            kern = kern[:, :, ::-1, ::-1]
@@ -2117,8 +2139,9 @@ class ConvMetaOptimizer(LocalMetaOptimizer):
                                         node.op.border_mode,
                                         node.op.subsample,
                                         node.op.filter_dilation)
+            convdim = img.ndim - 2
-            result[kshape] = theano.tensor.as_tensor_variable(node.op.kshp[2:])
+            result[kshape] = theano.tensor.as_tensor_variable(node.op.kshp[-convdim:])
            for(var, shape) in zip((img, top), (node.op.imshp, tshp)):
                result[var] = theano.shared(np.random.random(shape).astype(var.dtype),

--- a/theano/gpuarray/tests/test_gemmcorr.py
+++ b/theano/gpuarray/tests/test_gemmcorr.py
@@ -11,7 +11,7 @@ from theano.tensor.nnet.corr import CorrMM, CorrMM_gradWeights, CorrMM_gradInput
 from ..type import gpuarray_shared_constructor
 from ..blas import GpuCorrMM, GpuCorrMM_gradWeights, GpuCorrMM_gradInputs
 from .config import mode_with_gpu, mode_without_gpu, ref_cast
-from theano.tensor.nnet.tests.test_abstract_conv import Grouped_conv_noOptim
+from theano.tensor.nnet.tests.test_abstract_conv import Grouped_conv_noOptim, TestUnsharedConv
 class TestCorrMM(unittest.TestCase):
@@ -20,9 +20,13 @@ class TestCorrMM(unittest.TestCase):
                       border_mode='valid',
                       filter_dilation=(1, 1),
                       subsample=(1, 1),
+                       unshared=False,
                       verify_grad=False):
        inputs_shape = [inputs_shape[i] for i in (0, 3, 1, 2)]
-        filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)]
+        if unshared:
+            filters_shape = [filters_shape[i] for i in (0, 1, 2, 5, 3, 4)]
+        else:
+            filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)]
        inputs_val = np.random.random(inputs_shape).astype(config.floatX)
        filters_val = np.random.random(filters_shape).astype(config.floatX)
@@ -32,13 +36,15 @@ class TestCorrMM(unittest.TestCase):
        conv_ref = CorrMM(border_mode=border_mode,
                          filter_dilation=filter_dilation,
-                          subsample=subsample)(ref_cast(inputs),
+                          subsample=subsample,
-                                               ref_cast(filters))
+                          unshared=unshared)(ref_cast(inputs),
+                                             ref_cast(filters))
        f_ref = theano.function([], conv_ref, mode=mode_without_gpu)
        conv = GpuCorrMM(border_mode=border_mode,
                         filter_dilation=filter_dilation,
-                         subsample=subsample)(inputs, filters)
+                         subsample=subsample,
+                         unshared=unshared)(inputs, filters)
        f = theano.function([], conv, mode=mode_with_gpu)
        res_ref = f_ref()
@@ -48,7 +54,8 @@ class TestCorrMM(unittest.TestCase):
        if verify_grad:
            utt.verify_grad(GpuCorrMM(border_mode=border_mode,
                                      filter_dilation=filter_dilation,
-                                      subsample=subsample),
+                                      subsample=subsample,
+                                      unshared=unshared),
                            [inputs_val, filters_val], mode=mode_with_gpu)
    def test_valid(self):
@@ -57,12 +64,6 @@ class TestCorrMM(unittest.TestCase):
        self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
                            filters_shape=(10, 6, 12, 1),
                            subsample=(2, 2))
-        self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
-                            filters_shape=(10, 6, 12, 1),
-                            subsample=(2, 2))
-        self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
-                            filters_shape=(10, 6, 12, 1),
-                            subsample=(3, 3))
        self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
                            filters_shape=(10, 6, 12, 1),
                            subsample=(3, 3))
@@ -117,6 +118,41 @@ class TestCorrMM(unittest.TestCase):
                                    border_mode=border_mode,
                                    verify_grad=True)
+    def test_unshared(self):
+        self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
+                            filters_shape=(10, 15, 1, 6, 12, 1),
+                            unshared=True)
+        self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
+                            filters_shape=(10, 8, 1, 6, 12, 1),
+                            subsample=(2, 2), unshared=True)
+        self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
+                            filters_shape=(10, 5, 1, 6, 12, 1),
+                            subsample=(3, 3), unshared=True)
+        self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
+                            filters_shape=(10, 5, 1, 6, 12, 1),
+                            subsample=(3, 2), unshared=True)
+        self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
+                            filters_shape=(10, 15, 1, 6, 12, 1),
+                            subsample=(1, 2), unshared=True)
+        self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
+                            filters_shape=(10, 15, 1, 6, 12, 1),
+                            border_mode='valid', unshared=True)
+        self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
+                            filters_shape=(10, 21, 13, 6, 12, 1),
+                            border_mode='half', unshared=True)
+        self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
+                            filters_shape=(10, 25, 23, 6, 12, 1),
+                            border_mode='full', unshared=True)
+        self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
+                            filters_shape=(10, 15, 1, 6, 12, 1),
+                            border_mode=(0, 0), unshared=True)
+        self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
+                            filters_shape=(10, 17, 5, 6, 12, 1),
+                            border_mode=(1, 2), unshared=True)
+        self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
+                            filters_shape=(10, 21, 5, 6, 12, 1),
+                            border_mode=(3, 2), unshared=True)
    def run_gradweight(self, inputs_shape, filters_shape, dCdH_shape,
                       subsample=(1, 1)):
        inputs_shape = [inputs_shape[i] for i in (0, 3, 1, 2)]
@@ -227,3 +263,12 @@ class TestGroupGpuCorr2d(Grouped_conv_noOptim):
    conv_op = GpuCorrMM
    conv_gradw_op = GpuCorrMM_gradWeights
    conv_gradi_op = GpuCorrMM_gradInputs
+    flip_filter = True
+    is_dnn = False
+class TestUnsharedGpuCorr2d(TestUnsharedConv):
+    mode = mode_with_gpu
+    conv2d_op = GpuCorrMM
+    conv2d_gradw_op = GpuCorrMM_gradWeights
+    conv2d_gradi_op = GpuCorrMM_gradInputs
--- a/theano/tensor/nnet/__init__.py
+++ b/theano/tensor/nnet/__init__.py
@@ -37,7 +37,7 @@ from .abstract_conv import separable_conv2d
 def conv2d(input, filters, input_shape=None, filter_shape=None,
           border_mode='valid', subsample=(1, 1), filter_flip=True,
-           image_shape=None, filter_dilation=(1, 1), num_groups=1, **kwargs):
+           image_shape=None, filter_dilation=(1, 1), num_groups=1, unshared=False, **kwargs):
    """
    This function will build the symbolic graph for convolving a mini-batch of a
    stack of 2D inputs with a set of 2D filters. The implementation is modelled
@@ -51,18 +51,22 @@ def conv2d(input, filters, input_shape=None, filter_shape=None,
        (batch size, input channels, input rows, input columns).
        See the optional parameter ``input_shape``.
-    filters: symbolic 4D tensor
+    filters: symbolic 4D or 6D tensor
        Set of filters used in CNN layer of shape
-        (output channels, input channels, filter rows, filter columns).
+        (output channels, input channels, filter rows, filter columns)
+        for normal convolution and
+        (output channels, output rows, output columns, input channels,
+        filter rows, filter columns)
+        for unshared convolution.
        See the optional parameter ``filter_shape``.
-    input_shape: None, tuple/list of len 4 of int or Constant variable
+    input_shape: None, tuple/list of len 4 or 6 of int or Constant variable
        The shape of the input parameter.
        Optional, possibly used to choose an optimal implementation.
        You can give ``None`` for any element of the list to specify that this
        element is not known at compile time.
-    filter_shape: None, tuple/list of len 4 of int or Constant variable
+    filter_shape: None, tuple/list of len 4 or 6 of int or Constant variable
        The shape of the filters parameter.
        Optional, possibly used to choose an optimal implementation.
        You can give ``None`` for any element of the list to specify that this
@@ -105,6 +109,11 @@ def conv2d(input, filters, input_shape=None, filter_shape=None,
        Divides the image, kernel and output tensors into num_groups
        separate groups, each of which carries out convolutions separately.
+    unshared: bool
+        If true, then unshared or 'locally connected' convolution will be
+        performed. A different filter will be used for each region of the
+        input.
    kwargs: Any other keyword arguments are accepted for backwards
            compatibility, but will be ignored.
@@ -154,12 +163,12 @@ def conv2d(input, filters, input_shape=None, filter_shape=None,
    return abstract_conv2d(input, filters, input_shape, filter_shape,
                           border_mode, subsample, filter_flip,
-                           filter_dilation, num_groups)
+                           filter_dilation, num_groups, unshared)
 def conv2d_transpose(input, filters, output_shape, filter_shape=None,
                     border_mode='valid', input_dilation=(1, 1),
-                     filter_flip=True, filter_dilation=(1, 1), num_groups=1):
+                     filter_flip=True, filter_dilation=(1, 1), num_groups=1, unshared=False):
    """
    This function will build the symbolic graph for applying a transposed
    convolution over a mini-batch of a stack of 2D inputs with a set of 2D
@@ -215,6 +224,12 @@ def conv2d_transpose(input, filters, output_shape, filter_shape=None,
        Divides the image, kernel and output tensors into num_groups
        separate groups, each of which carries out convolutions separately.
+    unshared: bool
+        If true, then unshared or 'locally connected' convolution will be
+        performed. A different filter will be used for each region of the
+        input.
+        Grouped unshared convolution is supported.
    Returns
    -------
    Symbolic 4D tensor
@@ -242,4 +257,5 @@ def conv2d_transpose(input, filters, output_shape, filter_shape=None,
                                  subsample=input_dilation,
                                  filter_flip=filter_flip,
                                  filter_dilation=filter_dilation,
-                                  num_groups=num_groups)
+                                  num_groups=num_groups,
+                                  unshared=unshared)
--- a/theano/tensor/nnet/abstract_conv.py
+++ b/theano/tensor/nnet/abstract_conv.py
--- a/theano/tensor/nnet/c_code/corr_gemm.c
+++ b/theano/tensor/nnet/c_code/corr_gemm.c
--- a/theano/tensor/nnet/corr.py
+++ b/theano/tensor/nnet/corr.py
--- a/theano/tensor/nnet/opt.py
+++ b/theano/tensor/nnet/opt.py
@@ -82,12 +82,14 @@ def local_abstractconv_gemm(node):
    # need to flip the kernel if necessary
    if node.op.filter_flip:
-        kern = kern[:, :, ::-1, ::-1]
+        flip = (slice(None),) * (kern.ndim - 2) + \
+            (slice(None, None, -1),) * 2
+        kern = kern[flip]
    rval = CorrMM(border_mode=node.op.border_mode,
                  subsample=node.op.subsample,
                  filter_dilation=node.op.filter_dilation,
-                  num_groups=node.op.num_groups)(img, kern)
+                  num_groups=node.op.num_groups,
+                  unshared=node.op.unshared)(img, kern)
    copy_stack_trace(node.outputs[0], rval)
    return [rval]
@@ -134,12 +136,15 @@ def local_abstractconv_gradweight_gemm(node):
    rval = CorrMM_gradWeights(border_mode=node.op.border_mode,
                              subsample=node.op.subsample,
                              filter_dilation=node.op.filter_dilation,
-                              num_groups=node.op.num_groups)(img, topgrad, shape)
+                              num_groups=node.op.num_groups,
+                              unshared=node.op.unshared)(img, topgrad, shape)
    copy_stack_trace(node.outputs[0], rval)
    # need to flip the kernel if necessary
    if node.op.filter_flip:
-        rval = rval[:, :, ::-1, ::-1]
+        flip = (slice(None),) * (rval.ndim - 2) + \
+            (slice(None, None, -1),) * 2
+        rval = rval[flip]
    rval = theano.tensor.patternbroadcast(rval, node.outputs[0].broadcastable)
    copy_stack_trace(node.outputs[0], rval)
@@ -189,12 +194,14 @@ def local_abstractconv_gradinputs_gemm(node):
    # need to flip the kernel if necessary
    if node.op.filter_flip:
-        kern = kern[:, :, ::-1, ::-1]
+        flip = (slice(None),) * (kern.ndim - 2) + \
+            (slice(None, None, -1),) * 2
+        kern = kern[flip]
    rval = CorrMM_gradInputs(border_mode=node.op.border_mode,
                             subsample=node.op.subsample,
                             filter_dilation=node.op.filter_dilation,
-                             num_groups=node.op.num_groups)(kern, topgrad,
+                             num_groups=node.op.num_groups,
-                                                            shape)
+                             unshared=node.op.unshared)(kern, topgrad, shape)
    copy_stack_trace(node.outputs[0], rval)
    return [rval]
@@ -242,7 +249,7 @@ def local_conv2d_cpu(node):
    if not node.op.filter_flip:
        # Not tested yet
        return None
-    if node.op.num_groups > 1:
+    if node.op.num_groups > 1 or node.op.unshared:
        return None
    rval = conv2d(img, kern,
@@ -270,7 +277,7 @@ def local_conv2d_gradweight_cpu(node):
    if not node.op.filter_flip:
        # Not tested yet
        return
-    if node.op.num_groups > 1:
+    if node.op.num_groups > 1 or node.op.unshared:
        return None
    if node.op.border_mode == 'valid' and \
@@ -370,7 +377,7 @@ def local_conv2d_gradinputs_cpu(node):
    if not node.op.filter_flip:
        # Not tested yet
        return None
-    if node.op.num_groups > 1:
+    if node.op.num_groups > 1 or node.op.unshared:
        return None
    # Conv 3d implementation, needed when subsample > 2

--- a/theano/tensor/nnet/tests/test_abstract_conv.py
+++ b/theano/tensor/nnet/tests/test_abstract_conv.py
@@ -1744,3 +1744,154 @@ class Separable_conv(unittest.TestCase):
        fun = theano.function([x_sym, dfilter_sym, pfilter_sym], sep_op, mode='FAST_RUN')
        top = fun(x[:, :, :3, :3, :3], depthwise_filter, pointwise_filter)
        utt.assert_allclose(top, precomp_output)
+class TestUnsharedConv(unittest.TestCase):
+    conv2d = theano.tensor.nnet.abstract_conv.AbstractConv2d
+    conv2d_gradw = theano.tensor.nnet.abstract_conv.AbstractConv2d_gradWeights
+    conv2d_gradi = theano.tensor.nnet.abstract_conv.AbstractConv2d_gradInputs
+    conv2d_op = theano.tensor.nnet.abstract_conv.AbstractConv2d
+    conv2d_gradw_op = theano.tensor.nnet.abstract_conv.AbstractConv2d_gradWeights
+    conv2d_gradi_op = theano.tensor.nnet.abstract_conv.AbstractConv2d_gradInputs
+    mode = theano.compile.mode.Mode(optimizer='None')
+    def setUp(self):
+        self.img_shape = [(2, 2, 4, 4), (3, 2, 4, 2), (3, 3, 5, 3), (3, 4, 4, 4)]
+        self.kern_shape = [(2, 2, 2, 2, 3, 3), (2, 4, 2, 2, 4, 2), (3, 2, 1, 1, 3, 3), (4, 3, 3, 2, 4, 2)]
+        self.topgrad_shape = [(2, 2, 2, 2), (3, 2, 4, 2), (3, 3, 2, 1), (3, 4, 3, 3)]
+        self.border_mode = ['valid', 'full', 'valid', 'full']
+        self.subsample = [(1, 1), (2, 2), (2, 1), (3, 2)]
+        self.filter_dilation = (1, 1)
+        self.num_groups = [1, 1, 3, 2]
+        # self.verify_flags = np.random.choice([True, False], 4, [0.5, 0.5])
+        # Above line can be used instead if speed is a concern
+        self.verify_flags = [True] * 4
+        self.ref_mode = 'FAST_RUN'
+        if theano.config.cxx == "":
+            raise SkipTest("CorrMM needs cxx")
+    def test_fwd(self):
+        tensor6 = theano.tensor.TensorType(theano.config.floatX, (False,) * 6)
+        img_sym = theano.tensor.tensor4('img')
+        kern_sym = tensor6('kern')
+        ref_kern_sym = theano.tensor.tensor4('ref_kern')
+        for imshp, kshp, mode, sub, groups, verify in zip(self.img_shape, self.kern_shape, self.border_mode,
+                                                          self.subsample, self.num_groups, self.verify_flags):
+            img = np.random.random(imshp).astype(theano.config.floatX)
+            kern = np.random.random(kshp).astype(theano.config.floatX)
+            unshared_conv_op = self.conv2d(border_mode=mode, subsample=sub,
+                                           filter_dilation=self.filter_dilation,
+                                           num_groups=groups, unshared=True)
+            unshared_out_sym = unshared_conv_op(img_sym, kern_sym)
+            unshared_func = theano.function([img_sym, kern_sym], unshared_out_sym, mode=self.mode)
+            assert any([isinstance(node.op, self.conv2d_op)
+                        for node in unshared_func.maker.fgraph.toposort()])
+            unshared_output = unshared_func(img, kern)
+            single_kshp = kshp[:1] + kshp[3:]
+            ref_conv_op = self.conv2d(border_mode=mode, subsample=sub,
+                                      filter_dilation=self.filter_dilation,
+                                      num_groups=groups, unshared=False)
+            ref_out_sym = ref_conv_op(img_sym, ref_kern_sym)
+            ref_func = theano.function([img_sym, ref_kern_sym], ref_out_sym, mode=self.mode)
+            for i in range(0, kshp[1]):
+                for j in range(0, kshp[2]):
+                    single_kern = kern[:, i, j, ...].reshape(single_kshp)
+                    ref_val = ref_func(img, single_kern)
+                    utt.assert_allclose(ref_val[:, :, i, j], unshared_output[:, :, i, j])
+            if verify:
+                utt.verify_grad(unshared_conv_op, [img, kern], mode=self.mode, eps=1)
+    def test_gradweight(self):
+        img_sym = theano.tensor.tensor4('img')
+        top_sym = theano.tensor.tensor4('top')
+        for imshp, kshp, topshp, mode, sub, groups, verify in zip(self.img_shape, self.kern_shape, self.topgrad_shape,
+                                                                  self.border_mode, self.subsample, self.num_groups,
+                                                                  self.verify_flags):
+            img = np.random.random(imshp).astype(theano.config.floatX)
+            top = np.random.random(topshp).astype(theano.config.floatX)
+            unshared_conv_op = self.conv2d_gradw(border_mode=mode, subsample=sub,
+                                                 filter_dilation=self.filter_dilation,
+                                                 num_groups=groups, unshared=True)
+            unshared_out_sym = unshared_conv_op(img_sym, top_sym, tensor.as_tensor_variable(kshp[-2:]))
+            unshared_func = theano.function([img_sym, top_sym], unshared_out_sym, mode=self.mode)
+            assert any([isinstance(node.op, self.conv2d_gradw_op)
+                        for node in unshared_func.maker.fgraph.toposort()])
+            unshared_output = unshared_func(img, top)
+            single_kshp = kshp[:1] + kshp[3:]
+            ref_conv_op = self.conv2d_gradw(border_mode=mode, subsample=sub,
+                                            filter_dilation=self.filter_dilation,
+                                            num_groups=groups, unshared=False)
+            ref_out_sym = ref_conv_op(img_sym, top_sym, tensor.as_tensor_variable(single_kshp[-2:]))
+            ref_func = theano.function([img_sym, top_sym], ref_out_sym, mode=self.mode)
+            for i in range(0, topshp[2]):
+                for j in range(0, topshp[3]):
+                    top_single = np.zeros_like(top)
+                    top_single[:, :, i, j] = top[:, :, i, j]
+                    ref_output = ref_func(img, top_single)
+                    utt.assert_allclose(unshared_output[:, i, j, ...], ref_output)
+            def conv_gradweight(inputs_val, output_val):
+                return unshared_conv_op(inputs_val, output_val, tensor.as_tensor_variable(kshp[-2:]))
+            if verify:
+                utt.verify_grad(conv_gradweight, [img, top], mode=self.mode, eps=1)
+    def test_gradinput(self):
+        tensor6 = theano.tensor.TensorType(theano.config.floatX, (False,) * 6)
+        kern_sym = tensor6('kern')
+        top_sym = theano.tensor.tensor4('top')
+        ref_kern_sym = theano.tensor.tensor4('ref_kern')
+        for imshp, kshp, topshp, mode, sub, groups, verify in zip(self.img_shape, self.kern_shape, self.topgrad_shape,
+                                                                  self.border_mode, self.subsample, self.num_groups,
+                                                                  self.verify_flags):
+            single_kshp = kshp[:1] + kshp[3:]
+            kern = np.random.random(kshp).astype(theano.config.floatX)
+            top = np.random.random(topshp).astype(theano.config.floatX)
+            unshared_conv_op = self.conv2d_gradi(border_mode=mode, subsample=sub,
+                                                 filter_dilation=self.filter_dilation,
+                                                 num_groups=groups, unshared=True)
+            unshared_out_sym = unshared_conv_op(kern_sym, top_sym, tensor.as_tensor_variable(imshp[-2:]))
+            unshared_func = theano.function([kern_sym, top_sym], unshared_out_sym, mode=self.mode)
+            assert any([isinstance(node.op, self.conv2d_gradi_op)
+                        for node in unshared_func.maker.fgraph.toposort()])
+            unshared_output = unshared_func(kern, top)
+            ref_conv_op = self.conv2d_gradi(border_mode=mode, subsample=sub,
+                                            filter_dilation=self.filter_dilation,
+                                            num_groups=groups, unshared=False)
+            ref_out_sym = ref_conv_op(ref_kern_sym, top_sym, tensor.as_tensor_variable(imshp[-2:]))
+            ref_func = theano.function([ref_kern_sym, top_sym], ref_out_sym, mode=self.mode)
+            ref_output = np.zeros(imshp)
+            for i in range(0, topshp[2]):
+                for j in range(0, topshp[3]):
+                    single_kern = kern[:, i, j, ...].reshape(single_kshp)
+                    top_single = np.zeros_like(top)
+                    top_single[:, :, i, j] = top[:, :, i, j]
+                    ref_output += ref_func(single_kern, top_single)
+            utt.assert_allclose(ref_output, unshared_output)
+            def conv_gradinputs(filters_val, output_val):
+                return unshared_conv_op(filters_val, output_val, tensor.as_tensor_variable(imshp[-2:]))
+            if verify:
+                utt.verify_grad(conv_gradinputs, [kern, top], mode=self.mode, eps=1)
--- a/theano/tensor/nnet/tests/test_corr.py
+++ b/theano/tensor/nnet/tests/test_corr.py
@@ -10,7 +10,7 @@ import theano
 import theano.tensor as T
 from theano.tests import unittest_tools as utt
 from theano.tensor.nnet import corr, conv
-from theano.tensor.nnet.tests.test_abstract_conv import Grouped_conv_noOptim
+from theano.tensor.nnet.tests.test_abstract_conv import Grouped_conv_noOptim, TestUnsharedConv
 class TestCorr2D(utt.InferShapeTester):
@@ -452,6 +452,16 @@ class TestGroupCorr2d(Grouped_conv_noOptim):
        utt.assert_allclose(gconv_output, conv_output)
+class TestUnsharedCorr2d(TestUnsharedConv):
+    if theano.config.mode == "FAST_COMPILE":
+        mode = theano.compile.get_mode("FAST_RUN").excluding('gpuarray')
+    else:
+        mode = None
+    conv2d_op = corr.CorrMM
+    conv2d_gradw_op = corr.CorrMM_gradWeights
+    conv2d_gradi_op = corr.CorrMM_gradInputs
 if __name__ == '__main__':
    t = TestCorr2D('setUp')