提交 58c3d08e authored 作者: Vikram's avatar Vikram

Unshared convolution python code

Errors fixed. Suggestions implemented. Minor changes Minor changes Gradient calculation added. Test for forward added. Errors fixed Grad wrt weights done. Grad wrt inputs incomplete Grad inp Fix typo Tests and bug fix for Grad Inp Modified flops to raise error Mostly convdim agnostic. Cleaner code for gradInput Some corr changes MiChecks for convdim=2 added. Some more misc changes Unshared code moved into one func Re-added unshared flag to get_conv_output_shape Simpler grad inputs. Unshared removeded from get_conv_output_shape. C code changes in corr.py wdim bug fix opt and abstract_conv changes CPU code for fwd and gradWeights. Added tests. Some errors gemv increment fixed. Values for fwd still don't match Forward perfect. Gradweights inverts regions; to be corrected. Added grad inputs and tests but allclose error Python gradInputs simplified Grad input fixed gradweights flipping problem solved Weight dimension order changed. C cache version updated. Docstring changes if unshared is True -> if unshared. Specific error messages for unshared in C code. Unshared tests integrated with AbstractConv. Subsampling errors fixed. Allclose errors with optimiser enabled Kern flip in optimiser fixed. Still some errors Errors fixed GPU corr_gemm code (untested) Unnecessary changes rolled back More GPU code but gemm error 11 Fixed mistakes caused while copying from CPU Errors fixed Fixed error with .data for gpuarray GPU tests Suggestions implemented for error messages Jenkins errors fixed Commits squashed Small errors fixed. Tests need to be rewritten Tests moved to separate class. Mistakes fixed Tests sped up Suggestions implemented. Tests modified
上级 078bdfb1
差异被折叠。
......@@ -3035,6 +3035,9 @@ def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
if version(raises=False) < 6000 and op.filter_dilation != (1, 1):
return None
if op.unshared:
return None
inp1 = inputs[0]
inp2 = inputs[1]
......@@ -3129,6 +3132,8 @@ def local_abstractconv_cudnn(node):
ctx = infer_context_name(*node.inputs)
if not isinstance(node.inputs[0].type, GpuArrayType):
return
if node.op.unshared:
return None
if isinstance(node.op, AbstractConv2d):
return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
elif isinstance(node.op, AbstractConv3d):
......@@ -3143,6 +3148,8 @@ def local_abstractconv_cudnn_alt(node):
if version(raises=False) < 6000 and node.op.filter_dilation != (1, 1):
return None
if node.op.unshared:
return None
inp1 = node.inputs[0]
inp2 = node.inputs[1]
......@@ -3349,6 +3356,8 @@ def local_abstractconv_gw_cudnn(node):
ctx = infer_context_name(*node.inputs)
if not isinstance(node.inputs[0].type, GpuArrayType):
return
if node.op.unshared:
return None
if isinstance(node.op, AbstractConv2d_gradWeights):
return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
elif isinstance(node.op, AbstractConv3d_gradWeights):
......@@ -3360,6 +3369,8 @@ def local_abstractconv_gi_cudnn(node):
ctx = infer_context_name(*node.inputs)
if not isinstance(node.inputs[0].type, GpuArrayType):
return
if node.op.unshared:
return None
if isinstance(node.op, AbstractConv2d_gradInputs):
return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
elif isinstance(node.op, AbstractConv3d_gradInputs):
......
......@@ -1595,12 +1595,17 @@ def local_abstractconv_gemm(node):
border_mode = node.op.border_mode
subsample = node.op.subsample
filter_dilation = node.op.filter_dilation
num_groups = node.op.num_groups
unshared = node.op.unshared
if ((border_mode == 'full') and (subsample == (1, 1)) and node.op.num_groups == 1):
flip = (slice(None),) * (kern.ndim - 2) + \
(slice(None, None, -1),) * 2
kern_axes = (1, 0) + tuple(i for i in range(2, kern.ndim))
if ((border_mode == 'full') and (subsample == (1, 1)) and num_groups == 1 and not unshared):
if not node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1]
kern = kern[flip]
# need to dimshuffle the kernel for full convolution
kern = kern.dimshuffle(1, 0, 2, 3)
kern = kern.dimshuffle(kern_axes)
# call GpuCorrMM_gradInputs
rval = GpuCorrMM_gradInputs('valid',
subsample,
......@@ -1609,13 +1614,14 @@ def local_abstractconv_gemm(node):
else:
# need to flip the kernel if necessary
if node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1]
kern = kern[flip]
# By default use GpuCorrMM
rval = GpuCorrMM(border_mode,
subsample,
filter_dilation,
node.op.num_groups)(gpu_contiguous(img),
gpu_contiguous(kern))
num_groups,
unshared)(gpu_contiguous(img),
gpu_contiguous(kern))
# call GpuCorrMM_gradWeights if good
# (the latter is faster if batchsize * kernelHeight * kernelWidth
......@@ -1628,11 +1634,12 @@ def local_abstractconv_gemm(node):
(node.op.kshp is not None) and
(None not in node.op.kshp) and
border_mode != "half" and
node.op.num_groups == 1):
num_groups == 1 and
not unshared):
# we know the kernel and output size
prod1 = node.op.kshp[0] * node.op.kshp[1]
prod1 = node.op.kshp[0] * node.op.kshp[-3]
prod2 = ((node.op.imshp[-2] - node.op.kshp[0] + 1) *
(node.op.imshp[-1] - node.op.kshp[1] + 1))
(node.op.imshp[-1] - node.op.kshp[-3] + 1))
if (None not in node.op.imshp[:1]):
# we also know batchsize and input channels
prod1 *= node.op.imshp[0]
......@@ -1641,7 +1648,8 @@ def local_abstractconv_gemm(node):
if prod1 > prod2:
rval = GpuCorrMM_gradWeights(border_mode,
subsample,
filter_dilation)(
filter_dilation,
unshared)(
gpu_contiguous(img.dimshuffle(1, 0, 2, 3)),
gpu_contiguous(kern.dimshuffle(1, 0, 2, 3)))
# (we need to wrap the result in as_gpuarray_variable,
......@@ -1690,8 +1698,9 @@ def local_abstractconv_gemm_alt(node):
subsample = node.op.subsample
filter_dilation = node.op.filter_dilation
num_groups = node.op.num_groups
unshared = node.op.unshared
if border_mode == 'full' and subsample == (1, 1) and num_groups == 1:
if border_mode == 'full' and subsample == (1, 1) and num_groups == 1 and not unshared:
if not node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1]
......@@ -1702,7 +1711,7 @@ def local_abstractconv_gemm_alt(node):
gpu_contiguous(kern), gpu_contiguous(img))
elif (border_mode == 'valid' and subsample == (1, 1) and filter_dilation == (1, 1) and
num_groups == 1):
num_groups == 1 and not unshared):
if node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1]
......@@ -1896,10 +1905,13 @@ def local_abstractconv_gradweights_gemm(node):
rval = GpuCorrMM_gradWeights(border_mode=node.op.border_mode,
subsample=node.op.subsample,
filter_dilation=node.op.filter_dilation,
num_groups=node.op.num_groups)(
num_groups=node.op.num_groups,
unshared=node.op.unshared)(
gpu_contiguous(img), gpu_contiguous(topgrad), shape)
flip = (slice(None),) * (rval.ndim - 2) + \
(slice(None, None, -1),) * 2
if node.op.filter_flip:
rval = rval[:, :, ::-1, ::-1]
rval = rval[flip]
rval = tensor.patternbroadcast(rval, node.outputs[0].broadcastable)
rval = as_gpuarray_variable(rval, context_name=ctx)
return [rval]
......@@ -1918,9 +1930,10 @@ def local_abstractconv_gemm_gradweights_alt(node):
subsample = node.op.subsample
filter_dilation = node.op.filter_dilation
num_groups = node.op.num_groups
unshared = node.op.unshared
if(border_mode == 'valid' and subsample == (1, 1) and filter_dilation == (1, 1) and
num_groups == 1):
num_groups == 1 and not unshared):
rval = GpuCorrMM(border_mode,
subsample,
filter_dilation)(
......@@ -2001,12 +2014,15 @@ def local_abstractconv_gradinputs_gemm(node):
return None
if node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1]
flip = (slice(None),) * (kern.ndim - 2) + \
(slice(None, None, -1),) * 2
kern = kern[flip]
rval = GpuCorrMM_gradInputs(border_mode=node.op.border_mode,
subsample=node.op.subsample,
filter_dilation=node.op.filter_dilation,
num_groups=node.op.num_groups)(
num_groups=node.op.num_groups,
unshared=node.op.unshared)(
gpu_contiguous(kern), gpu_contiguous(topgrad), shape)
return [rval]
......@@ -2023,8 +2039,9 @@ def local_abstractconv_gradinputs_gemm_alt(node):
subsample = node.op.subsample
filter_dilation = node.op.filter_dilation
num_groups = node.op.num_groups
unshared = node.op.unshared
if border_mode == 'valid' and subsample == (1, 1) and num_groups == 1:
if border_mode == 'valid' and subsample == (1, 1) and num_groups == 1 and not unshared:
if not node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1]
......
......@@ -8,10 +8,10 @@ from theano.tests import unittest_tools as utt
from theano.tensor.nnet.corr import CorrMM, CorrMM_gradWeights, CorrMM_gradInputs
from ..type import gpuarray_shared_constructor
from ..blas import GpuCorrMM, GpuCorrMM_gradWeights, GpuCorrMM_gradInputs
from .config import mode_with_gpu, mode_without_gpu, ref_cast
from theano.tensor.nnet.tests.test_abstract_conv import Grouped_conv_noOptim
from theano.gpuarray.type import gpuarray_shared_constructor
from theano.gpuarray.blas import GpuCorrMM, GpuCorrMM_gradWeights, GpuCorrMM_gradInputs
from config import mode_with_gpu, mode_without_gpu, ref_cast
from theano.tensor.nnet.tests.test_abstract_conv import Grouped_conv_noOptim, TestUnsharedConv
class TestCorrMM(unittest.TestCase):
......@@ -20,9 +20,13 @@ class TestCorrMM(unittest.TestCase):
border_mode='valid',
filter_dilation=(1, 1),
subsample=(1, 1),
unshared=False,
verify_grad=False):
inputs_shape = [inputs_shape[i] for i in (0, 3, 1, 2)]
filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)]
if unshared:
filters_shape = [filters_shape[i] for i in (0, 1, 2, 5, 3, 4)]
else:
filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)]
inputs_val = np.random.random(inputs_shape).astype(config.floatX)
filters_val = np.random.random(filters_shape).astype(config.floatX)
......@@ -32,13 +36,15 @@ class TestCorrMM(unittest.TestCase):
conv_ref = CorrMM(border_mode=border_mode,
filter_dilation=filter_dilation,
subsample=subsample)(ref_cast(inputs),
ref_cast(filters))
subsample=subsample,
unshared=unshared)(ref_cast(inputs),
ref_cast(filters))
f_ref = theano.function([], conv_ref, mode=mode_without_gpu)
conv = GpuCorrMM(border_mode=border_mode,
filter_dilation=filter_dilation,
subsample=subsample)(inputs, filters)
subsample=subsample,
unshared=unshared)(inputs, filters)
f = theano.function([], conv, mode=mode_with_gpu)
res_ref = f_ref()
......@@ -48,7 +54,8 @@ class TestCorrMM(unittest.TestCase):
if verify_grad:
utt.verify_grad(GpuCorrMM(border_mode=border_mode,
filter_dilation=filter_dilation,
subsample=subsample),
subsample=subsample,
unshared=unshared),
[inputs_val, filters_val], mode=mode_with_gpu)
def test_valid(self):
......@@ -57,12 +64,6 @@ class TestCorrMM(unittest.TestCase):
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 6, 12, 1),
subsample=(2, 2))
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 6, 12, 1),
subsample=(2, 2))
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 6, 12, 1),
subsample=(3, 3))
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 6, 12, 1),
subsample=(3, 3))
......@@ -117,6 +118,41 @@ class TestCorrMM(unittest.TestCase):
border_mode=border_mode,
verify_grad=True)
def test_unshared(self):
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 15, 1, 6, 12, 1),
unshared=True)
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 8, 1, 6, 12, 1),
subsample=(2, 2), unshared=True)
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 5, 1, 6, 12, 1),
subsample=(3, 3), unshared=True)
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 5, 1, 6, 12, 1),
subsample=(3, 2), unshared=True)
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 15, 1, 6, 12, 1),
subsample=(1, 2), unshared=True)
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 15, 1, 6, 12, 1),
border_mode='valid', unshared=True)
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 21, 13, 6, 12, 1),
border_mode='half', unshared=True)
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 25, 23, 6, 12, 1),
border_mode='full', unshared=True)
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 15, 1, 6, 12, 1),
border_mode=(0, 0), unshared=True)
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 17, 5, 6, 12, 1),
border_mode=(1, 2), unshared=True)
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 21, 5, 6, 12, 1),
border_mode=(3, 2), unshared=True)
def run_gradweight(self, inputs_shape, filters_shape, dCdH_shape,
subsample=(1, 1)):
inputs_shape = [inputs_shape[i] for i in (0, 3, 1, 2)]
......@@ -227,3 +263,17 @@ class TestGroupGpuCorr2d(Grouped_conv_noOptim):
conv_op = GpuCorrMM
conv_gradw_op = GpuCorrMM_gradWeights
conv_gradi_op = GpuCorrMM_gradInputs
flip_filter = True
is_dnn = False
class TestUnsharedGpuCorr2d(TestUnsharedConv):
    """Run the unshared-convolution test suite against the GPU GEMM-based
    correlation ops (GpuCorrMM and its gradient ops)."""

    # NOTE(review): TestUnsharedConv.setUp assigns self.mode, which shadows
    # this class attribute on instances -- confirm a setUp override exists or
    # that this attribute is read elsewhere.
    mode = theano.compile.get_mode("FAST_RUN")

    # Ops used to build the graphs under test.
    conv2d = GpuCorrMM
    conv2d_gradw = GpuCorrMM_gradWeights
    conv2d_gradi = GpuCorrMM_gradInputs
    # Op classes expected to appear in the compiled graph.
    conv2d_op = GpuCorrMM
    conv2d_gradw_op = GpuCorrMM_gradWeights
    conv2d_gradi_op = GpuCorrMM_gradInputs

    flip_filter = True
    is_dnn = False
......@@ -37,7 +37,7 @@ from .abstract_conv import separable_conv2d
def conv2d(input, filters, input_shape=None, filter_shape=None,
border_mode='valid', subsample=(1, 1), filter_flip=True,
image_shape=None, filter_dilation=(1, 1), num_groups=1, **kwargs):
image_shape=None, filter_dilation=(1, 1), num_groups=1, unshared=False, **kwargs):
"""
This function will build the symbolic graph for convolving a mini-batch of a
stack of 2D inputs with a set of 2D filters. The implementation is modelled
......@@ -51,18 +51,22 @@ def conv2d(input, filters, input_shape=None, filter_shape=None,
(batch size, input channels, input rows, input columns).
See the optional parameter ``input_shape``.
filters: symbolic 4D tensor
filters: symbolic 4D or 6D tensor
Set of filters used in CNN layer of shape
(output channels, input channels, filter rows, filter columns).
(output channels, input channels, filter rows, filter columns)
for normal convolution and
(output channels, output rows, output columns, input channels,
filter rows, filter columns)
for unshared convolution.
See the optional parameter ``filter_shape``.
input_shape: None, tuple/list of len 4 of int or Constant variable
input_shape: None, tuple/list of len 4 or 6 of int or Constant variable
The shape of the input parameter.
Optional, possibly used to choose an optimal implementation.
You can give ``None`` for any element of the list to specify that this
element is not known at compile time.
filter_shape: None, tuple/list of len 4 of int or Constant variable
filter_shape: None, tuple/list of len 4 or 6 of int or Constant variable
The shape of the filters parameter.
Optional, possibly used to choose an optimal implementation.
You can give ``None`` for any element of the list to specify that this
......@@ -105,6 +109,11 @@ def conv2d(input, filters, input_shape=None, filter_shape=None,
Divides the image, kernel and output tensors into num_groups
separate groups, each of which carries out convolutions separately
unshared: bool
If true, then unshared or 'locally connected' convolution will be
performed. A different kernel will be used for each region of the
input.
kwargs: Any other keyword arguments are accepted for backwards
compatibility, but will be ignored.
......@@ -154,12 +163,12 @@ def conv2d(input, filters, input_shape=None, filter_shape=None,
return abstract_conv2d(input, filters, input_shape, filter_shape,
border_mode, subsample, filter_flip,
filter_dilation, num_groups)
filter_dilation, num_groups, unshared)
def conv2d_transpose(input, filters, output_shape, filter_shape=None,
border_mode='valid', input_dilation=(1, 1),
filter_flip=True, filter_dilation=(1, 1), num_groups=1):
filter_flip=True, filter_dilation=(1, 1), num_groups=1, unshared=False):
"""
This function will build the symbolic graph for applying a transposed
convolution over a mini-batch of a stack of 2D inputs with a set of 2D
......@@ -215,6 +224,11 @@ def conv2d_transpose(input, filters, output_shape, filter_shape=None,
Divides the image, kernel and output tensors into num_groups
separate groups, each of which carries out convolutions separately
unshared: bool
If true, then unshared or 'locally connected' convolution will be
performed. A different kernel will be used for each region of the
input.
Returns
-------
Symbolic 4D tensor
......@@ -242,4 +256,5 @@ def conv2d_transpose(input, filters, output_shape, filter_shape=None,
subsample=input_dilation,
filter_flip=filter_flip,
filter_dilation=filter_dilation,
num_groups=num_groups)
num_groups=num_groups,
unshared=unshared)
差异被折叠。
......@@ -82,12 +82,14 @@ def local_abstractconv_gemm(node):
# need to flip the kernel if necessary
if node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1]
flip = (slice(None),) * (kern.ndim - 2) + \
(slice(None, None, -1),) * 2
kern = kern[flip]
rval = CorrMM(border_mode=node.op.border_mode,
subsample=node.op.subsample,
filter_dilation=node.op.filter_dilation,
num_groups=node.op.num_groups)(img, kern)
num_groups=node.op.num_groups,
unshared=node.op.unshared)(img, kern)
copy_stack_trace(node.outputs[0], rval)
return [rval]
......@@ -134,12 +136,15 @@ def local_abstractconv_gradweight_gemm(node):
rval = CorrMM_gradWeights(border_mode=node.op.border_mode,
subsample=node.op.subsample,
filter_dilation=node.op.filter_dilation,
num_groups=node.op.num_groups)(img, topgrad, shape)
num_groups=node.op.num_groups,
unshared=node.op.unshared)(img, topgrad, shape)
copy_stack_trace(node.outputs[0], rval)
# need to flip the kernel if necessary
if node.op.filter_flip:
rval = rval[:, :, ::-1, ::-1]
flip = (slice(None),) * (rval.ndim - 2) + \
(slice(None, None, -1),) * 2
rval = rval[flip]
rval = theano.tensor.patternbroadcast(rval, node.outputs[0].broadcastable)
copy_stack_trace(node.outputs[0], rval)
......@@ -189,12 +194,14 @@ def local_abstractconv_gradinputs_gemm(node):
# need to flip the kernel if necessary
if node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1]
flip = (slice(None),) * (kern.ndim - 2) + \
(slice(None, None, -1),) * 2
kern = kern[flip]
rval = CorrMM_gradInputs(border_mode=node.op.border_mode,
subsample=node.op.subsample,
filter_dilation=node.op.filter_dilation,
num_groups=node.op.num_groups)(kern, topgrad,
shape)
num_groups=node.op.num_groups,
unshared=node.op.unshared)(kern, topgrad, shape)
copy_stack_trace(node.outputs[0], rval)
return [rval]
......@@ -242,7 +249,7 @@ def local_conv2d_cpu(node):
if not node.op.filter_flip:
# Not tested yet
return None
if node.op.num_groups > 1:
if node.op.num_groups > 1 or node.op.unshared:
return None
rval = conv2d(img, kern,
......@@ -270,7 +277,7 @@ def local_conv2d_gradweight_cpu(node):
if not node.op.filter_flip:
# Not tested yet
return
if node.op.num_groups > 1:
if node.op.num_groups > 1 or node.op.unshared:
return None
if node.op.border_mode == 'valid' and \
......@@ -370,7 +377,7 @@ def local_conv2d_gradinputs_cpu(node):
if not node.op.filter_flip:
# Not tested yet
return None
if node.op.num_groups > 1:
if node.op.num_groups > 1 or node.op.unshared:
return None
# Conv 3d implementation, needed when subsample > 2
......
......@@ -1744,3 +1744,146 @@ class Separable_conv(unittest.TestCase):
fun = theano.function([x_sym, dfilter_sym, pfilter_sym], sep_op, mode='FAST_RUN')
top = fun(x[:, :, :3, :3, :3], depthwise_filter, pointwise_filter)
utt.assert_allclose(top, precomp_output)
class TestUnsharedConv(unittest.TestCase):
    """Test unshared ('locally connected') 2D convolution against the
    shared-kernel reference, for the forward pass and both gradients.

    Subclasses override the ``conv2d*`` attributes to exercise a concrete
    implementation (CPU CorrMM, GPU GpuCorrMM); here the abstract ops are
    tested with the optimizer disabled.
    """

    # Ops used to build the graphs under test.
    conv2d = theano.tensor.nnet.abstract_conv.AbstractConv2d
    conv2d_gradw = theano.tensor.nnet.abstract_conv.AbstractConv2d_gradWeights
    conv2d_gradi = theano.tensor.nnet.abstract_conv.AbstractConv2d_gradInputs
    # Op classes expected to appear in the compiled graph (same as above for
    # the abstract case; subclasses may substitute the lowered op classes).
    conv2d_op = theano.tensor.nnet.abstract_conv.AbstractConv2d
    conv2d_gradw_op = theano.tensor.nnet.abstract_conv.AbstractConv2d_gradWeights
    conv2d_gradi_op = theano.tensor.nnet.abstract_conv.AbstractConv2d_gradInputs

    def setUp(self):
        """Define four test cases as parallel lists; index i of each list
        belongs to case i."""
        self.mode = theano.compile.mode.Mode(optimizer='None')
        # Image shapes: (batch, input channels, rows, cols).
        self.img_shape = [(2, 1, 4, 4), (1, 2, 4, 2), (1, 3, 5, 3), (1, 4, 4, 4)]
        # Unshared kernels are 6D: the two extra axes (positions 1 and 2)
        # index the output location each kernel slice is dedicated to.
        self.kern_shape = [(2, 2, 2, 1, 3, 3), (2, 4, 2, 2, 4, 2), (3, 2, 1, 1, 3, 3), (4, 3, 3, 2, 4, 2)]
        self.topgrad_shape = [(2, 2, 2, 2), (1, 2, 4, 2), (1, 3, 2, 1), (1, 4, 3, 3)]
        self.border_mode = ['valid', 'full', 'valid', 'full']
        self.subsample = [(1, 1), (2, 2), (2, 1), (3, 2)]
        self.filter_dilation = (1, 1)
        self.num_groups = [1, 1, 3, 2]
        # NOTE(review): the third positional argument of np.random.choice is
        # `replace`, not `p`; as written [1.0, 0.0] is treated as a truthy
        # `replace` value and the flags are drawn uniformly. If probabilities
        # were intended this should read p=[1.0, 0.0] -- confirm intent.
        self.verify_flags = np.random.choice([True, False], 4, [1.0, 0.0])
        self.ref_mode = 'FAST_RUN'
        if theano.config.cxx == "":
            raise SkipTest("CorrMM needs cxx")

    def test_fwd(self):
        """Forward pass: every output location of the unshared conv must equal
        a shared conv evaluated with that location's dedicated kernel slice."""
        tensor6 = theano.tensor.TensorType(theano.config.floatX, (False,) * 6)
        img_sym = theano.tensor.tensor4('img')
        kern_sym = tensor6('kern')
        ref_kern_sym = theano.tensor.tensor4('ref_kern')

        for imshp, kshp, mode, sub, groups, verify in zip(self.img_shape, self.kern_shape, self.border_mode,
                                                          self.subsample, self.num_groups, self.verify_flags):
            img = np.random.random(imshp).astype(theano.config.floatX)
            kern = np.random.random(kshp).astype(theano.config.floatX)

            unshared_conv_op = self.conv2d(border_mode=mode, subsample=sub,
                                           filter_dilation=self.filter_dilation,
                                           num_groups=groups, unshared=True)
            unshared_out_sym = unshared_conv_op(img_sym, kern_sym)
            unshared_func = theano.function([img_sym, kern_sym], unshared_out_sym, mode=self.mode)
            # The compiled graph must actually contain the op under test.
            assert any([isinstance(node.op, self.conv2d_op)
                        for node in unshared_func.maker.fgraph.toposort()])
            unshared_output = unshared_func(img, kern)

            # 4D kernel shape with the two output-location axes dropped.
            single_kshp = kshp[:1] + kshp[3:]

            ref_conv_op = self.conv2d(border_mode=mode, subsample=sub,
                                      filter_dilation=self.filter_dilation,
                                      num_groups=groups, unshared=False)
            ref_out_sym = ref_conv_op(img_sym, ref_kern_sym)
            ref_func = theano.function([img_sym, ref_kern_sym], ref_out_sym, mode=self.mode)

            # Compare location (i, j) of the unshared output with a shared
            # conv run on that location's kernel slice.
            for i in range(0, kshp[1]):
                for j in range(0, kshp[2]):
                    single_kern = kern[:, i, j, ...].reshape(single_kshp)
                    ref_val = ref_func(img, single_kern)
                    utt.assert_allclose(ref_val[:, :, i, j], unshared_output[:, :, i, j])

            if verify:
                utt.verify_grad(unshared_conv_op, [img, kern], mode=self.mode, eps=1)

    def test_gradweight(self):
        """Gradient w.r.t. weights: the unshared grad at output location
        (i, j) must equal the shared grad computed from a topgrad that is
        zero everywhere except at (i, j)."""
        img_sym = theano.tensor.tensor4('img')
        top_sym = theano.tensor.tensor4('top')

        for imshp, kshp, topshp, mode, sub, groups, verify in zip(self.img_shape, self.kern_shape, self.topgrad_shape,
                                                                  self.border_mode, self.subsample, self.num_groups,
                                                                  self.verify_flags):
            img = np.random.random(imshp).astype(theano.config.floatX)
            top = np.random.random(topshp).astype(theano.config.floatX)

            unshared_conv_op = self.conv2d_gradw(border_mode=mode, subsample=sub,
                                                 filter_dilation=self.filter_dilation,
                                                 num_groups=groups, unshared=True)
            unshared_out_sym = unshared_conv_op(img_sym, top_sym, tensor.as_tensor_variable(kshp[-2:]))
            unshared_func = theano.function([img_sym, top_sym], unshared_out_sym, mode=self.mode)
            # The compiled graph must actually contain the op under test.
            assert any([isinstance(node.op, self.conv2d_gradw_op)
                        for node in unshared_func.maker.fgraph.toposort()])
            unshared_output = unshared_func(img, top)

            # 4D kernel shape with the two output-location axes dropped.
            single_kshp = kshp[:1] + kshp[3:]

            ref_conv_op = self.conv2d_gradw(border_mode=mode, subsample=sub,
                                            filter_dilation=self.filter_dilation,
                                            num_groups=groups, unshared=False)
            ref_out_sym = ref_conv_op(img_sym, top_sym, tensor.as_tensor_variable(single_kshp[-2:]))
            ref_func = theano.function([img_sym, top_sym], ref_out_sym, mode=self.mode)

            # Mask the topgrad down to a single output location, so only
            # that location's kernel slice receives gradient.
            for i in range(0, topshp[2]):
                for j in range(0, topshp[3]):
                    top_single = np.zeros_like(top)
                    top_single[:, :, i, j] = top[:, :, i, j]
                    ref_output = ref_func(img, top_single)
                    utt.assert_allclose(unshared_output[:, i, j, ...], ref_output)

            def conv_gradweight(inputs_val, output_val):
                # Wrapper fixing the kernel-shape argument so verify_grad
                # sees a two-input callable.
                return unshared_conv_op(inputs_val, output_val, tensor.as_tensor_variable(kshp[-2:]))

            if verify:
                utt.verify_grad(conv_gradweight, [img, top], mode=self.mode, eps=1)

    def test_gradinput(self):
        """Gradient w.r.t. inputs: with every output location sharing one
        tiled kernel, the unshared grad must match the shared grad."""
        tensor6 = theano.tensor.TensorType(theano.config.floatX, (False,) * 6)
        kern_sym = tensor6('kern')
        top_sym = theano.tensor.tensor4('top')
        ref_kern_sym = theano.tensor.tensor4('ref_kern')

        for imshp, kshp, topshp, mode, sub, groups, verify in zip(self.img_shape, self.kern_shape, self.topgrad_shape,
                                                                  self.border_mode, self.subsample, self.num_groups,
                                                                  self.verify_flags):
            # Build a 6D kernel by tiling one 4D kernel over the two
            # output-location axes, so shared and unshared ops agree.
            single_kshp = kshp[:1] + kshp[3:]
            single_kern = np.random.random(single_kshp).astype(theano.config.floatX)
            kern = single_kern.reshape((kshp[:1] + (1, 1) + kshp[3:]))
            kern = np.tile(kern, (1, kshp[1], kshp[2], 1, 1, 1))
            top = np.random.random(topshp).astype(theano.config.floatX)

            unshared_conv_op = self.conv2d_gradi(border_mode=mode, subsample=sub,
                                                 filter_dilation=self.filter_dilation,
                                                 num_groups=groups, unshared=True)
            unshared_out_sym = unshared_conv_op(kern_sym, top_sym, tensor.as_tensor_variable(imshp[-2:]))
            unshared_func = theano.function([kern_sym, top_sym], unshared_out_sym, mode=self.mode)
            # The compiled graph must actually contain the op under test.
            assert any([isinstance(node.op, self.conv2d_gradi_op)
                        for node in unshared_func.maker.fgraph.toposort()])
            unshared_output = unshared_func(kern, top)

            ref_conv_op = self.conv2d_gradi(border_mode=mode, subsample=sub,
                                            filter_dilation=self.filter_dilation,
                                            num_groups=groups, unshared=False)
            ref_out_sym = ref_conv_op(ref_kern_sym, top_sym, tensor.as_tensor_variable(imshp[-2:]))
            ref_func = theano.function([ref_kern_sym, top_sym], ref_out_sym, mode=self.mode)
            ref_output = ref_func(single_kern, top)

            utt.assert_allclose(ref_output, unshared_output)

            def conv_gradinputs(filters_val, output_val):
                # Wrapper fixing the image-shape argument so verify_grad
                # sees a two-input callable.
                return unshared_conv_op(filters_val, output_val, tensor.as_tensor_variable(imshp[-2:]))

            if verify:
                utt.verify_grad(conv_gradinputs, [kern, top], mode=self.mode, eps=1)
......@@ -10,7 +10,7 @@ import theano
import theano.tensor as T
from theano.tests import unittest_tools as utt
from theano.tensor.nnet import corr, conv
from theano.tensor.nnet.tests.test_abstract_conv import Grouped_conv_noOptim
from theano.tensor.nnet.tests.test_abstract_conv import Grouped_conv_noOptim, TestUnsharedConv
class TestCorr2D(utt.InferShapeTester):
......@@ -452,6 +452,21 @@ class TestGroupCorr2d(Grouped_conv_noOptim):
utt.assert_allclose(gconv_output, conv_output)
class TestUnsharedCorr2D(TestUnsharedConv):
    """Run the unshared-convolution test suite against the CPU CorrMM ops."""

    # When the global mode is FAST_COMPILE, force FAST_RUN without the GPU
    # backend; otherwise fall back to the default (None).
    # NOTE(review): TestUnsharedConv.setUp assigns self.mode, which shadows
    # this class attribute on instances -- confirm a setUp override exists or
    # that this attribute is read elsewhere.
    if theano.config.mode == "FAST_COMPILE":
        mode = theano.compile.get_mode("FAST_RUN").excluding('gpuarray')
    else:
        mode = None

    # Ops used to build the graphs under test.
    conv2d = corr.CorrMM
    conv2d_gradw = corr.CorrMM_gradWeights
    conv2d_gradi = corr.CorrMM_gradInputs
    # Op classes expected to appear in the compiled graph.
    conv2d_op = corr.CorrMM
    conv2d_gradw_op = corr.CorrMM_gradWeights
    conv2d_gradi_op = corr.CorrMM_gradInputs

    flip_filter = True
    is_dnn = False
if __name__ == '__main__':
t = TestCorr2D('setUp')
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论