Unshared convolution python code

Errors fixed. Suggestions implemented. Minor changes Minor changes Gradient calculation added. Test for forward added. Errors fixed Grad wrt weights done. Grad wrt inputs incomplete Grad inp Fix typo Tests and bug fix for Grad Inp Modified flops to raise error Mostly convdim agnostic. Cleaner code for gradInput Some corr changes MiChecks for convdim=2 added. Some more misc changes Unshared code moved into one func Re-added unshared flag to get_conv_output_shape Simpler grad inputs. Unshared removed from get_conv_output_shape. C code changes in corr.py wdim bug fix opt and abstract_conv changes CPU code for fwd and gradWeights. Added tests. Some errors gemv increment fixed. Values for fwd still don't match Forward perfect. Gradweights inverts regions; to be corrected. Added grad inputs and tests but allclose error Python gradInputs simplified Grad input fixed gradweights flipping problem solved Weight dimension order changed. C cache version updated. Docstring changes if unshared is True -> if unshared. Specific error messages for unshared in C code. Unshared tests integrated with AbstractConv. Subsampling errors fixed. Allclose errors with optimiser enabled Kern flip in optimiser fixed. Still some errors Errors fixed GPU corr_gemm code (untested) Unnecessary changes rolled back More GPU code but gemm error 11 Fixed mistakes caused while copying from CPU Errors fixed Fixed error with .data for gpuarray GPU tests Suggestions implemented for error messages Jenkins errors fixed Commits squashed Small errors fixed. Tests need to be rewritten Tests moved to separate class. Mistakes fixed Tests sped up Suggestions implemented. Tests modified

Unshared convolution python code
58c3d08e · Vikram · 078bdfb1 · 58c3d08e · 58c3d08e · 58c3d08e
--- a/theano/gpuarray/blas.py
+++ b/theano/gpuarray/blas.py
@@ -458,13 +458,15 @@ class BaseGpuCorrMM(CGpuKernelBase):
    num_groups :
        Divides the image, kernel and output tensors into num_groups
        separate groups, each of which carries out convolutions separately (default : 1).
+    unshared
+        Perform unshared correlation (default: False)
    """
    check_broadcast = False
-    __props__ = ('border_mode', 'subsample', 'filter_dilation', 'num_groups')
+    __props__ = ('border_mode', 'subsample', 'filter_dilation', 'num_groups', 'unshared')
    _f16_ok = True
    def __init__(self, border_mode="valid", subsample=(1, 1),
-                 filter_dilation=(1, 1), num_groups=1):
+                 filter_dilation=(1, 1), num_groups=1, unshared=False):
        if isinstance(border_mode, integer_types):
            border_mode = (border_mode, border_mode)
        if isinstance(border_mode, tuple):
@@ -487,6 +489,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
            raise ValueError("Number of groups should be greater than 0")
        self.num_groups = num_groups
        CGpuKernelBase.__init__(self, ['c_code/corr_gemm.c'])
+        self.unshared = unshared
    @property
    def pad(self):
@@ -495,12 +498,13 @@ class BaseGpuCorrMM(CGpuKernelBase):
        return (0, 0)
    def __str__(self):
-        return '%s{%s, %s, %s, %s}' % (
+        return '%s{%s, %s, %s, %s, %s}' % (
            self.__class__.__name__,
            self.border_mode,
            str(self.subsample),
            str(self.filter_dilation),
-            str(self.num_groups))
+            str(self.num_groups),
+            str(self.unshared))
    def __setstate__(self, d):
        self.__dict__.update(d)
@@ -533,7 +537,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
    def c_code_cache_version(self):
        # Raise this whenever modifying the C code (including the file).
-        return (10,)
+        return (11,)
    def c_code_helper(self, bottom, weights, top, direction, sub, height=None, width=None):
        """
@@ -581,6 +585,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
        dH, dW = self.subsample
        dilH, dilW = self.filter_dilation
        numgroups = self.num_groups
+        unshared = int(self.unshared)
        if self.border_mode == "half":
            padH = padW = -1
        elif self.border_mode == "full":
@@ -633,19 +638,24 @@ class BaseGpuCorrMM(CGpuKernelBase):
    int padH = %(padH)s;
    int padW = %(padW)s;
    int numgroups = %(numgroups)s;
+    int unshared = %(unshared)s;
    PyGpuArrayObject * bottom = %(bottom)s;
    PyGpuArrayObject * weights = %(weights)s;
    PyGpuArrayObject * top = %(top)s;
    PyGpuArrayObject * out2 = NULL;
+    int wdim, odim;
+    wdim = unshared ? 6 : 4;
+    odim = 4; //Can be set to 6 later for unshared backprop wrt weights
    // Obtain or infer kernel width and height
    // (we need to know it early to be able to handle auto-padding)
    size_t kH, kW, dil_kH, dil_kW;
    if (direction != 1) {
        // weight is an input variable, we can just read its shape
-        kH = PyGpuArray_DIMS(weights)[2];
+        kH = PyGpuArray_DIMS(weights)[wdim-2];
-        kW = PyGpuArray_DIMS(weights)[3];
+        kW = PyGpuArray_DIMS(weights)[wdim-1];
    }
    else {
        if (%(height)s != -1) {
@@ -699,8 +709,10 @@ class BaseGpuCorrMM(CGpuKernelBase):
    // Infer output shape and type
    // The inferred shape can be negative.
-    long long out_dim[4];
+    long long out_dim[6];
-    size_t out_dim_size[4];
+    size_t out_dim_size[6];
+    out_dim[4] = out_dim[5] = 0; //Only used for unshared backprop wrt weights
+    out_dim_size[4] = out_dim_size[5] = 0; //Same
    int out_typecode;
    PyGpuContextObject *out_context;
    switch(direction) {
@@ -709,71 +721,131 @@ class BaseGpuCorrMM(CGpuKernelBase):
        // height and width: top = (bottom + 2*pad - ((weight-1)*dil + 1)) / sample + 1
        out_dim[0] = PyGpuArray_DIMS(bottom)[0];
        out_dim[1] = PyGpuArray_DIMS(weights)[0];
-        out_dim[2] = (PyGpuArray_DIMS(bottom)[2] + 2*padH - ((PyGpuArray_DIMS(weights)[2]-1)*dilH + 1)) / dH + 1;
+        out_dim[2] = (PyGpuArray_DIMS(bottom)[2] + 2*padH - ((PyGpuArray_DIMS(weights)[wdim-2]-1)*dilH + 1)) / dH + 1;
-        out_dim[3] = (PyGpuArray_DIMS(bottom)[3] + 2*padW - ((PyGpuArray_DIMS(weights)[3]-1)*dilW + 1)) / dW + 1;
+        out_dim[3] = (PyGpuArray_DIMS(bottom)[3] + 2*padW - ((PyGpuArray_DIMS(weights)[wdim-1]-1)*dilW + 1)) / dW + 1;
        out_typecode = bottom->ga.typecode;
        out_context = bottom->context;
        if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
        {
-            PyErr_Format(PyExc_ValueError,
+            if (unshared) {
-                         "GpuCorrMM: impossible output shape\\n"
+                PyErr_Format(PyExc_ValueError,
-                         "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
+                             "GpuCorrMM: impossible output shape\\n"
-                         "  weights shape: %%ld x %%ld x %%ld x %%ld\\n"
+                             "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
-                         "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
+                             "  weights shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n"
-                         PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1],
+                             "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
-                         PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3],
+                             PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1],
-                         PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1],
+                             PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3],
-                         PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3],
+                             PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1],
-                         out_dim[0], out_dim[1], out_dim[2], out_dim[3]);
+                             PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3],
-            %(fail)s
+                             PyGpuArray_DIMS(weights)[4], PyGpuArray_DIMS(weights)[5],
+                             out_dim[0], out_dim[1], out_dim[2], out_dim[3]);
+                %(fail)s
+            }
+            else {
+                PyErr_Format(PyExc_ValueError,
+                             "GpuCorrMM: impossible output shape\\n"
+                             "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
+                             "  weights shape: %%ld x %%ld x %%ld x %%ld\\n"
+                             "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
+                             PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1],
+                             PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3],
+                             PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1],
+                             PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3],
+                             out_dim[0], out_dim[1], out_dim[2], out_dim[3]);
+                %(fail)s
+            }
        }
        break;
    case 1:  // backprop wrt. weights
-        // output is weights: (num_filters, num_channels, height, width)
+        // output is weights: (num_filters, num_channels, height, width) or
+        // (num_filters, top_height, top_width, num_channels, height, width) -> for unshared
        // height and width: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1
        out_dim[0] = PyGpuArray_DIMS(top)[1];
-        out_dim[1] = PyGpuArray_DIMS(bottom)[1] / numgroups;
+        if (unshared){
-        out_dim[2] = kH;  // already inferred further above
+            odim = 6;
-        out_dim[3] = kW;  // how convenient
+            out_dim[1] = PyGpuArray_DIMS(top)[2];
+            out_dim[2] = PyGpuArray_DIMS(top)[3];
+        }
+        out_dim[wdim-3] = PyGpuArray_DIMS(bottom)[1] / numgroups;
+        out_dim[wdim-2] = kH;  // already inferred further above
+        out_dim[wdim-1] = kW;  // how convenient
        out_typecode = top->ga.typecode;
        out_context = top->context;
-        if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
+        if (unshared) {
-        {
+            if (out_dim[0] < 0 || out_dim[1] <= 0 || out_dim[2] <= 0 || out_dim[3] < 0
-            PyErr_Format(PyExc_ValueError,
+                    || out_dim[4] <= 0 || out_dim[5] <= 0){
-                         "GpuCorrMM backprop wrt. weights: impossible output shape\\n"
+                PyErr_Format(PyExc_ValueError,
-                         "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
+                             "GpuCorrMM backprop wrt. weights: impossible output shape\\n"
-                         "  weights shape: %%ld x %%ld x %%ld x %%ld\\n"
+                             "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
-                         "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
+                             "  weights shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n"
-                         PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1],
+                             "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
-                         PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3],
+                             PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1],
-                         out_dim[0], out_dim[1], out_dim[2], out_dim[3],
+                             PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3],
-                         PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
+                             out_dim[0], out_dim[1], out_dim[2], out_dim[3],
-                         PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3]);
+                             out_dim[4], out_dim[5],
-            %(fail)s
+                             PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
+                             PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3]);
+                %(fail)s
+            }
+        }
+        else {
+             if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
+            {
+                PyErr_Format(PyExc_ValueError,
+                             "GpuCorrMM backprop wrt. weights: impossible output shape\\n"
+                             "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
+                             "  weights shape: %%ld x %%ld x %%ld x %%ld\\n"
+                             "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
+                             PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1],
+                             PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3],
+                             out_dim[0], out_dim[1], out_dim[2], out_dim[3],
+                             PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
+                             PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3]);
+                %(fail)s
+            }
        }
        break;
    case 2:  // backprop wrt. inputs
        // output is bottom: (batchsize, num_channels, height, width)
        // height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
        out_dim[0] = PyGpuArray_DIMS(top)[0];
-        out_dim[1] = PyGpuArray_DIMS(weights)[1] * numgroups;
+        out_dim[1] = PyGpuArray_DIMS(weights)[wdim-3] * numgroups;
-        out_dim[2] = (%(height)s != -1) ? %(height)s : (PyGpuArray_DIMS(top)[2] - 1) * dH + (PyGpuArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH;
+        out_dim[2] = (%(height)s != -1) ? %(height)s : (PyGpuArray_DIMS(top)[2] - 1) * dH + (PyGpuArray_DIMS(weights)[wdim-2]-1)*dilH + 1 - 2*padH;
-        out_dim[3] = (%(width)s != -1) ? %(width)s : (PyGpuArray_DIMS(top)[3] - 1) * dW + (PyGpuArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW;
+        out_dim[3] = (%(width)s != -1) ? %(width)s : (PyGpuArray_DIMS(top)[3] - 1) * dW + (PyGpuArray_DIMS(weights)[wdim-1]-1)*dilW + 1 - 2*padW;
        out_typecode = top->ga.typecode;
        out_context = top->context;
-        if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
+        if (unshared) {
-        {
+            if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
-            PyErr_Format(PyExc_ValueError,
+            {
-                         "GpuCorrMM backprop wrt. inputs: impossible output shape\\n"
+                PyErr_Format(PyExc_ValueError,
-                         "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
+                             "GpuCorrMM backprop wrt. inputs: impossible output shape\\n"
-                         "  weight shape: %%ld x %%ld x %%ld x %%ld\\n"
+                             "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
-                         "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
+                             "  weight shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n"
-                         out_dim[0], out_dim[1], out_dim[2], out_dim[3],
+                             "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
-                         PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1],
+                             out_dim[0], out_dim[1], out_dim[2], out_dim[3],
-                         PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3],
+                             PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1],
-                         PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
+                             PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3],
-                         PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3]);
+                             PyGpuArray_DIMS(weights)[4], PyGpuArray_DIMS(weights)[5],
-            %(fail)s
+                             PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
+                             PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3]);
+                %(fail)s
+            }
+        }
+        else {
+            if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
+            {
+                PyErr_Format(PyExc_ValueError,
+                             "GpuCorrMM backprop wrt. inputs: impossible output shape\\n"
+                             "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
+                             "  weight shape: %%ld x %%ld x %%ld x %%ld\\n"
+                             "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
+                             out_dim[0], out_dim[1], out_dim[2], out_dim[3],
+                             PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1],
+                             PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3],
+                             PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
+                             PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3]);
+                %(fail)s
+            }
        }
        break;
    default:
@@ -786,12 +858,24 @@ class BaseGpuCorrMM(CGpuKernelBase):
    out_dim_size[2] = (size_t)out_dim[2];
    out_dim_size[3] = (size_t)out_dim[3];
+    if (odim == 6) {
+        out_dim_size[4] = (size_t)out_dim[4];
+        out_dim_size[5] = (size_t)out_dim[5];
+    }
    // Prepare output array
-    if (theano_prep_output(&%(out)s, 4, out_dim_size, out_typecode, GA_C_ORDER, out_context) != 0)
+    if (theano_prep_output(&%(out)s, odim, out_dim_size, out_typecode, GA_C_ORDER, out_context) != 0)
    {
-        PyErr_Format(PyExc_RuntimeError,
+        if (odim == 4) {
-                "BaseGpuCorrMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld",
+            PyErr_Format(PyExc_RuntimeError,
-                out_dim[0], out_dim[1], out_dim[2], out_dim[3]);
+                    "BaseGpuCorrMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld",
+                    out_dim[0], out_dim[1], out_dim[2], out_dim[3]);
+        }
+        if (odim == 6) {
+            PyErr_Format(PyExc_RuntimeError,
+                    "BaseGpuCorrMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld %%lld %%lld",
+                    out_dim[0], out_dim[1], out_dim[2], out_dim[3], out_dim[4], out_dim[5]);
+        }
        %(fail)s
    }
    if (!GpuArray_IS_C_CONTIGUOUS(&%(out)s->ga)) {
@@ -800,7 +884,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
    }
    // Call GPU code
-    out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dilH, dilW, padH, padW, numgroups);
+    out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dilH, dilW, padH, padW, numgroups, unshared);
    if (out2==NULL){
       %(fail)s
    }
@@ -859,9 +943,9 @@ class GpuCorrMM(BaseGpuCorrMM):
    """
    def __init__(self, border_mode="valid",
                 subsample=(1, 1),
-                 filter_dilation=(1, 1), num_groups=1):
+                 filter_dilation=(1, 1), num_groups=1, unshared=False):
        super(GpuCorrMM, self).__init__(border_mode, subsample,
-                                        filter_dilation, num_groups)
+                                        filter_dilation, num_groups, unshared)
    def make_node(self, img, kern):
        ctx_name = infer_context_name(img, kern)
@@ -869,8 +953,12 @@ class GpuCorrMM(BaseGpuCorrMM):
        kern = as_gpuarray_variable(kern, ctx_name)
        if img.type.ndim != 4:
            raise TypeError('img must be 4D tensor')
-        if kern.type.ndim != 4:
+        if self.unshared:
-            raise TypeError('kern must be 4D tensor')
+            if kern.type.ndim != 6:
+                raise TypeError('kern must be 6D tensor')
+        else:
+            if kern.type.ndim != 4:
+                raise TypeError('kern must be 4D tensor')
        broadcastable = [img.type.broadcastable[0], kern.type.broadcastable[0],
                         False, False]
@@ -891,12 +979,14 @@ class GpuCorrMM(BaseGpuCorrMM):
        d_bottom = GpuCorrMM_gradInputs(self.border_mode,
                                        self.subsample,
                                        self.filter_dilation,
-                                        self.num_groups)(
+                                        self.num_groups,
+                                        self.unshared)(
            weights, top, bottom.shape[-2:])
        d_weights = GpuCorrMM_gradWeights(self.border_mode,
                                          self.subsample,
                                          self.filter_dilation,
-                                          self.num_groups)(
+                                          self.num_groups,
+                                          self.unshared)(
            bottom, top, weights.shape[-2:])
        return d_bottom, d_weights
@@ -915,10 +1005,12 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM):
    def __init__(self, border_mode="valid",
                 subsample=(1, 1),
                 filter_dilation=(1, 1),
-                 num_groups=1):
+                 num_groups=1,
+                 unshared=False):
        super(GpuCorrMM_gradWeights, self).__init__(border_mode,
                                                    subsample,
-                                                    filter_dilation, num_groups)
+                                                    filter_dilation, num_groups,
+                                                    unshared)
    def make_node(self, img, topgrad, shape=None):
        ctx_name = infer_context_name(img, topgrad)
@@ -938,8 +1030,12 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM):
            assert shape[0].ndim == 0
            assert shape[1].ndim == 0
-        broadcastable = [topgrad.type.broadcastable[1], img.type.broadcastable[1],
+        if self.unshared:
-                         False, False]
+            broadcastable = [topgrad.type.broadcastable[0], False, False,
+                             img.type.broadcastable[1], False, False]
+        else:
+            broadcastable = [topgrad.type.broadcastable[1], img.type.broadcastable[1],
+                             False, False]
        return Apply(self, [img, topgrad] + height_width, [GpuArrayType(dtype=img.dtype,
                                                                        context_name=ctx_name,
                                                                        broadcastable=broadcastable)()])
@@ -958,11 +1054,12 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM):
        d_bottom = GpuCorrMM_gradInputs(self.border_mode,
                                        self.subsample,
                                        self.filter_dilation,
-                                        self.num_groups)(weights,
+                                        self.num_groups,
-                                                         top,
+                                        self.unshared)(weights,
-                                                         bottom.shape[-2:])
+                                                       top,
+                                                       bottom.shape[-2:])
        d_top = GpuCorrMM(
-            self.border_mode, self.subsample, self.filter_dilation, self.num_groups)(bottom, weights)
+            self.border_mode, self.subsample, self.filter_dilation, self.num_groups, self.unshared)(bottom, weights)
        d_height_width = (
            theano.gradient.DisconnectedType()(),
            ) * 2 if len(inp) == 4 else ()
@@ -989,16 +1086,22 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
    def __init__(self, border_mode="valid",
                 subsample=(1, 1),
                 filter_dilation=(1, 1),
-                 num_groups=1):
+                 num_groups=1,
+                 unshared=False):
        super(GpuCorrMM_gradInputs, self).__init__(border_mode, subsample,
-                                                   filter_dilation, num_groups)
+                                                   filter_dilation, num_groups,
+                                                   unshared)
    def make_node(self, kern, topgrad, shape=None):
        ctx_name = infer_context_name(kern, topgrad)
        kern = as_gpuarray_variable(kern, ctx_name)
        topgrad = as_gpuarray_variable(topgrad, ctx_name)
-        if kern.type.ndim != 4:
+        if self.unshared:
-            raise TypeError('kern must be 4D tensor')
+            if kern.type.ndim != 6:
+                raise TypeError('kern must be 6D tensor')
+        else:
+            if kern.type.ndim != 4:
+                raise TypeError('kern must be 4D tensor')
        if topgrad.type.ndim != 4:
            raise TypeError('topgrad must be 4D tensor')
        if shape is None:
@@ -1014,7 +1117,7 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
            broadcastable = [topgrad.type.broadcastable[0], False,
                             False, False]
        else:
-            broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[1],
+            broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[-3],
                             False, False]
        return Apply(self, [kern, topgrad] + height_width, [GpuArrayType(dtype=topgrad.dtype,
                                                                         context_name=ctx_name,
@@ -1034,13 +1137,15 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
        d_weights = GpuCorrMM_gradWeights(self.border_mode,
                                          self.subsample,
                                          self.filter_dilation,
-                                          self.num_groups)(bottom,
+                                          self.num_groups,
-                                                           top,
+                                          self.unshared)(bottom,
-                                                           weights.shape[-2:])
+                                                         top,
+                                                         weights.shape[-2:])
        d_top = GpuCorrMM(self.border_mode,
                          self.subsample,
                          self.filter_dilation,
-                          self.num_groups)(bottom, weights)
+                          self.num_groups,
+                          self.unshared)(bottom, weights)
        d_height_width = (
            theano.gradient.DisconnectedType()(),
            ) * 2 if len(inp) == 4 else ()
@@ -1682,7 +1787,7 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
            broadcastable = [topgrad.type.broadcastable[0], False,
                             False, False, False]
        else:
-            broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[1],
+            broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[-3],
                             False, False, False]
        return Apply(self, [kern, topgrad] + height_width_depth,
                     [GpuArrayType(dtype=topgrad.dtype,

--- a/theano/gpuarray/c_code/corr_gemm.c
+++ b/theano/gpuarray/c_code/corr_gemm.c
@@ -349,7 +349,8 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
                         const size_t dilW = 1,
                         const size_t padH = 0,
                         const size_t padW = 0,
-                         const size_t numgroups = 1)
+                         const size_t numgroups = 1,
+                         const size_t unshared = 0)
 {
    if (PyGpuArray_NDIM(bottom) != 4)
    {
@@ -368,21 +369,35 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
        return NULL;
    }
-    if (PyGpuArray_NDIM(weight) != 4)
+    if (PyGpuArray_NDIM(weight) != (unshared ? 6 : 4))
    {
-        PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires weight of 4D");
+        PyErr_Format(PyExc_ValueError, "GpuCorrMM requires weight of %dD", unshared ? 6 : 4);
        return NULL;
    }
    if (!GpuArray_IS_C_CONTIGUOUS(&weight->ga))
    {
-        PyErr_Format(PyExc_ValueError,
+        if (unshared) {
-                "GpuCorrMM requires weight to be C-contiguous, "
+            PyErr_Format(PyExc_ValueError,
-                "but strides are: %ld %ld %ld %ld\n",
+                    "GpuCorrMM requires weight to be C-contiguous, "
-                PyGpuArray_STRIDES(weight)[0],
+                    "but strides are: %ld %ld %ld %ld %ld %ld\n",
-                PyGpuArray_STRIDES(weight)[1],
+                    PyGpuArray_STRIDES(weight)[0],
-                PyGpuArray_STRIDES(weight)[2],
+                    PyGpuArray_STRIDES(weight)[1],
-                PyGpuArray_STRIDES(weight)[3]);
+                    PyGpuArray_STRIDES(weight)[2],
-        return NULL;
+                    PyGpuArray_STRIDES(weight)[3],
+                    PyGpuArray_STRIDES(weight)[4],
+                    PyGpuArray_STRIDES(weight)[5]);
+            return NULL;
+        }
+        else {  
+            PyErr_Format(PyExc_ValueError,
+                    "GpuCorrMM requires weight to be C-contiguous, "
+                    "but strides are: %ld %ld %ld %ld\n",
+                    PyGpuArray_STRIDES(weight)[0],
+                    PyGpuArray_STRIDES(weight)[1],
+                    PyGpuArray_STRIDES(weight)[2],
+                    PyGpuArray_STRIDES(weight)[3]);
+            return NULL;
+        }
    }
    if (PyGpuArray_NDIM(top) != 4)
@@ -409,10 +424,12 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
    const size_t bottomHeight = PyGpuArray_DIMS(bottom)[2];
    const size_t bottomWidth = PyGpuArray_DIMS(bottom)[3];
    // weights: (nFilters, nChannels, rows, columns)
+    // or (nFilters, out_rows, out_columns, nChannels, rows, columns) -> for unshared
    const size_t nFilters = PyGpuArray_DIMS(weight)[0];
-    const size_t kH = PyGpuArray_DIMS(weight)[2];
-    const size_t kW = PyGpuArray_DIMS(weight)[3];
+    const size_t kH = PyGpuArray_DIMS(weight)[unshared ? 4 : 2];
-    if (nChannels != (PyGpuArray_DIMS(weight)[1] * numgroups)) {
+    const size_t kW = PyGpuArray_DIMS(weight)[unshared ? 5 : 3];
+    if (nChannels != PyGpuArray_DIMS(weight)[unshared ? 3 : 1] * numgroups) {
        PyErr_SetString(PyExc_ValueError,
                "GpuCorrMM images and kernel must have the same stack size\n");
        return NULL;
@@ -435,21 +452,56 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
    const size_t topHeight = _CONV_FLOORDIV_X(topHeightNoDH, dH) + 1;
    const size_t topWidth  = _CONV_FLOORDIV_X(topWidthNoDW, dW) + 1;
 #undef _CONV_FLOORDIV
-    if (batchSize != PyGpuArray_DIMS(top)[0] ||
+    if (unshared) {
-            nFilters != PyGpuArray_DIMS(top)[1] ||
+        if (topHeight != PyGpuArray_DIMS(weight)[1] ||
-            topHeight != PyGpuArray_DIMS(top)[2] ||
+                topWidth != PyGpuArray_DIMS(weight)[2]) {
-            topWidth != PyGpuArray_DIMS(top)[3]) {
+            PyErr_Format(PyExc_ValueError,
-        PyErr_Format(PyExc_ValueError,
+                    "GpuCorrMM regions in kernel must match output regions:\n"
-                "GpuCorrMM shape inconsistency:\n"
+                    "  bottom shape: %ld %ld %ld %ld\n"
-                "  bottom shape: %ld %ld %ld %ld\n"
+                    "  weight shape: %ld %ld %ld %ld %ld %ld"
-                "  weight shape: %ld %ld %ld %ld\n"
+                    " (expected %ld %ld %ld %ld %ld %ld)\n"
-                "  top shape: %ld %ld %ld %ld (expected %ld %ld %ld %ld)\n",
+                    "  top shape(calculated): %ld %ld %ld %ld\n",
-                batchSize, nChannels, bottomHeight, bottomWidth,
+                    batchSize, nChannels, bottomHeight, bottomWidth,
-                nFilters, nChannels / numgroups, kH, kW,
+                    nFilters, PyGpuArray_DIMS(weight)[1],
-                PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
+                    PyGpuArray_DIMS(weight)[2], nChannels / numgroups, kH, kW,
-                PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3],
+                    nFilters, topHeight, topWidth, nChannels / numgroups, kH, kW,
-                batchSize, nFilters, topHeight, topWidth);
+                    batchSize, nFilters, topHeight, topWidth);
-        return NULL;
+            return NULL;
+        }
+        if (batchSize != PyGpuArray_DIMS(top)[0] ||
+                nFilters != PyGpuArray_DIMS(top)[1] ||
+                topHeight != PyGpuArray_DIMS(top)[2] ||
+                topWidth != PyGpuArray_DIMS(top)[3]) {
+            PyErr_Format(PyExc_ValueError,
+                    "GpuCorrMM shape inconsistency:\n"
+                    "  bottom shape: %ld %ld %ld %ld\n"
+                    "  weight shape: %ld %ld %ld %ld %ld %ld\n"
+                    "  top shape: %ld %ld %ld %ld (expected %ld %ld %ld %ld)\n",
+                    batchSize, nChannels, bottomHeight, bottomWidth,
+                    nFilters, topHeight, topWidth, nChannels / numgroups, kH, kW,
+                    PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
+                    PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3],
+                    batchSize, nFilters, topHeight, topWidth);
+            return NULL;
+        }
+    }
+    else{
+        if (batchSize != PyGpuArray_DIMS(top)[0] ||
+                nFilters != PyGpuArray_DIMS(top)[1] ||
+                topHeight != PyGpuArray_DIMS(top)[2] ||
+                topWidth != PyGpuArray_DIMS(top)[3]) {
+            PyErr_Format(PyExc_ValueError,
+                    "GpuCorrMM shape inconsistency:\n"
+                    "  bottom shape: %ld %ld %ld %ld\n"
+                    "  weight shape: %ld %ld %ld %ld\n"
+                    "  top shape: %ld %ld %ld %ld (expected %ld %ld %ld %ld)\n",
+                    batchSize, nChannels, bottomHeight, bottomWidth,
+                    nFilters, nChannels / numgroups, kH, kW,
+                    PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
+                    PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3],
+                    batchSize, nFilters, topHeight, topWidth);
+            return NULL;
+        }
    }
    int err = gpublas_setup(bottom->context->ctx);
@@ -512,19 +564,37 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
                return NULL;
            }
            // Second, gemm
-            for (size_t g = 0; g < numgroups; g++){
+            if (unshared) {
-                err = rgemm(cb_fortran, cb_no_trans, cb_no_trans,
+              for (size_t g = 0; g < numgroups; ++g) {
-                            N_, M_, K_, 1,
+                for (size_t reg = 0; reg < N_; ++reg){
-                            &col->ga, g * group_col_stride, N_,
+                  err = rgemm(cb_fortran, cb_no_trans, cb_no_trans,
-                            &weight->ga, g * group_weight_stride, K_,
+                                      1, M_, K_, 1,
-                            0,
+                                      &col->ga, g * group_col_stride + reg, N_,
-                            &top->ga, n * batch_top_stride + g * group_top_stride, N_);
+                                      &weight->ga, g * group_weight_stride + reg * K_, K_ * N_,
+                                      0,
+                                      &top->ga, n * batch_top_stride + g * group_top_stride + reg, N_);
+                  if (err != GA_NO_ERROR) {
+                      PyErr_Format(PyExc_RuntimeError, "GpuCorrMM forward encountered an error running gemm: %d", err);
+                      Py_DECREF(col);
+                      return NULL;
+                  }
+                }
+              }
            }
-            if (err != GA_NO_ERROR) {
+            else {
-                PyErr_Format(PyExc_RuntimeError,
+              for (size_t g = 0; g < numgroups; ++g){
-                             "GpuCorrMM forward encountered an error running gemm: %d", err);
+                  err = rgemm(cb_fortran, cb_no_trans, cb_no_trans,
-                Py_DECREF(col);
+                              N_, M_, K_, 1,
-                return NULL;
+                              &col->ga, g * group_col_stride, N_,
+                              &weight->ga, g * group_weight_stride, K_,
+                              0,
+                              &top->ga, n * batch_top_stride + g * group_top_stride, N_);
+                if (err != GA_NO_ERROR) {
+                    PyErr_Format(PyExc_RuntimeError, "GpuCorrMM forward encountered an error running gemm: %d", err);
+                    Py_DECREF(col);
+                    return NULL;
+                }
+              }
            }
        }
    }
@@ -557,19 +627,37 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
            // Note that we accumulate into weight. We do so by setting beta = 0
            // for the first iteration and beta = 1 for subsequent ones. (This
            // is faster than setting weight to all zeros before the loop.)
-            for(size_t g = 0; g < numgroups; g++){ 
+            if (unshared) {
-                err = rgemm(cb_fortran, cb_trans, cb_no_trans,
+              for (size_t g = 0; g < numgroups; ++g) {
-                            K_, M_, N_, 1,
+                for (size_t reg = 0; reg < N_; ++reg){
-                            &col->ga, g * group_col_stride, N_,
+                  err = rgemm(cb_fortran, cb_trans, cb_no_trans,
-                            &top->ga, n * batch_top_stride + g * group_top_stride, N_,
+                              K_, M_, 1, 1,
-                            (n == 0) ? 0 : 1,
+                              &col->ga, g * group_col_stride + reg, N_,
-                            &weight->ga, g * group_weight_stride, K_);
+                              &top->ga, n * batch_top_stride + g * group_top_stride + reg, N_,
+                              (n == 0) ? 0 : 1,
+                              &weight->ga, g * group_weight_stride + reg * K_, K_ * N_);
+                  if (err != GA_NO_ERROR) {
+                      PyErr_Format(PyExc_RuntimeError, "GpuCorrMM grad weights encountered an error running gemm: %d", err);
+                      Py_DECREF(col);
+                      return NULL;
+                  }
+                }
+              }
            }
-            if (err != GA_NO_ERROR) {
+            else{
-                PyErr_Format(PyExc_RuntimeError,
+              for(size_t g = 0; g < numgroups; g++){ 
-                             "GpuCorrMM grad weights encountered an error running gemm: %d", err);
+                  err = rgemm(cb_fortran, cb_trans, cb_no_trans,
-                Py_DECREF(col);
+                              K_, M_, N_, 1,
-                return NULL;
+                              &col->ga, g * group_col_stride, N_,
+                              &top->ga, n * batch_top_stride + g * group_top_stride, N_,
+                              (n == 0) ? 0 : 1,
+                              &weight->ga, g * group_weight_stride, K_);
+                if (err != GA_NO_ERROR) {
+                    PyErr_Format(PyExc_RuntimeError, "GpuCorrMM grad weights encountered an error running gemm: %d", err);
+                    Py_DECREF(col);
+                    return NULL;
+                }
+              }
            }
        }
    }
@@ -590,19 +678,37 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
        // Iterate over batch
        for (size_t n = 0; n < batchSize; n++) {
            // gemm into columns
-            for(size_t g = 0; g < numgroups; g++){
+            if (unshared) {
-              err = rgemm(cb_fortran, cb_no_trans, cb_trans,
+              for (size_t g = 0; g < numgroups; ++g){
-                          N_, K_, M_, 1,
+                for (size_t reg = 0; reg < N_; ++reg) {
-                          &top->ga, n * batch_top_stride + g * group_top_stride, N_,
+                  err = rgemm(cb_fortran, cb_no_trans, cb_trans,
-                          &weight->ga, g * group_weight_stride, K_,
+                              1, K_, M_, 1,
-                          0,
+                              &top->ga, n * batch_top_stride + g * group_top_stride + reg, N_,
-                          &col->ga, g * group_col_stride, N_);
+                              &weight->ga, g * group_weight_stride + reg * K_, K_ * N_,
+                              0,
+                              &col->ga, g * group_col_stride + reg, N_);
+                  if (err != GA_NO_ERROR) {
+                      PyErr_Format(PyExc_RuntimeError, "GpuCorrMM grad inputs encountered an error running gemm: %d", err);
+                      Py_DECREF(col);
+                      return NULL;
+                  }
+                }
+              }
            }
-            if (err != GA_NO_ERROR) {
+            else {
-                PyErr_Format(PyExc_RuntimeError,
+              for (size_t g = 0; g < numgroups; ++g){
-                             "GpuCorrMM grad inputs encountered an error running gemm: %d", err);
+                err = rgemm(cb_fortran, cb_no_trans, cb_trans,
-                Py_DECREF(col);
+                            N_, K_, M_, 1,
-                return NULL;
+                            &top->ga, n * batch_top_stride + g * group_top_stride, N_,
+                            &weight->ga, g * group_weight_stride, K_,
+                            0,
+                            &col->ga, g * group_col_stride, N_);
+                if (err != GA_NO_ERROR) {
+                    PyErr_Format(PyExc_RuntimeError, "GpuCorrMM grad inputs encountered an error running gemm: %d", err);
+                    Py_DECREF(col);
+                    return NULL;
+                }
+              }
            }
            // col2im back to the data
            err = col2im(&col->ga, nChannels, bottomHeight, bottomWidth,

--- a/theano/gpuarray/dnn.py
+++ b/theano/gpuarray/dnn.py
@@ -3035,6 +3035,9 @@ def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
    if version(raises=False) < 6000 and op.filter_dilation != (1, 1):
        return None
+    if op.unshared:
+        return None
    inp1 = inputs[0]
    inp2 = inputs[1]
@@ -3129,6 +3132,8 @@ def local_abstractconv_cudnn(node):
    ctx = infer_context_name(*node.inputs)
    if not isinstance(node.inputs[0].type, GpuArrayType):
        return
+    if node.op.unshared:
+        return None
    if isinstance(node.op, AbstractConv2d):
        return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
    elif isinstance(node.op, AbstractConv3d):
@@ -3143,6 +3148,8 @@ def local_abstractconv_cudnn_alt(node):
    if version(raises=False) < 6000 and node.op.filter_dilation != (1, 1):
        return None
+    if node.op.unshared:
+        return None
    inp1 = node.inputs[0]
    inp2 = node.inputs[1]
@@ -3349,6 +3356,8 @@ def local_abstractconv_gw_cudnn(node):
    ctx = infer_context_name(*node.inputs)
    if not isinstance(node.inputs[0].type, GpuArrayType):
        return
+    if node.op.unshared:
+        return None
    if isinstance(node.op, AbstractConv2d_gradWeights):
        return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
    elif isinstance(node.op, AbstractConv3d_gradWeights):
@@ -3360,6 +3369,8 @@ def local_abstractconv_gi_cudnn(node):
    ctx = infer_context_name(*node.inputs)
    if not isinstance(node.inputs[0].type, GpuArrayType):
        return
+    if node.op.unshared:
+        return None
    if isinstance(node.op, AbstractConv2d_gradInputs):
        return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
    elif isinstance(node.op, AbstractConv3d_gradInputs):

--- a/theano/gpuarray/opt.py
+++ b/theano/gpuarray/opt.py
@@ -1595,12 +1595,17 @@ def local_abstractconv_gemm(node):
    border_mode = node.op.border_mode
    subsample = node.op.subsample
    filter_dilation = node.op.filter_dilation
+    num_groups = node.op.num_groups
+    unshared = node.op.unshared
-    if ((border_mode == 'full') and (subsample == (1, 1)) and node.op.num_groups == 1):
+    flip = (slice(None),) * (kern.ndim - 2) + \
+        (slice(None, None, -1),) * 2
+    kern_axes = (1, 0) + tuple(i for i in range(2, kern.ndim))
+    if ((border_mode == 'full') and (subsample == (1, 1)) and num_groups == 1 and not unshared):
        if not node.op.filter_flip:
-            kern = kern[:, :, ::-1, ::-1]
+            kern = kern[flip]
        # need to dimshuffle the kernel for full convolution
-        kern = kern.dimshuffle(1, 0, 2, 3)
+        kern = kern.dimshuffle(kern_axes)
        # call GpuCorrMM_gradInputs
        rval = GpuCorrMM_gradInputs('valid',
                                    subsample,
@@ -1609,13 +1614,14 @@ def local_abstractconv_gemm(node):
    else:
        # need to flip the kernel if necessary
        if node.op.filter_flip:
-            kern = kern[:, :, ::-1, ::-1]
+            kern = kern[flip]
        # By default use GpuCorrMM
        rval = GpuCorrMM(border_mode,
                         subsample,
                         filter_dilation,
-                         node.op.num_groups)(gpu_contiguous(img),
+                         num_groups,
-                                             gpu_contiguous(kern))
+                         unshared)(gpu_contiguous(img),
+                                   gpu_contiguous(kern))
        # call GpuCorrMM_gradWeights if good
        # (the latter is faster if batchsize * kernelHeight * kernelWidth
@@ -1628,11 +1634,12 @@ def local_abstractconv_gemm(node):
                (node.op.kshp is not None) and
                (None not in node.op.kshp) and
                border_mode != "half" and
-                node.op.num_groups == 1):
+                num_groups == 1 and
+                not unshared):
            # we know the kernel and output size
-            prod1 = node.op.kshp[0] * node.op.kshp[1]
+            prod1 = node.op.kshp[0] * node.op.kshp[-3]
            prod2 = ((node.op.imshp[-2] - node.op.kshp[0] + 1) *
-                     (node.op.imshp[-1] - node.op.kshp[1] + 1))
+                     (node.op.imshp[-1] - node.op.kshp[-3] + 1))
            if (None not in node.op.imshp[:1]):
                # we also know batchsize and input channels
                prod1 *= node.op.imshp[0]
@@ -1641,7 +1648,8 @@ def local_abstractconv_gemm(node):
            if prod1 > prod2:
                rval = GpuCorrMM_gradWeights(border_mode,
                                             subsample,
-                                             filter_dilation)(
+                                             filter_dilation,
+                                             unshared)(
                    gpu_contiguous(img.dimshuffle(1, 0, 2, 3)),
                    gpu_contiguous(kern.dimshuffle(1, 0, 2, 3)))
                # (we need to wrap the result in as_gpuarray_variable,
@@ -1690,8 +1698,9 @@ def local_abstractconv_gemm_alt(node):
    subsample = node.op.subsample
    filter_dilation = node.op.filter_dilation
    num_groups = node.op.num_groups
+    unshared = node.op.unshared
-    if border_mode == 'full' and subsample == (1, 1) and num_groups == 1:
+    if border_mode == 'full' and subsample == (1, 1) and num_groups == 1 and not unshared:
        if not node.op.filter_flip:
            kern = kern[:, :, ::-1, ::-1]
@@ -1702,7 +1711,7 @@ def local_abstractconv_gemm_alt(node):
            gpu_contiguous(kern), gpu_contiguous(img))
    elif (border_mode == 'valid' and subsample == (1, 1) and filter_dilation == (1, 1) and
-          num_groups == 1):
+          num_groups == 1 and not unshared):
        if node.op.filter_flip:
            kern = kern[:, :, ::-1, ::-1]
@@ -1896,10 +1905,13 @@ def local_abstractconv_gradweights_gemm(node):
    rval = GpuCorrMM_gradWeights(border_mode=node.op.border_mode,
                                 subsample=node.op.subsample,
                                 filter_dilation=node.op.filter_dilation,
-                                 num_groups=node.op.num_groups)(
+                                 num_groups=node.op.num_groups,
+                                 unshared=node.op.unshared)(
        gpu_contiguous(img), gpu_contiguous(topgrad), shape)
+    flip = (slice(None),) * (rval.ndim - 2) + \
+        (slice(None, None, -1),) * 2
    if node.op.filter_flip:
-        rval = rval[:, :, ::-1, ::-1]
+        rval = rval[flip]
    rval = tensor.patternbroadcast(rval, node.outputs[0].broadcastable)
    rval = as_gpuarray_variable(rval, context_name=ctx)
    return [rval]
@@ -1918,9 +1930,10 @@ def local_abstractconv_gemm_gradweights_alt(node):
    subsample = node.op.subsample
    filter_dilation = node.op.filter_dilation
    num_groups = node.op.num_groups
+    unshared = node.op.unshared
    if(border_mode == 'valid' and subsample == (1, 1) and filter_dilation == (1, 1) and
-       num_groups == 1):
+       num_groups == 1 and not unshared):
        rval = GpuCorrMM(border_mode,
                         subsample,
                         filter_dilation)(
@@ -2001,12 +2014,15 @@ def local_abstractconv_gradinputs_gemm(node):
        return None
    if node.op.filter_flip:
-        kern = kern[:, :, ::-1, ::-1]
+        flip = (slice(None),) * (kern.ndim - 2) + \
+            (slice(None, None, -1),) * 2
+        kern = kern[flip]
    rval = GpuCorrMM_gradInputs(border_mode=node.op.border_mode,
                                subsample=node.op.subsample,
                                filter_dilation=node.op.filter_dilation,
-                                num_groups=node.op.num_groups)(
+                                num_groups=node.op.num_groups,
+                                unshared=node.op.unshared)(
        gpu_contiguous(kern), gpu_contiguous(topgrad), shape)
    return [rval]
@@ -2023,8 +2039,9 @@ def local_abstractconv_gradinputs_gemm_alt(node):
    subsample = node.op.subsample
    filter_dilation = node.op.filter_dilation
    num_groups = node.op.num_groups
+    unshared = node.op.unshared
-    if border_mode == 'valid' and subsample == (1, 1) and num_groups == 1:
+    if border_mode == 'valid' and subsample == (1, 1) and num_groups == 1 and not unshared:
        if not node.op.filter_flip:
            kern = kern[:, :, ::-1, ::-1]

--- a/theano/gpuarray/tests/test_gemmcorr.py
+++ b/theano/gpuarray/tests/test_gemmcorr.py
@@ -8,10 +8,10 @@ from theano.tests import unittest_tools as utt
 from theano.tensor.nnet.corr import CorrMM, CorrMM_gradWeights, CorrMM_gradInputs
-from ..type import gpuarray_shared_constructor
+from theano.gpuarray.type import gpuarray_shared_constructor
-from ..blas import GpuCorrMM, GpuCorrMM_gradWeights, GpuCorrMM_gradInputs
+from theano.gpuarray.blas import GpuCorrMM, GpuCorrMM_gradWeights, GpuCorrMM_gradInputs
-from .config import mode_with_gpu, mode_without_gpu, ref_cast
+from config import mode_with_gpu, mode_without_gpu, ref_cast
-from theano.tensor.nnet.tests.test_abstract_conv import Grouped_conv_noOptim
+from theano.tensor.nnet.tests.test_abstract_conv import Grouped_conv_noOptim, TestUnsharedConv
 class TestCorrMM(unittest.TestCase):
@@ -20,9 +20,13 @@ class TestCorrMM(unittest.TestCase):
                       border_mode='valid',
                       filter_dilation=(1, 1),
                       subsample=(1, 1),
+                       unshared=False,
                       verify_grad=False):
        inputs_shape = [inputs_shape[i] for i in (0, 3, 1, 2)]
-        filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)]
+        if unshared:
+            filters_shape = [filters_shape[i] for i in (0, 1, 2, 5, 3, 4)]
+        else:
+            filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)]
        inputs_val = np.random.random(inputs_shape).astype(config.floatX)
        filters_val = np.random.random(filters_shape).astype(config.floatX)
@@ -32,13 +36,15 @@ class TestCorrMM(unittest.TestCase):
        conv_ref = CorrMM(border_mode=border_mode,
                          filter_dilation=filter_dilation,
-                          subsample=subsample)(ref_cast(inputs),
+                          subsample=subsample,
-                                               ref_cast(filters))
+                          unshared=unshared)(ref_cast(inputs),
+                                             ref_cast(filters))
        f_ref = theano.function([], conv_ref, mode=mode_without_gpu)
        conv = GpuCorrMM(border_mode=border_mode,
                         filter_dilation=filter_dilation,
-                         subsample=subsample)(inputs, filters)
+                         subsample=subsample,
+                         unshared=unshared)(inputs, filters)
        f = theano.function([], conv, mode=mode_with_gpu)
        res_ref = f_ref()
@@ -48,7 +54,8 @@ class TestCorrMM(unittest.TestCase):
        if verify_grad:
            utt.verify_grad(GpuCorrMM(border_mode=border_mode,
                                      filter_dilation=filter_dilation,
-                                      subsample=subsample),
+                                      subsample=subsample,
+                                      unshared=unshared),
                            [inputs_val, filters_val], mode=mode_with_gpu)
    def test_valid(self):
@@ -57,12 +64,6 @@ class TestCorrMM(unittest.TestCase):
        self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
                            filters_shape=(10, 6, 12, 1),
                            subsample=(2, 2))
-        self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
-                            filters_shape=(10, 6, 12, 1),
-                            subsample=(2, 2))
-        self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
-                            filters_shape=(10, 6, 12, 1),
-                            subsample=(3, 3))
        self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
                            filters_shape=(10, 6, 12, 1),
                            subsample=(3, 3))
@@ -117,6 +118,41 @@ class TestCorrMM(unittest.TestCase):
                                    border_mode=border_mode,
                                    verify_grad=True)
+    def test_unshared(self):
+        self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
+                            filters_shape=(10, 15, 1, 6, 12, 1),
+                            unshared=True)
+        self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
+                            filters_shape=(10, 8, 1, 6, 12, 1),
+                            subsample=(2, 2), unshared=True)
+        self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
+                            filters_shape=(10, 5, 1, 6, 12, 1),
+                            subsample=(3, 3), unshared=True)
+        self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
+                            filters_shape=(10, 5, 1, 6, 12, 1),
+                            subsample=(3, 2), unshared=True)
+        self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
+                            filters_shape=(10, 15, 1, 6, 12, 1),
+                            subsample=(1, 2), unshared=True)
+        self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
+                            filters_shape=(10, 15, 1, 6, 12, 1),
+                            border_mode='valid', unshared=True)
+        self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
+                            filters_shape=(10, 21, 13, 6, 12, 1),
+                            border_mode='half', unshared=True)
+        self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
+                            filters_shape=(10, 25, 23, 6, 12, 1),
+                            border_mode='full', unshared=True)
+        self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
+                            filters_shape=(10, 15, 1, 6, 12, 1),
+                            border_mode=(0, 0), unshared=True)
+        self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
+                            filters_shape=(10, 17, 5, 6, 12, 1),
+                            border_mode=(1, 2), unshared=True)
+        self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
+                            filters_shape=(10, 21, 5, 6, 12, 1),
+                            border_mode=(3, 2), unshared=True)
    def run_gradweight(self, inputs_shape, filters_shape, dCdH_shape,
                       subsample=(1, 1)):
        inputs_shape = [inputs_shape[i] for i in (0, 3, 1, 2)]
@@ -227,3 +263,17 @@ class TestGroupGpuCorr2d(Grouped_conv_noOptim):
    conv_op = GpuCorrMM
    conv_gradw_op = GpuCorrMM_gradWeights
    conv_gradi_op = GpuCorrMM_gradInputs
+    flip_filter = True
+    is_dnn = False
+class TestUnsharedGpuCorr2d(TestUnsharedConv):
+    mode = theano.compile.get_mode("FAST_RUN")
+    conv2d = GpuCorrMM
+    conv2d_gradw = GpuCorrMM_gradWeights
+    conv2d_gradi = GpuCorrMM_gradInputs
+    conv2d_op = GpuCorrMM
+    conv2d_gradw_op = GpuCorrMM_gradWeights
+    conv2d_gradi_op = GpuCorrMM_gradInputs
+    flip_filter = True
+    is_dnn = False
--- a/theano/tensor/nnet/__init__.py
+++ b/theano/tensor/nnet/__init__.py
@@ -37,7 +37,7 @@ from .abstract_conv import separable_conv2d
 def conv2d(input, filters, input_shape=None, filter_shape=None,
           border_mode='valid', subsample=(1, 1), filter_flip=True,
-           image_shape=None, filter_dilation=(1, 1), num_groups=1, **kwargs):
+           image_shape=None, filter_dilation=(1, 1), num_groups=1, unshared=False, **kwargs):
    """
    This function will build the symbolic graph for convolving a mini-batch of a
    stack of 2D inputs with a set of 2D filters. The implementation is modelled
@@ -51,18 +51,22 @@ def conv2d(input, filters, input_shape=None, filter_shape=None,
        (batch size, input channels, input rows, input columns).
        See the optional parameter ``input_shape``.
-    filters: symbolic 4D tensor
+    filters: symbolic 4D or 6D tensor
        Set of filters used in CNN layer of shape
-        (output channels, input channels, filter rows, filter columns).
+        (output channels, input channels, filter rows, filter columns)
+        for normal convolution and
+        (output channels, output rows, output columns, input channels,
+        filter rows, filter columns)
+        for unshared convolution.
        See the optional parameter ``filter_shape``.
-    input_shape: None, tuple/list of len 4 of int or Constant variable
+    input_shape: None, tuple/list of len 4 of int or Constant variable
        The shape of the input parameter.
        Optional, possibly used to choose an optimal implementation.
        You can give ``None`` for any element of the list to specify that this
        element is not known at compile time.
-    filter_shape: None, tuple/list of len 4 of int or Constant variable
+    filter_shape: None, tuple/list of len 4 or 6 of int or Constant variable
        The shape of the filters parameter.
        Optional, possibly used to choose an optimal implementation.
        You can give ``None`` for any element of the list to specify that this
@@ -105,6 +109,11 @@ def conv2d(input, filters, input_shape=None, filter_shape=None,
        Divides the image, kernel and output tensors into num_groups
        separate groups. Each which carry out convolutions separately
+    unshared: bool
+        If true, then unshared or 'locally connected' convolution will be
+        performed. A different kernel will be used for each region of the
+        input.
    kwargs: Any other keyword arguments are accepted for backwards
            compatibility, but will be ignored.
@@ -154,12 +163,12 @@ def conv2d(input, filters, input_shape=None, filter_shape=None,
    return abstract_conv2d(input, filters, input_shape, filter_shape,
                           border_mode, subsample, filter_flip,
-                           filter_dilation, num_groups)
+                           filter_dilation, num_groups, unshared)
 def conv2d_transpose(input, filters, output_shape, filter_shape=None,
                     border_mode='valid', input_dilation=(1, 1),
-                     filter_flip=True, filter_dilation=(1, 1), num_groups=1):
+                     filter_flip=True, filter_dilation=(1, 1), num_groups=1, unshared=False):
    """
    This function will build the symbolic graph for applying a transposed
    convolution over a mini-batch of a stack of 2D inputs with a set of 2D
@@ -215,6 +224,11 @@ def conv2d_transpose(input, filters, output_shape, filter_shape=None,
        Divides the image, kernel and output tensors into num_groups
        separate groups. Each which carry out convolutions separately
+    unshared: bool
+        If true, then unshared or 'locally connected' convolution will be
+        performed. A different kernel will be used for each region of the
+        input.
    Returns
    -------
    Symbolic 4D tensor
@@ -242,4 +256,5 @@ def conv2d_transpose(input, filters, output_shape, filter_shape=None,
                                  subsample=input_dilation,
                                  filter_flip=filter_flip,
                                  filter_dilation=filter_dilation,
-                                  num_groups=num_groups)
+                                  num_groups=num_groups,
+                                  unshared=unshared)
--- a/theano/tensor/nnet/abstract_conv.py
+++ b/theano/tensor/nnet/abstract_conv.py
@@ -44,9 +44,13 @@ def get_conv_output_shape(image_shape, kernel_shape,
        to: batch size, number of input channels, height and width (and
        possibly depth) of the image. None where undefined.
    kernel_shape: tuple of int (symbolic or numeric) corresponding to the
-        kernel shape. Its four (or five) elements must correspond respectively
+        kernel shape. For a normal convolution, its four (or five) elements
-        to: number of output channels, number of input channels, height and
+        must correspond respectively to: number of output channels, number of
-        width (and possibly depth) of the kernel. None where undefined.
+        input channels, height and width (and possibly depth) of the kernel.
+        For an unshared convolution, its six elements must correspond to:
+        number of output channels, height and width
+        of the output, number of input channels, height and width of the kernel.
+        None where undefined.
    border_mode: string, int (symbolic or numeric) or tuple of int (symbolic
        or numeric). If it is a string, it must be 'valid', 'half' or 'full'.
        If it is a tuple, its two (or three) elements respectively correspond
@@ -65,7 +69,10 @@ def get_conv_output_shape(image_shape, kernel_shape,
    """
    bsize, imshp = image_shape[0], image_shape[2:]
-    nkern, kshp = kernel_shape[0], kernel_shape[2:]
+    convdim = len(image_shape) - 2
+    nkern, kshp = kernel_shape[0], kernel_shape[-convdim:]
    if filter_dilation is None:
        filter_dilation = np.ones(len(subsample), dtype='int')
@@ -139,7 +146,7 @@ def get_conv_shape_1axis(image_shape, kernel_shape, border_mode,
 def get_conv_gradweights_shape(image_shape, top_shape,
                               border_mode, subsample,
                               filter_dilation=None,
-                               num_groups=1):
+                               num_groups=1, unshared=False):
    """
    This function tries to compute the kernel shape of convolution gradWeights.
@@ -194,7 +201,10 @@ def get_conv_gradweights_shape(image_shape, top_shape,
        out_shp = tuple(get_conv_gradweights_shape_1axis(
            imshp[i], topshp[i], border_mode,
            subsample[i], filter_dilation[i]) for i in range(len(subsample)))
-    return (nchan, nkern) + out_shp
+    if unshared:
+        return (nchan,) + top_shape[2:] + (nkern,) + out_shp
+    else:
+        return (nchan, nkern) + out_shp
 def get_conv_gradweights_shape_1axis(image_shape, top_shape, border_mode,
@@ -289,7 +299,9 @@ def get_conv_gradinputs_shape(kernel_shape, top_shape,
    """
    bsize, topshp = top_shape[0], top_shape[2:]
-    nkern, kshp = kernel_shape[1], kernel_shape[2:]
+    convdim = len(topshp) - 2
+    nkern, kshp = kernel_shape[1], kernel_shape[-convdim:]
    if filter_dilation is None:
        filter_dilation = np.ones(len(subsample), dtype='int')
@@ -522,7 +534,8 @@ def conv2d(input,
           subsample=(1, 1),
           filter_flip=True,
           filter_dilation=(1, 1),
-           num_groups=1):
+           num_groups=1,
+           unshared=False):
    """This function will build the symbolic graph for convolving a mini-batch of a
    stack of 2D inputs with a set of 2D filters. The implementation is modelled
    after Convolutional Neural Networks (CNN).
@@ -538,7 +551,8 @@ def conv2d(input,
                             subsample=subsample,
                             filter_flip=filter_flip,
                             filter_dilation=filter_dilation,
-                             num_groups=num_groups)
+                             num_groups=num_groups,
+                             unshared=unshared)
    return conv_op(input, filters)
@@ -878,7 +892,6 @@ def conv3d(input,
        version until it is released.
    """
    input = as_tensor_variable(input)
    filters = as_tensor_variable(filters)
    conv_op = AbstractConv3d(imshp=input_shape,
@@ -899,7 +912,8 @@ def conv2d_grad_wrt_inputs(output_grad,
                           subsample=(1, 1),
                           filter_flip=True,
                           filter_dilation=(1, 1),
-                           num_groups=1):
+                           num_groups=1,
+                           unshared=False):
    """Compute conv output gradient w.r.t its inputs
    This function builds the symbolic graph for getting the
@@ -916,10 +930,14 @@ def conv2d_grad_wrt_inputs(output_grad,
        will be upsampled or the output gradient of the convolution
        whose gradient will be taken with respect to the input of the
        convolution.
-    filters : symbolic 4D tensor
+    filters : symbolic 4D or 6D tensor
-        set of filters used in CNN layer of shape (output channels,
+        Set of filters used in CNN layer of shape
-        input channels, filter rows, filter columns).  See the
+        (output channels, input channels, filter rows, filter columns)
-        optional parameter ``filter_shape``.
+        for normal convolution and
+        (output channels, output rows, output columns, input channels,
+        filter rows, filter columns)
+        for unshared convolution.
+        See the optional parameter ``filter_shape``.
    input_shape : [None/int/Constant] * 2 + [Tensor/int/Constant] * 2
        The shape of the input (upsampled) parameter.
        A tuple/list of len 4, with the first two dimensions
@@ -928,8 +946,9 @@ def conv2d_grad_wrt_inputs(output_grad,
        Not Optional, since given the output_grad shape
        and the subsample values, multiple input_shape may be
        plausible.
-    filter_shape : None or [None/int/Constant] * 4
+    filter_shape : None or [None/int/Constant] * (4 or 6)
-        The shape of the filters parameter. None or a tuple/list of len 4.
+        The shape of the filters parameter. None or a tuple/list of len 4 or a
+        tuple/list of len 6 (for unshared convolution).
        Optional, possibly used  to choose an optimal implementation.
        You can give ``None`` for any element of the list to specify that
        this element is not known at compile time.
@@ -975,6 +994,10 @@ def conv2d_grad_wrt_inputs(output_grad,
    num_groups : int
        Divides the image, kernel and output tensors into num_groups
        separate groups. Each which carry out convolutions separately
+    unshared : bool
+        If true, then unshared or 'locally connected' convolution will be
+        performed. A different kernel will be used for each region of the
+        input.
    Returns
    -------
@@ -1012,6 +1035,10 @@ def conv2d_grad_wrt_inputs(output_grad,
        for dim in [0, 1, 2, 3]:
            assert isinstance(filter_shape[dim], (theano.tensor.TensorConstant,
                                                  integer_types, type(None)))
+        if unshared:
+            for dim in [4, 5]:
+                assert isinstance(filter_shape[dim], (theano.tensor.TensorConstant,
+                                                      integer_types, type(None)))
    # setting the last two dimensions of input_shape to None, if
    # the type of these dimensions is TensorVariable.
@@ -1026,7 +1053,8 @@ def conv2d_grad_wrt_inputs(output_grad,
                                              subsample=subsample,
                                              filter_flip=filter_flip,
                                              filter_dilation=filter_dilation,
-                                              num_groups=num_groups)
+                                              num_groups=num_groups,
+                                              unshared=unshared)
    return grad_input_op(filters, output_grad, input_shape[-2:])
@@ -1179,7 +1207,8 @@ def conv2d_grad_wrt_weights(input,
                            subsample=(1, 1),
                            filter_flip=True,
                            filter_dilation=(1, 1),
-                            num_groups=1):
+                            num_groups=1,
+                            unshared=False):
    """Compute conv output gradient w.r.t its weights
    This function will build the symbolic graph for getting the
@@ -1195,10 +1224,10 @@ def conv2d_grad_wrt_weights(input,
        mini-batch of feature map stacks, of shape (batch size, input
        channels, input rows, input columns).  This is the gradient of
        the output of convolution.
-    filter_shape : [None/int/Constant] * 2 + [Tensor/int/Constant] * 2
+    filter_shape : [None/int/Constant] * (2 or 4) + [Tensor/int/Constant] * 2
-        The shape of the filter parameter.  A tuple/list of len 4, with the
+        The shape of the filter parameter.  A tuple/list of len 4 or 6
-        first two dimensions being None or int or Constant and the last two
+        (for unshared), with the first 2 (or 4) dimensions being None, int or
-        dimensions being Tensor or int or Constant.
+        Constant and the last two dimensions being Tensor or int or Constant.
        Not Optional, since given the output_grad shape and
        the input_shape, multiple filter_shape may be plausible.
    input_shape : None or [None/int/Constant] * 4
@@ -1247,13 +1276,19 @@ def conv2d_grad_wrt_weights(input,
    num_groups : int
        Divides the image, kernel and output tensors into num_groups
        separate groups. Each which carry out convolutions separately
+    unshared : bool
+        If true, then unshared or 'locally connected' convolution will be
+        performed. A different kernel will be used for each region of the
+        input.
    Returns
    -------
-    symbolic 4D tensor
+    symbolic 4D tensor or 6D tensor
        set of feature maps generated by convolutional layer. Tensor
        is of shape (batch size, output channels, output rows, output
-        columns)
+        columns) for normal convolution and
+        (output channels, output rows, output columns, input channels,
+        filter rows, filter columns) for unshared convolution
    Notes
    -----
@@ -1274,7 +1309,11 @@ def conv2d_grad_wrt_weights(input,
    for dim in [0, 1]:
        assert isinstance(filter_shape[dim], (theano.tensor.TensorConstant,
                                              integer_types, type(None)))
-    for dim in [2, 3]:
+    if unshared:
+        for dim in [2, 3]:
+            assert isinstance(filter_shape[dim], (theano.tensor.TensorConstant,
+                                                  integer_types, type(None)))
+    for dim in [-2, -1]:
        assert isinstance(filter_shape[dim], (theano.tensor.TensorVariable,
                                              theano.tensor.TensorConstant,
                                              integer_types))
@@ -1288,7 +1327,7 @@ def conv2d_grad_wrt_weights(input,
    # setting the last two dimensions of filter_shape to None, if
    # the type of these dimensions is TensorVariable.
    numerical_filter_shape = list(filter_shape)
-    for dim in [2, 3]:
+    for dim in [-2, -1]:
        if isinstance(filter_shape[dim], theano.tensor.TensorVariable):
            numerical_filter_shape[dim] = None
@@ -1298,7 +1337,8 @@ def conv2d_grad_wrt_weights(input,
                                               subsample=subsample,
                                               filter_flip=filter_flip,
                                               filter_dilation=filter_dilation,
-                                               num_groups=num_groups)
+                                               num_groups=num_groups,
+                                               unshared=unshared)
    return gradWeight_op(input, output_grad, filter_shape[-2:])
@@ -1631,7 +1671,8 @@ class BaseAbstractConv(Op):
        element is not known at compile time.
        imshp is defined w.r.t the forward conv.
-     kshp: None, tuple/list of len ``(2 + convdim)`` of int or Constant variable
+     kshp: None, tuple/list of len ``(2 + convdim)`` or ``(2 + 2 * convdim)``
+        (for unshared) of int or Constant variable
        The shape of the filters parameter.
        Optional, possibly used to choose an optimal implementation.
        You can give ``None`` for any element of the list to specify that this
@@ -1670,14 +1711,20 @@ class BaseAbstractConv(Op):
    filter_dilation: tuple of len ``convdim``
        Factor by which to subsample (stride) the input.
        Also called dilation factor.
+    unshared: bool
+        If true, then unshared or 'locally connected' convolution will be
+        performed. A different kernel will be used for each region of the
+        input.
    """
    check_broadcast = False
    __props__ = ('convdim', 'border_mode', 'subsample', 'filter_flip',
-                 'imshp', 'kshp', 'filter_dilation', 'num_groups')
+                 'imshp', 'kshp', 'filter_dilation', 'num_groups', 'unshared')
    def __init__(self, convdim,
                 imshp=None, kshp=None, border_mode="valid",
-                 subsample=None, filter_flip=True, filter_dilation=None, num_groups=1):
+                 subsample=None, filter_flip=True, filter_dilation=None, num_groups=1,
+                 unshared=False):
        self.convdim = convdim
        if convdim not in (2, 3):
@@ -1718,7 +1765,10 @@ class BaseAbstractConv(Op):
                            ValueError("imshp should be None or a tuple of "
                                       "constant int values"),
                            sys.exc_info()[2])
-        self.kshp = tuple(kshp) if kshp else (None,) * (2 + convdim)
+        if kshp:
+            self.kshp = tuple(kshp)
+        else:
+            self.kshp = (None,) * ((2 + 2 * convdim) if unshared else (2 + convdim))
        for kshp_i in self.kshp:
            if kshp_i is not None:
                # Components of kshp should be constant or ints
@@ -1742,6 +1792,10 @@ class BaseAbstractConv(Op):
        if num_groups < 1:
            raise ValueError("num_groups must have value greater than zero")
        self.num_groups = num_groups
+        if unshared and self.convdim != 2:
+            raise NotImplementedError('Unshared convolution not implemented for %dD'
+                                      % self.convdim)
+        self.unshared = unshared
    def do_constant_folding(self, node):
        # Disable constant folding since there is no implementation.
@@ -1768,11 +1822,10 @@ class BaseAbstractConv(Op):
            raise NotImplementedError(
                'flops not implemented for convdim={}', self.convdim)
-    def conv(self, img, kern, mode="valid", dilation=1, num_groups=1):
+    def conv(self, img, kern, mode="valid", dilation=1, num_groups=1, unshared=False, direction="forward"):
        """
        Basic slow Python 2D or 3D convolution for DebugMode
        """
        if not imported_scipy_signal:
            raise NotImplementedError(
                "AbstractConv perform requires the python package"
@@ -1787,18 +1840,27 @@ class BaseAbstractConv(Op):
            raise ValueError(
                'invalid dilation {}, expected {} values'.format(dilation,
                                                                 self.convdim))
+        if unshared and direction == "backprop weights":
+            if mode != "valid":
+                raise ValueError('conv mode for unshared backprop wrt weights must be "valid"')
+            # Do a transpose later to bring it to required shape
+            out_shape = (img.shape[0], kern.shape[0],
+                         kern.shape[2], kern.shape[3],
+                         img.shape[2] - kern.shape[2] + 1,
+                         img.shape[3] - kern.shape[3] + 1)
+        else:
+            out_shape = get_conv_output_shape(img.shape, kern.shape,
+                                              mode, [1] * self.convdim, dilation)
-        out_shape = get_conv_output_shape(img.shape, kern.shape,
-                                          mode, [1] * self.convdim, dilation)
-        out = np.zeros(out_shape, dtype=img.dtype)
        dil_kern_shp = kern.shape[:-self.convdim] + tuple(
            (kern.shape[-self.convdim + i] - 1) * dilation[i] + 1
            for i in range(self.convdim))
        dilated_kern = np.zeros(dil_kern_shp, dtype=kern.dtype)
-        dilated_kern[(slice(None), slice(None)) +
+        dilated_kern[(slice(None),) * (dilated_kern.ndim - self.convdim) +
                     tuple(slice(None, None, dilation[i]) for i in range(self.convdim))
                     ] = kern
+        out = np.zeros(out_shape, dtype=img.dtype)
        if img.shape[1] % self.num_groups != 0:
            raise ValueError(
@@ -1823,11 +1885,19 @@ class BaseAbstractConv(Op):
                    for g in xrange(self.num_groups):
                        for n in xrange(output_channel_offset):
                            for im0 in xrange(input_channel_offset):
-                                # some cast generates a warning here
+                                if unshared:
-                                out[b, g * output_channel_offset + n, ...] += _convolve2d(img[b, g * input_channel_offset + im0, ...],
+                                    out[b, g * output_channel_offset + n, ...] += self.unshared2d(img[b, g * input_channel_offset + im0, ...],
-                                                                                          dilated_kern[g * output_channel_offset + n,
+                                                                                                  dilated_kern[g * output_channel_offset + n, im0, ...],
-                                                                                          im0, ...], 1, val, bval, 0)
+                                                                                                  out_shape[2:], direction)
+                                else:
+                                    # some cast generates a warning here
+                                    out[b, g * output_channel_offset + n, ...] += _convolve2d(img[b, g * input_channel_offset + im0, ...],
+                                                                                              dilated_kern[g * output_channel_offset + n, im0, ...],
+                                                                                              1, val, bval, 0)
        elif self.convdim == 3:
+            if unshared:
+                raise NotImplementedError('Unshared 3D convolution is not implemented')
            for b in xrange(img.shape[0]):
                for g in xrange(self.num_groups):
                    for n in xrange(output_channel_offset):
@@ -1839,6 +1909,35 @@ class BaseAbstractConv(Op):
            raise NotImplementedError('only 2D and 3D convolution are implemented')
        return out
+    def unshared2d(self, inp, kern, out_shape, direction="forward"):
+        '''
+        Basic slow Python unshared 2d convolution.
+        '''
+        if self.convdim != 2:
+            raise NotImplementedError('Unshared convolution not implemented for %dD'
+                                      % self.convdim)
+        out = np.zeros(out_shape, dtype=inp.dtype)
+        if direction == "forward":
+            for row in xrange(out_shape[0]):
+                for col in xrange(out_shape[1]):
+                    out[row, col] = np.sum(np.multiply(inp[row:row + kern.shape[2],
+                                                       col:col + kern.shape[3]],
+                                           kern[row, col, ::-1, ::-1]))
+        elif direction == "backprop weights":
+            for row in xrange(out_shape[0]):
+                for col in xrange(out_shape[1]):
+                    out[row, col, ...] = kern[row, col] * \
+                        inp[row:row + out_shape[2], col:col + out_shape[3]]
+        elif direction == "backprop inputs":
+            for row in xrange(kern.shape[0]):
+                for col in xrange(kern.shape[1]):
+                    out[row:row + kern.shape[2], col:col + kern.shape[3]] += inp[row, col] * \
+                        kern[row, col, ::-1, ::-1]
+        else:
+            raise ValueError("unshared2d: invalid value '{}' for 'direction'".format(direction))
+        return out
 class AbstractConv(BaseAbstractConv):
    """ Abstract Op for the forward convolution.
@@ -1854,14 +1953,16 @@ class AbstractConv(BaseAbstractConv):
                 subsample=None,
                 filter_flip=True,
                 filter_dilation=None,
-                 num_groups=1):
+                 num_groups=1,
+                 unshared=False):
        super(AbstractConv, self).__init__(convdim=convdim,
                                           imshp=imshp, kshp=kshp,
                                           border_mode=border_mode,
                                           subsample=subsample,
                                           filter_flip=filter_flip,
                                           filter_dilation=filter_dilation,
-                                           num_groups=num_groups)
+                                           num_groups=num_groups,
+                                           unshared=unshared)
    def make_node(self, img, kern):
        # Make sure both inputs are Variables with the same Type
@@ -1875,8 +1976,14 @@ class AbstractConv(BaseAbstractConv):
        if img.type.ndim != 2 + self.convdim:
            raise TypeError('img must be %dD tensor' % (2 + self.convdim))
-        if kern.type.ndim != 2 + self.convdim:
-            raise TypeError('kern must be %dD tensor' % (2 + self.convdim))
+        if self.unshared:
+            if kern.type.ndim != 2 + 2 * self.convdim:
+                raise TypeError('kern must be %dD tensor for unshared convolution'
+                                % (2 + 2 * self.convdim))
+        else:
+            if kern.type.ndim != 2 + self.convdim:
+                raise TypeError('kern must be %dD tensor' % (2 + self.convdim))
        img = assert_shape(img, self.imshp,
                           'AbstractConv shape mismatch: shape of '
@@ -1894,8 +2001,12 @@ class AbstractConv(BaseAbstractConv):
        img, kern = inp
        img = np.asarray(img)
        kern = np.asarray(kern)
-        dil_kernshp = tuple((kern.shape[2 + i] - 1) * self.filter_dilation[i] + 1
+        dil_kernshp = tuple((kern.shape[-self.convdim + i] - 1) * self.filter_dilation[i] + 1
                            for i in range(self.convdim))
+        if self.unshared and self.convdim != 2:
+            raise NotImplementedError('Unshared convolution not implemented for %dD'
+                                      % self.convdim)
        o, = out_
        mode = self.border_mode
@@ -1922,8 +2033,34 @@ class AbstractConv(BaseAbstractConv):
                          for i in range(self.convdim))] = img
            img = new_img
        if not self.filter_flip:
-            kern = kern[(slice(None), slice(None)) + (slice(None, None, -1),) * self.convdim]
+            kern = kern[(slice(None),) * (kern.ndim - self.convdim) + (slice(None, None, -1),) * self.convdim]
-        conv_out = self.conv(img, kern, mode="valid", dilation=self.filter_dilation, num_groups=self.num_groups)
+        if self.unshared:
+            out_shape = get_conv_output_shape(img.shape, kern.shape,
+                                              mode, self.subsample, self.filter_dilation)
+            if kern.shape[1:1 + self.convdim] != out_shape[2:2 + self.convdim]:
+                raise ValueError('Kernel shape {} does not match '
+                                 'computed output size {}'.format(kern.shape[1:1 + self.convdim],
+                                                                  out_shape[2:2 + self.convdim]))
+            if any(self.subsample[i] > 1 for i in range(self.convdim)):
+                # Expand regions in kernel to correct for subsampling
+                out_shape = get_conv_output_shape(img.shape, kern.shape,
+                                                  mode, (1,) * self.convdim, self.filter_dilation)
+                exp_kern_shp = kern.shape[:1] + out_shape[2:2 + self.convdim] + \
+                    kern.shape[1 + self.convdim:]
+                exp_kern = np.zeros(exp_kern_shp, dtype=kern.dtype)
+                exp_kern[(slice(None),) +
+                         tuple(slice(None, None, self.subsample[i]) for i in range(self.convdim)) +
+                         (slice(None),) * (self.convdim + 1)] = kern
+                kern = exp_kern
+            # from (nFilters, out_rows, out_cols, nChannels, kH, kW)
+            # to (nFilters, nChannels, out_rows, out_cols, kH, kW)
+            axes_order = (0, 1 + self.convdim,) + tuple(range(1, 1 + self.convdim)) + \
+                tuple(range(2 + self.convdim, kern.ndim))
+            kern = kern.transpose(axes_order)
+        conv_out = self.conv(img, kern, mode="valid", dilation=self.filter_dilation, num_groups=self.num_groups,
+                             unshared=self.unshared)
        conv_out = conv_out[(slice(None), slice(None)) +
                            tuple(slice(None, None, self.subsample[i])
                                  for i in range(self.convdim))]
@@ -1934,6 +2071,8 @@ class AbstractConv(BaseAbstractConv):
        if self.num_groups > 1:
            raise NotImplementedError(
                'Rop not implemented for grouped convolutions')
+        if self.unshared:
+            raise NotImplementedError('Rop not implemented for unshared convolution')
        rval = None
        if eval_points[0] is not None:
            rval = self.make_node(eval_points[0], inputs[1]).outputs[0]
@@ -1953,8 +2092,12 @@ class AbstractConv(BaseAbstractConv):
            imshp = [imshp[i] if self.imshp[i] is None else self.imshp[i]
                     for i in range(2 + self.convdim)]
        if self.kshp is not None:
-            kshp = [kshp[i] if self.kshp[i] is None else self.kshp[i]
+            if self.unshared:
-                    for i in range(2 + self.convdim)]
+                kshp = [kshp[i] if self.kshp[i] is None else self.kshp[i]
+                        for i in range(2 + 2 * self.convdim)]
+            else:
+                kshp = [kshp[i] if self.kshp[i] is None else self.kshp[i]
+                        for i in range(2 + self.convdim)]
        res = get_conv_output_shape(imshp, kshp, self.border_mode,
                                    self.subsample, self.filter_dilation)
        return [res]
@@ -1973,14 +2116,16 @@ class AbstractConv2d(AbstractConv):
                 subsample=(1, 1),
                 filter_flip=True,
                 filter_dilation=(1, 1),
-                 num_groups=1):
+                 num_groups=1,
+                 unshared=False):
        super(AbstractConv2d, self).__init__(convdim=2,
                                             imshp=imshp, kshp=kshp,
                                             border_mode=border_mode,
                                             subsample=subsample,
                                             filter_flip=filter_flip,
                                             filter_dilation=filter_dilation,
-                                             num_groups=num_groups)
+                                             num_groups=num_groups,
+                                             unshared=unshared)
    def grad(self, inp, grads):
        bottom, weights = inp
@@ -1991,14 +2136,16 @@ class AbstractConv2d(AbstractConv):
                                             self.subsample,
                                             self.filter_flip,
                                             self.filter_dilation,
-                                             num_groups=self.num_groups)(
+                                             num_groups=self.num_groups,
+                                             unshared=self.unshared)(
            weights, top, bottom.shape[-2:], add_assert_shape=False)
        d_weights = AbstractConv2d_gradWeights(self.imshp, self.kshp,
                                               self.border_mode,
                                               self.subsample,
                                               self.filter_flip,
                                               self.filter_dilation,
-                                               num_groups=self.num_groups)(
+                                               num_groups=self.num_groups,
+                                               unshared=self.unshared)(
            bottom, top, weights.shape[-2:], add_assert_shape=False)
@@ -2085,14 +2232,16 @@ class AbstractConv_gradWeights(BaseAbstractConv):
                 subsample=None,
                 filter_flip=True,
                 filter_dilation=None,
-                 num_groups=1):
+                 num_groups=1,
+                 unshared=False):
        super(AbstractConv_gradWeights, self).__init__(convdim=convdim,
                                                       imshp=imshp, kshp=kshp,
                                                       border_mode=border_mode,
                                                       subsample=subsample,
                                                       filter_flip=filter_flip,
                                                       filter_dilation=filter_dilation,
-                                                       num_groups=num_groups)
+                                                       num_groups=num_groups,
+                                                       unshared=unshared)
    # Update shape/height_width
    def make_node(self, img, topgrad, shape, add_assert_shape=True):
@@ -2115,8 +2264,12 @@ class AbstractConv_gradWeights(BaseAbstractConv):
                               'image does not match given imshp.')
        shape = as_tensor_variable(shape)
-        broadcastable = [topgrad.broadcastable[1],
+        if self.unshared:
-                         img.broadcastable[1]] + ([False] * self.convdim)
+            broadcastable = [topgrad.broadcastable[1]] + ([False] * self.convdim) + \
+                            [img.broadcastable[1]] + ([False] * self.convdim)
+        else:
+            broadcastable = [topgrad.broadcastable[1],
+                             img.broadcastable[1]] + ([False] * self.convdim)
        output = img.type.clone(broadcastable=broadcastable)()
        return Apply(self, [img, topgrad, shape], [output])
@@ -2134,6 +2287,9 @@ class AbstractConv_gradWeights(BaseAbstractConv):
                'invalid border_mode {}, which must be either '
                '"valid", "full", "half", an integer or a tuple of'
                ' integers'.format(mode))
+        if self.unshared and self.convdim != 2:
+            raise NotImplementedError('Unshared convolution not implemented for %dD'
+                                      % self.convdim)
        dil_shape = tuple((shape[i] - 1) * self.filter_dilation[i] + 1
                          for i in range(self.convdim))
@@ -2166,9 +2322,7 @@ class AbstractConv_gradWeights(BaseAbstractConv):
            topgrad = new_topgrad
        axes_order = (1, 0) + tuple(range(2, self.convdim + 2))
-        flip_filters = ((slice(None), slice(None)) +
+        topgrad = topgrad.transpose(axes_order)
-                        (slice(None, None, -1),) * self.convdim)
-        topgrad = topgrad.transpose(axes_order)[flip_filters]
        img = img.transpose(axes_order)
        def correct_for_groups(mat):
@@ -2182,15 +2336,36 @@ class AbstractConv_gradWeights(BaseAbstractConv):
        if self.num_groups > 1:
            img = correct_for_groups(img)
-        kern = self.conv(img, topgrad, mode="valid", num_groups=self.num_groups)
+        if self.unshared:
+            flip_kern = ((slice(None),) * (2 + self.convdim) +
+                         (slice(None, None, -1),) * self.convdim)
+            kern = self.conv(img, topgrad, mode="valid", num_groups=self.num_groups,
+                             unshared=True, direction="backprop weights")
+            if any(self.subsample[i] > 1 for i in range(self.convdim)):
+                sub_slice = (slice(None),) * 2 + \
+                    tuple(slice(None, None, self.subsample[i]) for i in range(0, self.convdim)) + \
+                    (slice(None),) * self.convdim
+                kern = kern[sub_slice]
+            # from (nChannels, nFilters, out_rows, out_cols, kH, kW)
+            # to (nFilters, out_rows, out_cols, nChannels, kH, kW)
+            kern_axes = (1,) + tuple(range(2, self.convdim + 2)) + (0,) + \
+                tuple(range(self.convdim + 2, kern.ndim))
+        else:
+            flip_topgrad = flip_kern = ((slice(None), slice(None)) +
+                                        (slice(None, None, -1),) * self.convdim)
+            topgrad = topgrad[flip_topgrad]
+            kern = self.conv(img, topgrad, mode="valid", num_groups=self.num_groups)
+            kern_axes = (1, 0) + tuple(range(2, self.convdim + 2))
+        kern = kern.transpose(kern_axes)
        if any(self.filter_dilation[i] > 1 for i in range(self.convdim)):
-            kern = kern[(slice(None), slice(None)) +
+            kern = kern[(slice(None),) * (kern.ndim - self.convdim) +
                        tuple(slice(None, None, self.filter_dilation[i])
                              for i in range(self.convdim))]
        if self.filter_flip:
-            kern = kern.transpose(axes_order)[flip_filters]
+            kern = kern[flip_kern]
-        else:
-            kern = kern.transpose(axes_order)
        o[0] = node.outputs[0].type.filter(kern)
    def connection_pattern(self, node):
@@ -2203,15 +2378,24 @@ class AbstractConv_gradWeights(BaseAbstractConv):
        # from the shapes of inputs.
        imshp = input_shapes[0]
        topshp = input_shapes[1]
-        kshp = self.kshp[:] if self.kshp is not None else [None] * (2 + self.convdim)
-        if self.num_groups > 1:
+        if self.kshp:
-            fallback_kshp = ([topshp[1], imshp[1] // self.num_groups] +
+            kshp = self.kshp
+        else:
+            if self.unshared:
+                kshp = [None] * (2 + 2 * self.convdim)
+            else:
+                kshp = [None] * (2 + self.convdim)
+        if self.unshared:
+            fallback_kshp = ([topshp[1], topshp[2], topshp[3], imshp[1] // self.num_groups] +
                             [node.inputs[2][i] for i in range(self.convdim)])
+            kshp = [fallback_kshp[i] if kshp[i] is None else kshp[i]
+                    for i in range(2 + 2 * self.convdim)]
        else:
-            fallback_kshp = ([topshp[1], imshp[1]] +
+            fallback_kshp = ([topshp[1], imshp[1] // self.num_groups] +
                             [node.inputs[2][i] for i in range(self.convdim)])
-        kshp = [fallback_kshp[i] if kshp[i] is None else kshp[i]
+            kshp = [fallback_kshp[i] if kshp[i] is None else kshp[i]
-                for i in range(2 + self.convdim)]
+                    for i in range(2 + self.convdim)]
        return [kshp]
@@ -2232,14 +2416,16 @@ class AbstractConv2d_gradWeights(AbstractConv_gradWeights):
                 subsample=(1, 1),
                 filter_flip=True,
                 filter_dilation=(1, 1),
-                 num_groups=1):
+                 num_groups=1,
+                 unshared=False):
        super(AbstractConv2d_gradWeights, self).__init__(convdim=2,
                                                         imshp=imshp, kshp=kshp,
                                                         border_mode=border_mode,
                                                         subsample=subsample,
                                                         filter_flip=filter_flip,
                                                         filter_dilation=filter_dilation,
-                                                         num_groups=num_groups)
+                                                         num_groups=num_groups,
+                                                         unshared=unshared)
    def grad(self, inp, grads):
        bottom, top = inp[:2]
@@ -2249,16 +2435,18 @@ class AbstractConv2d_gradWeights(AbstractConv_gradWeights):
                                             self.subsample,
                                             self.filter_flip,
                                             self.filter_dilation,
-                                             self.num_groups)(weights,
+                                             self.num_groups,
-                                                              top,
+                                             self.unshared)(weights,
-                                                              bottom.shape[-2:])
+                                                            top,
+                                                            bottom.shape[-2:])
        d_top = AbstractConv2d(self.imshp,
                               self.kshp,
                               self.border_mode,
                               self.subsample,
                               self.filter_flip,
                               self.filter_dilation,
-                               self.num_groups)(bottom, weights)
+                               self.num_groups,
+                               self.unshared)(bottom, weights)
        # Make sure that the broadcastable pattern of the inputs is used
        # for the gradients, even if the grad opts are not able to infer
        # that the dimensions are broadcastable.
@@ -2350,14 +2538,16 @@ class AbstractConv_gradInputs(BaseAbstractConv):
                 subsample=None,
                 filter_flip=True,
                 filter_dilation=None,
-                 num_groups=1):
+                 num_groups=1,
+                 unshared=False):
        super(AbstractConv_gradInputs, self).__init__(convdim=convdim,
                                                      imshp=imshp, kshp=kshp,
                                                      border_mode=border_mode,
                                                      subsample=subsample,
                                                      filter_flip=filter_flip,
                                                      filter_dilation=filter_dilation,
-                                                      num_groups=num_groups)
+                                                      num_groups=num_groups,
+                                                      unshared=unshared)
    # Update shape/height_width
    def make_node(self, kern, topgrad, shape, add_assert_shape=True):
@@ -2370,10 +2560,19 @@ class AbstractConv_gradInputs(BaseAbstractConv):
                                broadcastable=topgrad.broadcastable)
        topgrad = gtype.filter_variable(topgrad)
-        if kern.type.ndim != 2 + self.convdim:
+        if self.unshared:
-            raise TypeError('kern must be %dD tensor' % (2 + self.convdim))
+            if self.convdim != 2:
+                raise NotImplementedError('Unshared convolution not implemented for %dD'
+                                          % self.convdim)
+            elif kern.type.ndim != 2 + 2 * self.convdim:
+                raise TypeError('kern must be %dD tensor for unshared convolution'
+                                % (2 + 2 * self.convdim))
+        else:
+            if kern.type.ndim != 2 + self.convdim:
+                raise TypeError('kern must be %dD tensor' % (2 + self.convdim))
        if topgrad.type.ndim != 2 + self.convdim:
-            raise TypeError('topgrad must be %dD tensor' % (2 + self.convdim))
+            raise TypeError('topgrad must be %dD tensor' % (2 + self.convdim))
        if add_assert_shape:
            kern = assert_shape(kern, self.kshp,
@@ -2386,7 +2585,7 @@ class AbstractConv_gradInputs(BaseAbstractConv):
                             False] + ([False] * self.convdim)
        else:
            broadcastable = [topgrad.type.broadcastable[0],
-                             kern.type.broadcastable[1]] + ([False] * self.convdim)
+                             kern.type.broadcastable[-self.convdim - 1]] + ([False] * self.convdim)
        output = kern.type.clone(broadcastable=broadcastable)()
        return Apply(self, [kern, topgrad, shape], [output])
@@ -2403,9 +2602,12 @@ class AbstractConv_gradInputs(BaseAbstractConv):
                'invalid border_mode {}, which must be either '
                '"valid", "full", "half", an integer or a tuple of'
                ' integers'.format(mode))
+        if self.unshared and self.convdim != 2:
+            raise NotImplementedError('Unshared convolution not implemented for %dD'
+                                      % self.convdim)
        imshp = self.imshp[:] if self.imshp is not None else [None] * (2 + self.convdim)
-        fallback_imshp = ([topgrad.shape[0], kern.shape[1]] +
+        fallback_imshp = ([topgrad.shape[0], kern.shape[-self.convdim - 1]] +
                          [shape[i] for i in range(self.convdim)])
        imshp = [fallback_imshp[i] if imshp[i] is None else imshp[i]
                 for i in range(2 + self.convdim)]
@@ -2419,8 +2621,9 @@ class AbstractConv_gradInputs(BaseAbstractConv):
                'has shape {}'.format(tuple(expected_topgrad_shape),
                                      tuple(topgrad.shape)))
-        dil_kernshp = tuple((kern.shape[i + 2] - 1) * self.filter_dilation[i] + 1
+        dil_kernshp = tuple((kern.shape[-self.convdim + i] - 1) * self.filter_dilation[i] + 1
                            for i in range(self.convdim))
        pad = (0,) * self.convdim
        if mode == "full":
            pad = tuple(dil_kernshp[i] - 1 for i in range(self.convdim))
@@ -2438,25 +2641,54 @@ class AbstractConv_gradInputs(BaseAbstractConv):
                              for i in range(self.convdim))] = topgrad
            topgrad = new_topgrad
-        axes_order = (1, 0) + tuple(range(2, self.convdim + 2))
+            if self.unshared:
-        flip_filters = ((slice(None), slice(None)) +
+                # Expand regions in kernel to correct for subsampling
-                        (slice(None, None, -1),) * self.convdim)
+                exp_kern_shp = kern.shape[:1] + topgrad.shape[2:] + kern.shape[1 + self.convdim:]
+                exp_kern = np.zeros(exp_kern_shp, dtype=kern.dtype)
+                exp_kern[(slice(None),) +
+                         tuple(slice(None, None, self.subsample[i]) for i in range(self.convdim)) +
+                         (slice(None),) * (self.convdim + 1)] = kern
+                kern = exp_kern
        def correct_for_groups(mat):
            mshp0 = mat.shape[0] // self.num_groups
-            mshp1 = mat.shape[1] * self.num_groups
+            mshp1 = mat.shape[-self.convdim - 1] * self.num_groups
            mat = mat.reshape((self.num_groups, mshp0) + mat.shape[1:])
-            mat = mat.transpose((1, 0, 2) + tuple(range(3, 3 + self.convdim)))
+            if self.unshared:
-            mat = mat.reshape((mshp0, mshp1) + mat.shape[-self.convdim:])
+                # for 2D -> (1, 2, 3, 0, 4, 5, 6)
+                mat = mat.transpose(tuple(range(1, 2 + self.convdim)) + (0,) +
+                                    tuple(range(2 + self.convdim, mat.ndim)))
+                mat = mat.reshape((mshp0,) + mat.shape[1:1 + self.convdim] + (mshp1,) + mat.shape[-self.convdim:])
+            else:
+                mat = mat.transpose((1, 0, 2) + tuple(range(3, 3 + self.convdim)))
+                mat = mat.reshape((mshp0, mshp1) + mat.shape[-self.convdim:])
            return mat
        kern = correct_for_groups(kern)
-        kern = kern.transpose(axes_order)
-        if self.filter_flip:
+        if self.unshared:
-            topgrad = topgrad[flip_filters]
+            # from (nFilters, out_rows, out_cols, nChannels, kH, kW)
-        img = self.conv(topgrad, kern, mode="full", dilation=self.filter_dilation, num_groups=self.num_groups)
+            # to (nChannels, nFilters, out_rows, out_cols, kH, kW)
-        if self.filter_flip:
+            axes_order = (1 + self.convdim, 0,) + tuple(range(1, 1 + self.convdim)) + \
-            img = img[flip_filters]
+                tuple(range(2 + self.convdim, kern.ndim))
+            kern = kern.transpose(axes_order)
+            if not self.filter_flip:
+                kern = kern[(slice(None),) * (kern.ndim - self.convdim) +
+                            (slice(None, None, -1),) * self.convdim]
+            img = self.conv(topgrad, kern, mode="full", dilation=self.filter_dilation,
+                            num_groups=self.num_groups, unshared=True, direction="backprop inputs")
+        else:
+            axes_order = (1, 0) + tuple(range(2, 2 + self.convdim))
+            kern = kern.transpose(axes_order)
+            flip_filters = ((slice(None), slice(None)) +
+                            (slice(None, None, -1),) * self.convdim)
+            if self.filter_flip:
+                topgrad = topgrad[flip_filters]
+            img = self.conv(topgrad, kern, mode="full", dilation=self.filter_dilation,
+                            num_groups=self.num_groups)
+            if self.filter_flip:
+                img = img[flip_filters]
        if any(p > 0 for p in pad):
            img = img[(slice(None), slice(None)) +
                      tuple(slice(pad[i], img.shape[i + 2] - pad[i])
@@ -2475,10 +2707,10 @@ class AbstractConv_gradInputs(BaseAbstractConv):
        topshp = input_shapes[1]
        imshp = self.imshp[:] if self.imshp is not None else [None] * (2 + self.convdim)
        if self.num_groups > 1:
-            fallback_imshp = ([topshp[0], kshp[1] * self.num_groups] +
+            fallback_imshp = ([topshp[0], kshp[-self.convdim - 1] * self.num_groups] +
                              [node.inputs[2][i] for i in range(self.convdim)])
        else:
-            fallback_imshp = ([topshp[0], kshp[1]] +
+            fallback_imshp = ([topshp[0], kshp[-self.convdim - 1]] +
                              [node.inputs[2][i] for i in range(self.convdim)])
        imshp = [fallback_imshp[i] if imshp[i] is None else imshp[i]
                 for i in range(2 + self.convdim)]
@@ -2503,14 +2735,16 @@ class AbstractConv2d_gradInputs(AbstractConv_gradInputs):
                 subsample=(1, 1),
                 filter_flip=True,
                 filter_dilation=(1, 1),
-                 num_groups=1):
+                 num_groups=1,
+                 unshared=False):
        super(AbstractConv2d_gradInputs, self).__init__(convdim=2,
                                                        imshp=imshp, kshp=kshp,
                                                        border_mode=border_mode,
                                                        subsample=subsample,
                                                        filter_flip=filter_flip,
                                                        filter_dilation=filter_dilation,
-                                                        num_groups=num_groups)
+                                                        num_groups=num_groups,
+                                                        unshared=unshared)
    def grad(self, inp, grads):
        weights, top = inp[:2]
@@ -2520,7 +2754,8 @@ class AbstractConv2d_gradInputs(AbstractConv_gradInputs):
                                               self.subsample,
                                               self.filter_flip,
                                               self.filter_dilation,
-                                               self.num_groups)(
+                                               self.num_groups,
+                                               self.unshared)(
                                                   bottom, top,
                                                   weights.shape[-2:])
        d_top = AbstractConv2d(self.imshp, self.kshp,
@@ -2528,7 +2763,8 @@ class AbstractConv2d_gradInputs(AbstractConv_gradInputs):
                               self.subsample,
                               self.filter_flip,
                               self.filter_dilation,
-                               self.num_groups)(bottom, weights)
+                               self.num_groups,
+                               self.unshared)(bottom, weights)
        # Make sure that the broadcastable pattern of the inputs is used
        # for the gradients, even if the grad opts are not able to infer
        # that the dimensions are broadcastable.

--- a/theano/tensor/nnet/c_code/corr_gemm.c
+++ b/theano/tensor/nnet/c_code/corr_gemm.c
@@ -107,7 +107,8 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
                      const int dilW = 1,
                      const int padH = 0,
                      const int padW = 0,
-                      const int numgroups = 1)
+                      const int numgroups = 1,
+                      const int unshared = 0)
 {
    if (PyArray_NDIM(bottom) != 4)
    {
@@ -120,9 +121,9 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
        return NULL;
    }
-    if (PyArray_NDIM(weight) != 4)
+    if (PyArray_NDIM(weight) != (unshared ? 6 : 4))
    {
-        PyErr_SetString(PyExc_ValueError, "CorrMM requires weight of 4D");
+        PyErr_Format(PyExc_ValueError, "CorrMM requires weight of %%dD", unshared ? 6 : 4);
        return NULL;
    }
    if (PyArray_TYPE(weight) != %(float_typenum)s)
@@ -152,11 +153,12 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
    const int nChannels = PyArray_DIMS(bottom)[1];
    const int bottomHeight = PyArray_DIMS(bottom)[2];
    const int bottomWidth = PyArray_DIMS(bottom)[3];
-    // weights: (nFilters, nChannels, rows, columns)
+    // normal weights: (nFilters, nChannels, rows, columns)
+    // unshared weights: (nFilters, topHeight, topWidth, nChannels, rows, columns)
    const int nFilters = PyArray_DIMS(weight)[0];
-    const int kH = PyArray_DIMS(weight)[2];
+    const int kH = PyArray_DIMS(weight)[unshared ? 4 : 2];
-    const int kW = PyArray_DIMS(weight)[3];
+    const int kW = PyArray_DIMS(weight)[unshared ? 5 : 3];
-    if (nChannels != (PyArray_DIMS(weight)[1] * numgroups)) {
+    if (nChannels != PyArray_DIMS(weight)[unshared ? 3 : 1] * numgroups) {
        PyErr_SetString(PyExc_ValueError,
                "CorrMM images and kernel must have the same stack size\n");
        return NULL;
@@ -179,22 +181,57 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
    const int topHeight = _CONV_FLOORDIV_X(topHeightNoDH, dH) + 1;
    const int topWidth  = _CONV_FLOORDIV_X(topWidthNoDW, dW) + 1;
 #undef _CONV_FLOORDIV
-    if (batchSize != PyArray_DIMS(top)[0] ||
+    if (unshared) {
-            nFilters != PyArray_DIMS(top)[1] ||
+        if (topHeight != PyArray_DIMS(weight)[1] ||
-            topHeight != PyArray_DIMS(top)[2] ||
+                topWidth != PyArray_DIMS(weight)[2]) {
-            topWidth != PyArray_DIMS(top)[3]) {
+            PyErr_Format(PyExc_ValueError,
-        PyErr_Format(PyExc_ValueError,
+                    "CorrMM regions in kernel must match output regions:\n"
-                "CorrMM shape inconsistency:\n"
+                    "  bottom shape: %%d %%d %%d %%d\n"
-                "  bottom shape: %%d %%d %%d %%d\n"
+                    "  weight shape: %%d %%ld %%ld %%d %%d %%d"
-                "  weight shape: %%d %%d %%d %%d\n"
+                    " (expected %%d %%d %%d %%d %%d %%d)\n"
-                "  top shape: %%ld %%ld %%ld %%ld (expected %%d %%d %%d %%d)\n",
+                    "  top shape(calculated): %%d %%d %%d %%d\n",
-                batchSize, nChannels, bottomHeight, bottomWidth,
+                    batchSize, nChannels, bottomHeight, bottomWidth,
-                nFilters, nChannels / numgroups, kH, kW,
+                    nFilters, PyArray_DIMS(weight)[1],
-                PyArray_DIMS(top)[0], PyArray_DIMS(top)[1],
+                    PyArray_DIMS(weight)[2], nChannels / numgroups, kH, kW,
-                PyArray_DIMS(top)[2], PyArray_DIMS(top)[3],
+                    nFilters, topHeight, topWidth, nChannels / numgroups, kH, kW,
-                batchSize, nFilters, topHeight, topWidth);
+                    batchSize, nFilters, topHeight, topWidth);
-        return NULL;
+            return NULL;
+        }
+        if (batchSize != PyArray_DIMS(top)[0] ||
+                nFilters != PyArray_DIMS(top)[1] ||
+                topHeight != PyArray_DIMS(top)[2] ||
+                topWidth != PyArray_DIMS(top)[3]) {
+            PyErr_Format(PyExc_ValueError,
+                    "CorrMM shape inconsistency:\n"
+                    "  bottom shape: %%d %%d %%d %%d\n"
+                    "  weight shape: %%d %%d %%d %%d %%d %%d\n"
+                    "  top shape: %%ld %%ld %%ld %%ld (expected %%d %%d %%d %%d)\n",
+                    batchSize, nChannels, bottomHeight, bottomWidth,
+                    nFilters, topHeight, topWidth, nChannels / numgroups, kH, kW,
+                    PyArray_DIMS(top)[0], PyArray_DIMS(top)[1],
+                    PyArray_DIMS(top)[2], PyArray_DIMS(top)[3],
+                    batchSize, nFilters, topHeight, topWidth);
+            return NULL;
+        }
    }
+    else {
+        if (batchSize != PyArray_DIMS(top)[0] ||
+                nFilters != PyArray_DIMS(top)[1] ||
+                topHeight != PyArray_DIMS(top)[2] ||
+                topWidth != PyArray_DIMS(top)[3]) {
+            PyErr_Format(PyExc_ValueError,
+                    "CorrMM shape inconsistency:\n"
+                    "  bottom shape: %%d %%d %%d %%d\n"
+                    "  weight shape: %%d %%d %%d %%d\n"
+                    "  top shape: %%ld %%ld %%ld %%ld (expected %%d %%d %%d %%d)\n",
+                    batchSize, nChannels, bottomHeight, bottomWidth,
+                    nFilters, nChannels / numgroups, kH, kW,
+                    PyArray_DIMS(top)[0], PyArray_DIMS(top)[1],
+                    PyArray_DIMS(top)[2], PyArray_DIMS(top)[3],
+                    batchSize, nFilters, topHeight, topWidth);
+            return NULL;
+        }
+    }        
    // Create temporary columns
    int max_threads = %(omp_get_max_threads)s;
@@ -230,8 +267,10 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
    const int group_col_stride = (K_ * N_);
    const int group_weight_stride = (PyArray_STRIDES(weight)[0] * nFilters / numgroups)/%(n_bytes)f;
    const int M_ = nFilters / numgroups;
+    const int one_int = 1;
    const %(c_float_type)s one = 1.0;
    const %(c_float_type)s zero = 0.0;
+    const int ldw = (K_ * N_);
    char NTrans = 'N';
    char Trans = 'T';
    PyArrayObject *output;
@@ -266,15 +305,30 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
            im2col((%(float_type)s*)PyArray_DATA(bottom) + n * batch_bottom_stride, nChannels,
                   bottomHeight,bottomWidth, kH, kW, dilH, dilW, padH, padW, dH, dW,
                   (%(float_type)s*)PyArray_DATA(col)+ tid * col_stride);
-            for ( int g = 0; g < numgroups; ++g){
+            // Second, gemm (the unshared branch issues one gemv per region instead)
-                // Second, gemm
+            if (unshared) {
-                %(gemm)s(&NTrans, &NTrans,
+                for (int g = 0; g < numgroups; ++g) {
-                       &N_, &M_, &K_,
+                    for (int reg = 0; reg < N_; ++reg) {
-                       &one,
+                        %(gemv)s(&Trans, &K_, &M_,
-                       (%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride, &N_,
+                                &one,
-                       (%(float_type)s*)PyArray_DATA(weight) + g * group_weight_stride, &K_,
+                                (%(float_type)s*)PyArray_DATA(weight) + g * group_weight_stride + reg * K_, &ldw,
-                       &zero,
+                                (%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride + reg, &N_,
-                       (%(float_type)s*)PyArray_DATA(top) + n * batch_top_stride + g * group_top_stride, &N_);
+                                &zero,
+                                (%(float_type)s*)PyArray_DATA(top) + n * batch_top_stride + g * group_top_stride + reg, &N_);
+                    }
+                }
+            }
+            else {
+                for ( int g = 0; g < numgroups; ++g){
+                    // Second, gemm
+                    %(gemm)s(&NTrans, &NTrans,
+                           &N_, &M_, &K_,
+                           &one,
+                           (%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride, &N_,
+                           (%(float_type)s*)PyArray_DATA(weight) + g * group_weight_stride, &K_,
+                           &zero,
+                           (%(float_type)s*)PyArray_DATA(top) + n * batch_top_stride + g * group_top_stride, &N_);  
+                }
            }
        }
        // Restore to previous blas threads
@@ -316,7 +370,10 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
        output = weight;
        npy_intp weight_dim[2];
        weight_dim[0] = (npy_intp)max_threads;
-        weight_dim[1] = (npy_intp)(M_ * K_ * numgroups);
+        if (unshared)
+            weight_dim[1] = (npy_intp)(M_ * N_ * K_ * numgroups);            
+        else
+            weight_dim[1] = (npy_intp)(M_ * K_ * numgroups);
        PyArrayObject* local_weight = (PyArrayObject*)PyArray_ZEROS(2,
                                   weight_dim, PyArray_TYPE(weight), 0);
@@ -341,26 +398,46 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
            im2col((%(float_type)s*)PyArray_DATA(bottom) + n * batch_bottom_stride,
                   nChannels, bottomHeight,bottomWidth, kH, kW, dilH, dilW, padH, padW, dH, dW,
                   (%(float_type)s*)PyArray_DATA(col)+ tid * col_stride);
-            for(int g = 0; g < numgroups; ++g){
+            // Second, gemm
-                // Second, gemm
+            // Note that we accumulate into weight. We do so by setting beta = 0
-                // Note that we accumulate into weight. We do so by setting beta = 0
+            // for the first iteration and beta = 1 for subsequent ones. (This
-                // for the first iteration and beta = 1 for subsequent ones. (This
+            // is faster than setting weight to all zeros before the loop.)
-                // is faster than setting weight to all zeros before the loop.)
+            if (unshared) {
-                %(gemm)s(&Trans, &NTrans,
+                for (int g = 0; g < numgroups; ++g) {
-                       &K_, &M_, &N_,
+                    for (int reg = 0; reg < N_; ++reg) {
-                       &one,
+                        %(gemm)s(&Trans, &NTrans,
-                       (%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride, &N_,
+                               &K_, &M_, &one_int,
-                       (%(float_type)s*)PyArray_DATA(top) + g * group_top_stride  + n * batch_top_stride, &N_,
+                               &one,
-                       (n == 0) ? &zero : &one,
+                               (%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride + reg, &N_,
-                       (%(float_type)s*)PyArray_DATA(local_weight) + g * group_weight_stride + 
+                               (%(float_type)s*)PyArray_DATA(top) + g * group_top_stride + n * batch_top_stride + reg, &N_,
-                       tid * weight_dim[1], &K_);
+                               (n == 0) ? &zero : &one,
+                               (%(float_type)s*)PyArray_DATA(local_weight) + g * group_weight_stride + reg * K_ +
+                               tid * weight_dim[1], &ldw);
+                    }
+                }
+            }
+            else {
+                for(int g = 0; g < numgroups; ++g){
+                    // Second, gemm
+                    // Note that we accumulate into weight. We do so by setting beta = 0
+                    // for the first iteration and beta = 1 for subsequent ones. (This
+                    // is faster than setting weight to all zeros before the loop.)
+                    %(gemm)s(&Trans, &NTrans,
+                           &K_, &M_, &N_,
+                           &one,
+                           (%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride, &N_,
+                           (%(float_type)s*)PyArray_DATA(top) + g * group_top_stride  + n * batch_top_stride, &N_,
+                           (n == 0) ? &zero : &one,
+                           (%(float_type)s*)PyArray_DATA(local_weight) + g * group_weight_stride + 
+                           tid * weight_dim[1], &K_);
+                }
            }
        }
        // Restore to previous blas threads
        %(blas_set_num_threads)s(blas_threads_saved);
        //aggregate weights
-        memset((%(float_type)s*)PyArray_DATA(weight), 0, M_ * K_*sizeof(%(float_type)s));
+        memset((%(float_type)s*)PyArray_DATA(weight), 0, weight_dim[1]*sizeof(%(float_type)s));
        /*
         * Put index "j" into outer loop to get the
         * correct result when openmp is used.
@@ -416,15 +493,29 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
        %(omp_flags)s
        for (int n = 0; n < batchSize; ++n) {
            int tid = %(omp_get_thread_num)s;
-            for ( int g = 0;g < numgroups; ++g){
+            if (unshared) {
-                // gemm into columns
+                for (int g = 0; g < numgroups; ++g){
-                %(gemm)s(&NTrans, &Trans,
+                    for (int reg = 0; reg < N_; ++reg){
-                       &N_, &K_, &M_,
+                        %(gemm)s(&NTrans, &Trans,
-                       &one,
+                               &one_int, &K_, &M_,
-                       (%(float_type)s*)PyArray_DATA(top) + g * group_top_stride + n * batch_top_stride, &N_,
+                               &one,
-                       (%(float_type)s*)PyArray_DATA(weight) + g * group_weight_stride, &K_,
+                               (%(float_type)s*)PyArray_DATA(top) + g * group_top_stride + n * batch_top_stride + reg, &N_,
-                       &zero,
+                               (%(float_type)s*)PyArray_DATA(weight) + g * group_weight_stride + reg * K_, &ldw,
-                       (%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride, &N_);
+                               &zero,
+                               (%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride + reg, &N_);
+                    }
+                }
+            }
+            else {
+                for (int g = 0; g < numgroups; ++g) {
+                    %(gemm)s(&NTrans, &Trans,
+                           &N_, &K_, &M_,
+                           &one,
+                           (%(float_type)s*)PyArray_DATA(top) + g * group_top_stride + n * batch_top_stride, &N_,
+                           (%(float_type)s*)PyArray_DATA(weight) + g * group_weight_stride, &K_,
+                           &zero,
+                           (%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride, &N_);
+                }
            }
            // col2im back to the data
            col2im((%(float_type)s*)PyArray_DATA(col) + tid * col_stride, nChannels, bottomHeight, bottomWidth,
@@ -474,4 +565,3 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
    // in here output is just aliased to one of bottom, weights, or top.
    return output;
 }
--- a/theano/tensor/nnet/corr.py
+++ b/theano/tensor/nnet/corr.py
@@ -8,7 +8,7 @@ import theano
 from theano import Apply
 from theano import gof
 from theano.gof import ParamsType, EnumList
-from theano.scalar import int64
+from theano.scalar import int64, int8
 from theano.tensor import as_tensor_variable, TensorType
 from theano.tensor.nnet.abstract_conv import get_conv_output_shape
 from theano.tensor import blas_headers
@@ -42,9 +42,11 @@ class BaseCorrMM(gof.OpenMPOp):
        Perform dilated correlation (default: (1,1))
    num_groups
        Perform grouped convolutions (default: 1)
+    unshared
+        Perform unshared correlation (default: False)
    """
    check_broadcast = False
-    __props__ = ('border_mode', 'subsample', 'filter_dilation', 'num_groups')
+    __props__ = ('border_mode', 'subsample', 'filter_dilation', 'num_groups', 'unshared')
    _direction = None
@@ -54,10 +56,10 @@ class BaseCorrMM(gof.OpenMPOp):
                             dH=int64, dW=int64,
                             dilH=int64, dilW=int64,
                             padH=int64, padW=int64,
-                             num_groups=int64)
+                             num_groups=int64, unshared=int8)
    def __init__(self, border_mode="valid", subsample=(1, 1),
-                 filter_dilation=(1, 1), num_groups=1, openmp=None):
+                 filter_dilation=(1, 1), num_groups=1, unshared=False, openmp=None):
        super(BaseCorrMM, self).__init__(openmp=openmp)
        if isinstance(border_mode, integer_types):
            if border_mode < 0:
@@ -85,6 +87,7 @@ class BaseCorrMM(gof.OpenMPOp):
            raise ValueError("filter_dilation must have two elements")
        self.subsample = tuple(subsample)
        self.filter_dilation = tuple(filter_dilation)
+        self.unshared = unshared
        if not theano.config.blas.ldflags:
            # Theano will use a NumPy C implementation of [sd]gemm_ instead.
@@ -130,12 +133,13 @@ class BaseCorrMM(gof.OpenMPOp):
    padW = property(lambda self: self.pad[1])
    def __str__(self):
-        return '%s{%s, %s, %s, %s}' % (
+        return '%s{%s, %s, %s, %s, %s}' % (
            self.__class__.__name__,
            self.border_mode,
            str(self.subsample),
            str(self.filter_dilation),
-            str(self.num_groups))
+            str(self.num_groups),
+            str(self.unshared))  # last field is the unshared flag
    @staticmethod
    def as_common_dtype(in1, in2):
@@ -179,7 +183,7 @@ class BaseCorrMM(gof.OpenMPOp):
    def c_code_cache_version(self):
        # raise this whenever modifying any of the support_code_files
-        return (7, self.openmp, blas_header_version())
+        return (8, self.openmp, blas_header_version())
    def c_support_code_apply(self, node, nodename):
        # REMEMBER TO RAISE c_code_cache_version when changing any of
@@ -189,12 +193,14 @@ class BaseCorrMM(gof.OpenMPOp):
        assert dtype in ('float32', 'float64')
        if dtype == 'float32':
            sub['gemm'] = 'sgemm_'
+            sub['gemv'] = 'sgemv_'
            sub['float_type'] = 'npy_float'
            sub['float_typenum'] = 'NPY_FLOAT'
            sub['n_bytes'] = 4
            sub['c_float_type'] = 'float'
        else:
            sub['gemm'] = 'dgemm_'
+            sub['gemv'] = 'dgemv_'
            sub['float_type'] = 'npy_double'
            sub['float_typenum'] = 'NPY_DOUBLE'
            sub['n_bytes'] = 8
@@ -287,6 +293,7 @@ class BaseCorrMM(gof.OpenMPOp):
    int padH = %(params)s->padH;
    int padW = %(params)s->padW;
    int numgroups = %(params)s->num_groups;
+    int unshared = %(params)s->unshared;
    PyArrayObject * bottom = %(bottom)s;
    PyArrayObject * weights = %(weights)s;
@@ -310,13 +317,17 @@ class BaseCorrMM(gof.OpenMPOp):
            break;
    }
+    int wdim, odim;
+    wdim = unshared ? 6 : 4;
+    odim = 4; //Can be set to 6 later for unshared backprop wrt weights
    // Obtain or infer kernel width and height
    // (we need to know it early to be able to handle auto-padding)
    int kH, kW, dil_kH, dil_kW;
    if (direction != 1) {
        // weight is an input variable, we can just read its shape
-        kH = PyArray_DIMS(weights)[2];
+        kH = PyArray_DIMS(weights)[wdim-2];
-        kW = PyArray_DIMS(weights)[3];
+        kW = PyArray_DIMS(weights)[wdim-1];
    }
    else {
        if (%(height)s != -1) {
@@ -370,28 +381,45 @@ class BaseCorrMM(gof.OpenMPOp):
    }
    // Infer output shape
-    npy_intp out_dim[4];
+    npy_intp out_dim[6];
+    out_dim[4] = out_dim[5] = 0; //Only used for unshared backprop wrt weights
    switch(direction) {
    case 0:  // forward pass
        // output is top: (batchsize, num_filters, height, width)
        // height and width: top = (bottom + 2*pad - ((weight-1)*dil + 1)) / sample + 1
        out_dim[0] = (npy_intp)PyArray_DIMS(bottom)[0];
        out_dim[1] = (npy_intp)PyArray_DIMS(weights)[0];
-        out_dim[2] = (npy_intp)((PyArray_DIMS(bottom)[2] + 2*padH - ((PyArray_DIMS(weights)[2]-1)*dilH + 1)) / dH + 1);
+        out_dim[2] = (npy_intp)((PyArray_DIMS(bottom)[2] + 2*padH - ((PyArray_DIMS(weights)[wdim-2]-1)*dilH + 1)) / dH + 1);
-        out_dim[3] = (npy_intp)((PyArray_DIMS(bottom)[3] + 2*padW - ((PyArray_DIMS(weights)[3]-1)*dilW + 1)) / dW + 1);
+        out_dim[3] = (npy_intp)((PyArray_DIMS(bottom)[3] + 2*padW - ((PyArray_DIMS(weights)[wdim-1]-1)*dilW + 1)) / dW + 1);
        if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
        {
-            PyErr_Format(PyExc_ValueError,
+            if (unshared) {
-                         "CorrMM: impossible output shape\\n"
+                PyErr_Format(PyExc_ValueError,
-                         "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
+                             "CorrMM: impossible output shape\\n"
-                         "  weights shape: %%ld x %%ld x %%ld x %%ld\\n"
+                             "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
-                         "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
+                             "  weights shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n"
-                         (long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1],
+                             "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
-                         (long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3],
+                             (long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1],
-                         (long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1],
+                             (long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3],
-                         (long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3],
+                             (long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1],
-                         (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
+                             (long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3],
-                         (long int)out_dim[3]);
+                             (long int)PyArray_DIMS(weights)[4], (long int)PyArray_DIMS(weights)[5],
+                             (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
+                             (long int)out_dim[3]);
+            }
+            else {
+                PyErr_Format(PyExc_ValueError,
+                             "CorrMM: impossible output shape\\n"
+                             "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
+                             "  weights shape: %%ld x %%ld x %%ld x %%ld\\n"
+                             "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
+                             (long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1],
+                             (long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3],
+                             (long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1],
+                             (long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3],
+                             (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
+                             (long int)out_dim[3]);
+            }
            %(fail)s
        }
        break;
@@ -399,46 +427,90 @@ class BaseCorrMM(gof.OpenMPOp):
        // output is weights: (num_filters, num_channels, height, width)
        // height and width: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1
        out_dim[0] = (npy_intp)PyArray_DIMS(top)[1];
-        out_dim[1] = (npy_intp)PyArray_DIMS(bottom)[1] / numgroups;
+        if (unshared){
-        out_dim[2] = (npy_intp)kH;  // already inferred further above
+            odim = 6;
-        out_dim[3] = (npy_intp)kW;  // how convenient
+            out_dim[1] = (npy_intp)PyArray_DIMS(top)[2];
-        if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
+            out_dim[2] = (npy_intp)PyArray_DIMS(top)[3];
-        {
+        }
-            PyErr_Format(PyExc_ValueError,
+        out_dim[wdim-3] = (npy_intp)PyArray_DIMS(bottom)[1] / numgroups;
-                         "CorrMM backprop wrt. weights: impossible output shape\\n"
+        out_dim[wdim-2] = (npy_intp)kH;  // already inferred further above
-                         "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
+        out_dim[wdim-1] = (npy_intp)kW;  // how convenient
-                         "  weights shape: %%ld x %%ld x %%ld x %%ld\\n"
+        if (unshared) {
-                         "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
+            if (out_dim[0] < 0 || out_dim[1] <= 0 || out_dim[2] <= 0 || out_dim[3] < 0
-                         (long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1],
+                    || out_dim[4] <= 0 || out_dim[5] <= 0){
-                         (long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3],
+                PyErr_Format(PyExc_ValueError,
-                         (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
+                             "CorrMM backprop wrt. weights: impossible output shape\\n"
-                         (long int)out_dim[3],
+                             "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
-                         (long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1],
+                             "  weights shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n"
-                         (long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]);
+                             "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
+                             (long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1],
+                             (long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3],
+                             (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
+                             (long int)out_dim[3], (long int)out_dim[4], (long int)out_dim[5],
+                             (long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1],
+                             (long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]);
+                %(fail)s  // abort only when the shape check above rejected out_dim
+            }
+        }
-            %(fail)s
-        }
+        else {
+            if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
+            {
+                PyErr_Format(PyExc_ValueError,
+                             "CorrMM backprop wrt. weights: impossible output shape\\n"
+                             "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
+                             "  weights shape: %%ld x %%ld x %%ld x %%ld\\n"
+                             "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
+                             (long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1],
+                             (long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3],
+                             (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
+                             (long int)out_dim[3],
+                             (long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1],
+                             (long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]);
+                %(fail)s
+            }
+        }
        break;
    case 2:  // backprop wrt. inputs
        // output is bottom: (batchsize, num_channels, height, width)
        // height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
        out_dim[0] = (npy_intp)PyArray_DIMS(top)[0];
-        out_dim[1] = (npy_intp)PyArray_DIMS(weights)[1] * numgroups;
+        out_dim[1] = (npy_intp)PyArray_DIMS(weights)[wdim-3] * numgroups;
-        out_dim[2] = (npy_intp)((%(height)s != -1) ? %(height)s : (PyArray_DIMS(top)[2] - 1) * dH + (PyArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH);
+        out_dim[2] = (npy_intp)((%(height)s != -1) ? %(height)s : (PyArray_DIMS(top)[2] - 1) * dH + (PyArray_DIMS(weights)[wdim-2]-1)*dilH + 1 - 2*padH);
-        out_dim[3] = (npy_intp)((%(width)s != -1) ? %(width)s : (PyArray_DIMS(top)[3] - 1) * dW + (PyArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW);
+        out_dim[3] = (npy_intp)((%(width)s != -1) ? %(width)s : (PyArray_DIMS(top)[3] - 1) * dW + (PyArray_DIMS(weights)[wdim-1]-1)*dilW + 1 - 2*padW);
-        if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
+        if (unshared) {
-        {
+            if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
-            PyErr_Format(PyExc_ValueError,
+            {
-                         "CorrMM backprop wrt. inputs: impossible output shape\\n"
+                PyErr_Format(PyExc_ValueError,
-                         "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
+                             "CorrMM backprop wrt. inputs: impossible output shape\\n"
-                         "  weights shape: %%ld x %%ld x %%ld x %%ld\\n"
+                             "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
-                         "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
+                             "  weights shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n"
-                         (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
+                             "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
-                         (long int)out_dim[3],
+                             (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
-                         (long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1],
+                             (long int)out_dim[3],
-                         (long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3],
+                             (long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1],
-                         (long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1],
+                             (long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3],
-                         (long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]);
+                             (long int)PyArray_DIMS(weights)[4], (long int)PyArray_DIMS(weights)[5],
-            %(fail)s
+                             (long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1],
+                             (long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]);
+                %(fail)s
+            }
+        }
+        else {
+            if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
+            {
+                PyErr_Format(PyExc_ValueError,
+                             "CorrMM backprop wrt. inputs: impossible output shape\\n"
+                             "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
+                             "  weights shape: %%ld x %%ld x %%ld x %%ld\\n"
+                             "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
+                             (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
+                             (long int)out_dim[3],
+                             (long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1],
+                             (long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3],
+                             (long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1],
+                             (long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]);
+                %(fail)s
+            }
        }
        break;
    default:
@@ -448,13 +520,19 @@ class BaseCorrMM(gof.OpenMPOp):
    // Prepare output array
    int typenum;
-    if ( !(*out
+    int failure;
-           && PyArray_NDIM(*out)==4
+    failure = !(*out
+           && PyArray_NDIM(*out)==odim
           && PyArray_IS_C_CONTIGUOUS(*out)
           && PyArray_DIMS(*out)[0]==out_dim[0]
           && PyArray_DIMS(*out)[1]==out_dim[1]
           && PyArray_DIMS(*out)[2]==out_dim[2]
-           && PyArray_DIMS(*out)[3]==out_dim[3]))
+           && PyArray_DIMS(*out)[3]==out_dim[3]);
+    if (odim == 6){
+        failure = failure || !(PyArray_DIMS(*out)[4]==out_dim[4]
+                && PyArray_DIMS(*out)[5]==out_dim[5]);
+    }
+    if ( failure )
    {
        Py_XDECREF(*out);
        if (direction != 1) {
@@ -464,21 +542,29 @@ class BaseCorrMM(gof.OpenMPOp):
          typenum = PyArray_TYPE(bottom);
        }
        //Change to PyArray_ZEROS which is faster than PyArray_EMPTY.
-        *out = (PyArrayObject*)PyArray_ZEROS(4,
+        *out = (PyArrayObject*)PyArray_ZEROS(odim,
                                          out_dim,
                                          typenum,
                                          0);
        if (NULL == *out)
        {
-            PyErr_Format(PyExc_RuntimeError,
+            if (odim == 4) {
-                    "BaseCorrMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld",
+                PyErr_Format(PyExc_RuntimeError,
-                    (long long)out_dim[0], (long long)out_dim[1], (long long)out_dim[2], (long long)out_dim[3]);
+                        "BaseCorrMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld",
+                        (long long)out_dim[0], (long long)out_dim[1], (long long)out_dim[2], (long long)out_dim[3]);
+            }
+            if (odim == 6) {  // 6-D output: unshared backprop wrt. weights
+                PyErr_Format(PyExc_RuntimeError,
+                        "BaseCorrMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld x %%lld x %%lld",
+                        (long long)out_dim[0], (long long)out_dim[1], (long long)out_dim[2], (long long)out_dim[3],
+                        (long long)out_dim[4], (long long)out_dim[5]);
+            }
            %(fail)s
        }
    }
    // Call corrMM code
-    out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dilH, dilW, padH, padW, numgroups );
+    out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dilH, dilW, padH, padW, numgroups, unshared);
    if (out2==NULL){
       %(fail)s
    }
@@ -514,6 +600,9 @@ class CorrMM(BaseCorrMM):
        The filter dilation operation applied to each input image.
        Should be a tuple with 2 elements.
        Set to `(1, 1)` to disable filter dilation.
+    unshared:
+        Boolean value. If true, then a different kernel will be applied to
+        each region of the input image.
    """
@@ -525,8 +614,12 @@ class CorrMM(BaseCorrMM):
        img, kern = self.as_common_dtype(img, kern)
        if img.type.ndim != 4:
            raise TypeError('img must be 4D tensor')
-        if kern.type.ndim != 4:
+        if self.unshared is True:
-            raise TypeError('kern must be 4D tensor')
+            if kern.type.ndim != 6:
+                raise TypeError('kern must be 6D tensor')
+        else:
+            if kern.type.ndim != 4:
+                raise TypeError('kern must be 4D tensor')
        broadcastable = [img.type.broadcastable[0], kern.type.broadcastable[0],
                         False, False]
@@ -555,13 +648,15 @@ class CorrMM(BaseCorrMM):
        d_bottom = CorrMM_gradInputs(self.border_mode,
                                     self.subsample,
                                     self.filter_dilation,
-                                     self.num_groups)(weights, top,
+                                     self.num_groups,
-                                                      bottom.shape[-2:])
+                                     self.unshared)(weights, top,
+                                                    bottom.shape[-2:])
        d_weights = CorrMM_gradWeights(self.border_mode,
                                       self.subsample,
                                       self.filter_dilation,
-                                       self.num_groups)(bottom, top,
+                                       self.num_groups,
-                                                        weights.shape[-2:])
+                                       self.unshared)(bottom, top,
+                                                      weights.shape[-2:])
        return d_bottom, d_weights
@@ -595,8 +690,12 @@ class CorrMM_gradWeights(BaseCorrMM):
        else:
            height_width = [as_tensor_variable(shape[0]).astype('int64'), as_tensor_variable(shape[1]).astype('int64')]
-        broadcastable = [topgrad.type.broadcastable[1], img.type.broadcastable[1],
+        if self.unshared is True:  # first weight axis is num_filters, i.e. axis 1 of topgrad (not the batch axis)
-                         False, False]
+            broadcastable = [topgrad.type.broadcastable[1], False, False,
+                             img.type.broadcastable[1], False, False]
+        else:
+            broadcastable = [topgrad.type.broadcastable[1], img.type.broadcastable[1],
+                             False, False]
        dtype = img.type.dtype
        return Apply(self, [img, topgrad] + height_width,
                     [TensorType(dtype, broadcastable)()])
@@ -633,7 +732,10 @@ class CorrMM_gradWeights(BaseCorrMM):
            kW = 2 - imshp[1] + (topshp[1] - 1) * dW
        else:
            kW = imshp[1] + 2 * padW - (topshp[1] - 1) * dW
-        return [(nkern, ssize, kH, kW)]
+        if self.unshared is True:
+            return [(nkern, topshp[0], topshp[1], ssize, kH, kW)]
+        else:
+            return [(nkern, ssize, kH, kW)]
    def c_code(self, node, nodename, inp, out_, sub):
        bottom, top = inp[:2]
@@ -649,12 +751,14 @@ class CorrMM_gradWeights(BaseCorrMM):
        d_bottom = CorrMM_gradInputs(self.border_mode,
                                     self.subsample,
                                     self.filter_dilation,
-                                     self.num_groups)(weights, top,
+                                     self.num_groups,
-                                                      bottom.shape[-2:])
+                                     self.unshared)(weights, top,
+                                                    bottom.shape[-2:])
        d_top = CorrMM(self.border_mode,
                       self.subsample,
                       self.filter_dilation,
-                       self.num_groups)(bottom, weights)
+                       self.num_groups,
+                       self.unshared)(bottom, weights)
        d_height_width = ((theano.gradient.DisconnectedType()(),) * 2
                          if len(inp) == 4 else ())
        return (d_bottom, d_top) + d_height_width
@@ -684,8 +788,12 @@ class CorrMM_gradInputs(BaseCorrMM):
        kern = as_tensor_variable(kern)
        topgrad = as_tensor_variable(topgrad)
        kern, topgrad = self.as_common_dtype(kern, topgrad)
-        if kern.type.ndim != 4:
+        if self.unshared is True:
-            raise TypeError('kern must be 4D tensor')
+            if kern.type.ndim != 6:
+                raise TypeError('kern must be 6D tensor')
+        else:
+            if kern.type.ndim != 4:
+                raise TypeError('kern must be 4D tensor')
        if topgrad.type.ndim != 4:
            raise TypeError('topgrad must be 4D tensor')
        if shape is None:
@@ -700,7 +808,7 @@ class CorrMM_gradInputs(BaseCorrMM):
            broadcastable = [topgrad.type.broadcastable[0], False,
                             False, False]
        else:
-            broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[1],
+            broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[-3],
                             False, False]
        dtype = kern.type.dtype
        return Apply(self, [kern, topgrad] + height_width,
@@ -719,7 +827,7 @@ class CorrMM_gradInputs(BaseCorrMM):
        dH, dW = self.subsample
        kshp = input_shape[0]
        topshp = input_shape[1]
-        ssize, kshp = kshp[1], list(kshp[2:])
+        ssize, kshp = kshp[-3], list(kshp[-2:])
        ssize = ssize * self.num_groups
        bsize, topshp = topshp[0], list(topshp[2:])
        height_width = node.inputs[-2:]
@@ -762,13 +870,15 @@ class CorrMM_gradInputs(BaseCorrMM):
        d_weights = CorrMM_gradWeights(self.border_mode,
                                       self.subsample,
                                       self.filter_dilation,
-                                       self.num_groups)(bottom,
+                                       self.num_groups,
-                                                        top,
+                                       self.unshared)(bottom,
-                                                        weights.shape[-2:])
+                                                      top,
+                                                      weights.shape[-2:])
        d_top = CorrMM(self.border_mode,
                       self.subsample,
                       self.filter_dilation,
-                       self.num_groups)(bottom, weights)
+                       self.num_groups,
+                       self.unshared)(bottom, weights)
        d_height_width = ((theano.gradient.DisconnectedType()(),) *
                          2 if len(inp) == 4 else ())
        return (d_weights, d_top) + d_height_width

--- a/theano/tensor/nnet/opt.py
+++ b/theano/tensor/nnet/opt.py
@@ -82,12 +82,14 @@ def local_abstractconv_gemm(node):
    # need to flip the kernel if necessary
    if node.op.filter_flip:
-        kern = kern[:, :, ::-1, ::-1]
+        flip = (slice(None),) * (kern.ndim - 2) + \
+            (slice(None, None, -1),) * 2
+        kern = kern[flip]
    rval = CorrMM(border_mode=node.op.border_mode,
                  subsample=node.op.subsample,
                  filter_dilation=node.op.filter_dilation,
-                  num_groups=node.op.num_groups)(img, kern)
+                  num_groups=node.op.num_groups,
+                  unshared=node.op.unshared)(img, kern)
    copy_stack_trace(node.outputs[0], rval)
    return [rval]
@@ -134,12 +136,15 @@ def local_abstractconv_gradweight_gemm(node):
    rval = CorrMM_gradWeights(border_mode=node.op.border_mode,
                              subsample=node.op.subsample,
                              filter_dilation=node.op.filter_dilation,
-                              num_groups=node.op.num_groups)(img, topgrad, shape)
+                              num_groups=node.op.num_groups,
+                              unshared=node.op.unshared)(img, topgrad, shape)
    copy_stack_trace(node.outputs[0], rval)
    # need to flip the kernel if necessary
    if node.op.filter_flip:
-        rval = rval[:, :, ::-1, ::-1]
+        flip = (slice(None),) * (rval.ndim - 2) + \
+            (slice(None, None, -1),) * 2
+        rval = rval[flip]
    rval = theano.tensor.patternbroadcast(rval, node.outputs[0].broadcastable)
    copy_stack_trace(node.outputs[0], rval)
@@ -189,12 +194,14 @@ def local_abstractconv_gradinputs_gemm(node):
    # need to flip the kernel if necessary
    if node.op.filter_flip:
-        kern = kern[:, :, ::-1, ::-1]
+        flip = (slice(None),) * (kern.ndim - 2) + \
+            (slice(None, None, -1),) * 2
+        kern = kern[flip]
    rval = CorrMM_gradInputs(border_mode=node.op.border_mode,
                             subsample=node.op.subsample,
                             filter_dilation=node.op.filter_dilation,
-                             num_groups=node.op.num_groups)(kern, topgrad,
+                             num_groups=node.op.num_groups,
-                                                            shape)
+                             unshared=node.op.unshared)(kern, topgrad, shape)
    copy_stack_trace(node.outputs[0], rval)
    return [rval]
@@ -242,7 +249,7 @@ def local_conv2d_cpu(node):
    if not node.op.filter_flip:
        # Not tested yet
        return None
-    if node.op.num_groups > 1:
+    if node.op.num_groups > 1 or node.op.unshared:
        return None
    rval = conv2d(img, kern,
@@ -270,7 +277,7 @@ def local_conv2d_gradweight_cpu(node):
    if not node.op.filter_flip:
        # Not tested yet
        return
-    if node.op.num_groups > 1:
+    if node.op.num_groups > 1 or node.op.unshared:
        return None
    if node.op.border_mode == 'valid' and \
@@ -370,7 +377,7 @@ def local_conv2d_gradinputs_cpu(node):
    if not node.op.filter_flip:
        # Not tested yet
        return None
-    if node.op.num_groups > 1:
+    if node.op.num_groups > 1 or node.op.unshared:
        return None
    # Conv 3d implementation, needed when subsample > 2

--- a/theano/tensor/nnet/tests/test_abstract_conv.py
+++ b/theano/tensor/nnet/tests/test_abstract_conv.py
@@ -1744,3 +1744,146 @@ class Separable_conv(unittest.TestCase):
        fun = theano.function([x_sym, dfilter_sym, pfilter_sym], sep_op, mode='FAST_RUN')
        top = fun(x[:, :, :3, :3, :3], depthwise_filter, pointwise_filter)
        utt.assert_allclose(top, precomp_output)
+class TestUnsharedConv(unittest.TestCase):
+    conv2d = theano.tensor.nnet.abstract_conv.AbstractConv2d
+    conv2d_gradw = theano.tensor.nnet.abstract_conv.AbstractConv2d_gradWeights
+    conv2d_gradi = theano.tensor.nnet.abstract_conv.AbstractConv2d_gradInputs
+    conv2d_op = theano.tensor.nnet.abstract_conv.AbstractConv2d
+    conv2d_gradw_op = theano.tensor.nnet.abstract_conv.AbstractConv2d_gradWeights
+    conv2d_gradi_op = theano.tensor.nnet.abstract_conv.AbstractConv2d_gradInputs
+    def setUp(self):  # the per-case lists below are parallel: the tests zip them together
+        self.mode = theano.compile.mode.Mode(optimizer='None')
+        self.img_shape = [(2, 1, 4, 4), (1, 2, 4, 2), (1, 3, 5, 3), (1, 4, 4, 4)]
+        self.kern_shape = [(2, 2, 2, 1, 3, 3), (2, 4, 2, 2, 4, 2), (3, 2, 1, 1, 3, 3), (4, 3, 3, 2, 4, 2)]
+        self.topgrad_shape = [(2, 2, 2, 2), (1, 2, 4, 2), (1, 3, 2, 1), (1, 4, 3, 3)]
+        self.border_mode = ['valid', 'full', 'valid', 'full']
+        self.subsample = [(1, 1), (2, 2), (2, 1), (3, 2)]
+        self.filter_dilation = (1, 1)
+        self.num_groups = [1, 1, 3, 2]
+        self.verify_flags = np.random.choice([True, False], 4, p=[1.0, 0.0])  # p= must be a keyword; 3rd positional is `replace`
+        self.ref_mode = 'FAST_RUN'
+        if theano.config.cxx == "":
+            raise SkipTest("CorrMM needs cxx")
+    def test_fwd(self):
+        tensor6 = theano.tensor.TensorType(theano.config.floatX, (False,) * 6)
+        img_sym = theano.tensor.tensor4('img')
+        kern_sym = tensor6('kern')
+        ref_kern_sym = theano.tensor.tensor4('ref_kern')
+        for imshp, kshp, mode, sub, groups, verify in zip(self.img_shape, self.kern_shape, self.border_mode,
+                                                          self.subsample, self.num_groups, self.verify_flags):
+            img = np.random.random(imshp).astype(theano.config.floatX)
+            kern = np.random.random(kshp).astype(theano.config.floatX)
+            unshared_conv_op = self.conv2d(border_mode=mode, subsample=sub,
+                                           filter_dilation=self.filter_dilation,
+                                           num_groups=groups, unshared=True)
+            unshared_out_sym = unshared_conv_op(img_sym, kern_sym)
+            unshared_func = theano.function([img_sym, kern_sym], unshared_out_sym, mode=self.mode)
+            assert any([isinstance(node.op, self.conv2d_op)
+                        for node in unshared_func.maker.fgraph.toposort()])
+            unshared_output = unshared_func(img, kern)
+            single_kshp = kshp[:1] + kshp[3:]
+            ref_conv_op = self.conv2d(border_mode=mode, subsample=sub,
+                                      filter_dilation=self.filter_dilation,
+                                      num_groups=groups, unshared=False)
+            ref_out_sym = ref_conv_op(img_sym, ref_kern_sym)
+            ref_func = theano.function([img_sym, ref_kern_sym], ref_out_sym, mode=self.mode)
+            for i in range(0, kshp[1]):
+                for j in range(0, kshp[2]):
+                    single_kern = kern[:, i, j, ...].reshape(single_kshp)
+                    ref_val = ref_func(img, single_kern)
+                    utt.assert_allclose(ref_val[:, :, i, j], unshared_output[:, :, i, j])
+            if verify:
+                utt.verify_grad(unshared_conv_op, [img, kern], mode=self.mode, eps=1)
+    def test_gradweight(self):  # Check the unshared gradWeights op against the shared op, one output position at a time.
+        img_sym = theano.tensor.tensor4('img')
+        top_sym = theano.tensor.tensor4('top')
+        for imshp, kshp, topshp, mode, sub, groups, verify in zip(self.img_shape, self.kern_shape, self.topgrad_shape,  # one test configuration per zipped entry
+                                                                  self.border_mode, self.subsample, self.num_groups,
+                                                                  self.verify_flags):
+            img = np.random.random(imshp).astype(theano.config.floatX)
+            top = np.random.random(topshp).astype(theano.config.floatX)
+            unshared_conv_op = self.conv2d_gradw(border_mode=mode, subsample=sub,  # op under test (unshared=True)
+                                                 filter_dilation=self.filter_dilation,
+                                                 num_groups=groups, unshared=True)
+            unshared_out_sym = unshared_conv_op(img_sym, top_sym, tensor.as_tensor_variable(kshp[-2:]))  # third argument: last two entries of kshp (presumably filter rows/cols)
+            unshared_func = theano.function([img_sym, top_sym], unshared_out_sym, mode=self.mode)
+            assert any([isinstance(node.op, self.conv2d_gradw_op)  # the compiled graph must contain the expected op class
+                        for node in unshared_func.maker.fgraph.toposort()])
+            unshared_output = unshared_func(img, top)
+            single_kshp = kshp[:1] + kshp[3:]  # kshp without its entries 1 and 2: the kernel shape handed to the shared op
+            ref_conv_op = self.conv2d_gradw(border_mode=mode, subsample=sub,  # reference op (unshared=False), same settings otherwise
+                                            filter_dilation=self.filter_dilation,
+                                            num_groups=groups, unshared=False)
+            ref_out_sym = ref_conv_op(img_sym, top_sym, tensor.as_tensor_variable(single_kshp[-2:]))
+            ref_func = theano.function([img_sym, top_sym], ref_out_sym, mode=self.mode)
+            for i in range(0, topshp[2]):  # visit every (i, j) position along axes 2 and 3 of top
+                for j in range(0, topshp[3]):
+                    top_single = np.zeros_like(top)
+                    top_single[:, :, i, j] = top[:, :, i, j]  # keep only position (i, j) of top; everything else stays zero
+                    ref_output = ref_func(img, top_single)
+                    utt.assert_allclose(unshared_output[:, i, j, ...], ref_output)  # the (i, j) slice of the unshared result must match the shared result
+            def conv_gradweight(inputs_val, output_val):  # two-argument wrapper that fixes the shape argument for verify_grad
+                return unshared_conv_op(inputs_val, output_val, tensor.as_tensor_variable(kshp[-2:]))
+            if verify:  # gradient verification runs only for configurations whose flag is set
+                utt.verify_grad(conv_gradweight, [img, top], mode=self.mode, eps=1)
+    def test_gradinput(self):  # Check the unshared gradInputs op against the shared op, using one kernel tiled over every position.
+        tensor6 = theano.tensor.TensorType(theano.config.floatX, (False,) * 6)  # tensor type built from six False flags (presumably a non-broadcastable 6-d tensor)
+        kern_sym = tensor6('kern')
+        top_sym = theano.tensor.tensor4('top')
+        ref_kern_sym = theano.tensor.tensor4('ref_kern')
+        for imshp, kshp, topshp, mode, sub, groups, verify in zip(self.img_shape, self.kern_shape, self.topgrad_shape,  # one test configuration per zipped entry
+                                                                  self.border_mode, self.subsample, self.num_groups,
+                                                                  self.verify_flags):
+            single_kshp = kshp[:1] + kshp[3:]  # kshp without its entries 1 and 2: the kernel shape handed to the shared op
+            single_kern = np.random.random(single_kshp).astype(theano.config.floatX)
+            kern = single_kern.reshape((kshp[:1] + (1, 1) + kshp[3:]))  # insert two singleton axes after axis 0
+            kern = np.tile(kern, (1, kshp[1], kshp[2], 1, 1, 1))  # repeat the same kernel kshp[1] x kshp[2] times along axes 1 and 2
+            top = np.random.random(topshp).astype(theano.config.floatX)
+            unshared_conv_op = self.conv2d_gradi(border_mode=mode, subsample=sub,  # op under test (unshared=True)
+                                                 filter_dilation=self.filter_dilation,
+                                                 num_groups=groups, unshared=True)
+            unshared_out_sym = unshared_conv_op(kern_sym, top_sym, tensor.as_tensor_variable(imshp[-2:]))  # third argument: last two entries of imshp (presumably image rows/cols)
+            unshared_func = theano.function([kern_sym, top_sym], unshared_out_sym, mode=self.mode)
+            assert any([isinstance(node.op, self.conv2d_gradi_op)  # the compiled graph must contain the expected op class
+                        for node in unshared_func.maker.fgraph.toposort()])
+            unshared_output = unshared_func(kern, top)
+            ref_conv_op = self.conv2d_gradi(border_mode=mode, subsample=sub,  # reference op (unshared=False), same settings otherwise
+                                            filter_dilation=self.filter_dilation,
+                                            num_groups=groups, unshared=False)
+            ref_out_sym = ref_conv_op(ref_kern_sym, top_sym, tensor.as_tensor_variable(imshp[-2:]))
+            ref_func = theano.function([ref_kern_sym, top_sym], ref_out_sym, mode=self.mode)
+            ref_output = ref_func(single_kern, top)
+            utt.assert_allclose(ref_output, unshared_output)  # with identical kernels at every position, both ops must agree
+            def conv_gradinputs(filters_val, output_val):  # two-argument wrapper that fixes the shape argument for verify_grad
+                return unshared_conv_op(filters_val, output_val, tensor.as_tensor_variable(imshp[-2:]))
+            if verify:  # gradient verification runs only for configurations whose flag is set
+                utt.verify_grad(conv_gradinputs, [kern, top], mode=self.mode, eps=1)
--- a/theano/tensor/nnet/tests/test_corr.py
+++ b/theano/tensor/nnet/tests/test_corr.py
@@ -10,7 +10,7 @@ import theano
 import theano.tensor as T
 from theano.tests import unittest_tools as utt
 from theano.tensor.nnet import corr, conv
-from theano.tensor.nnet.tests.test_abstract_conv import Grouped_conv_noOptim
+from theano.tensor.nnet.tests.test_abstract_conv import Grouped_conv_noOptim, TestUnsharedConv
 class TestCorr2D(utt.InferShapeTester):
@@ -452,6 +452,21 @@ class TestGroupCorr2d(Grouped_conv_noOptim):
        utt.assert_allclose(gconv_output, conv_output)
+class TestUnsharedCorr2D(TestUnsharedConv):  # Binds the inherited TestUnsharedConv tests to the corr.CorrMM family of ops.
+    if theano.config.mode == "FAST_COMPILE":
+        mode = theano.compile.get_mode("FAST_RUN").excluding('gpuarray')  # under FAST_COMPILE, use FAST_RUN with 'gpuarray' excluded instead
+    else:
+        mode = None  # otherwise no explicit mode is supplied
+    conv2d = corr.CorrMM  # constructors used to build the forward / gradWeights / gradInputs ops
+    conv2d_gradw = corr.CorrMM_gradWeights
+    conv2d_gradi = corr.CorrMM_gradInputs
+    conv2d_op = corr.CorrMM  # op classes for isinstance checks on compiled graphs (NOTE(review): presumably read by the base-class tests; confirm)
+    conv2d_gradw_op = corr.CorrMM_gradWeights
+    conv2d_gradi_op = corr.CorrMM_gradInputs
+    flip_filter = True  # NOTE(review): not read by any test method visible in this chunk; confirm its use in the base class
+    is_dnn = False  # NOTE(review): not read by any test method visible in this chunk; confirm its use in the base class
 if __name__ == '__main__':
    t = TestCorr2D('setUp')