提交 9592125c authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #6286 from vikramnitin9/grouped_unshared

Implement Unshared Convolution
......@@ -458,13 +458,15 @@ class BaseGpuCorrMM(CGpuKernelBase):
num_groups :
Divides the image, kernel and output tensors into num_groups
separate groups. Each which carry out convolutions separately (default : 1).
unshared
Perform unshared correlation (default: False)
"""
check_broadcast = False
__props__ = ('border_mode', 'subsample', 'filter_dilation', 'num_groups')
__props__ = ('border_mode', 'subsample', 'filter_dilation', 'num_groups', 'unshared')
_f16_ok = True
def __init__(self, border_mode="valid", subsample=(1, 1),
filter_dilation=(1, 1), num_groups=1):
filter_dilation=(1, 1), num_groups=1, unshared=False):
if isinstance(border_mode, integer_types):
border_mode = (border_mode, border_mode)
if isinstance(border_mode, tuple):
......@@ -487,6 +489,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
raise ValueError("Number of groups should be greater than 0")
self.num_groups = num_groups
CGpuKernelBase.__init__(self, ['c_code/corr_gemm.c'])
self.unshared = unshared
@property
def pad(self):
......@@ -495,12 +498,13 @@ class BaseGpuCorrMM(CGpuKernelBase):
return (0, 0)
def __str__(self):
return '%s{%s, %s, %s, %s}' % (
return '%s{%s, %s, %s, %s, %s}' % (
self.__class__.__name__,
self.border_mode,
str(self.subsample),
str(self.filter_dilation),
str(self.num_groups))
str(self.num_groups),
str(self.unshared))
def __setstate__(self, d):
self.__dict__.update(d)
......@@ -533,7 +537,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
def c_code_cache_version(self):
# Raise this whenever modifying the C code (including the file).
return (10,)
return (11,)
def c_code_helper(self, bottom, weights, top, direction, sub, height=None, width=None):
"""
......@@ -581,6 +585,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
dH, dW = self.subsample
dilH, dilW = self.filter_dilation
numgroups = self.num_groups
unshared = int(self.unshared)
if self.border_mode == "half":
padH = padW = -1
elif self.border_mode == "full":
......@@ -633,19 +638,24 @@ class BaseGpuCorrMM(CGpuKernelBase):
int padH = %(padH)s;
int padW = %(padW)s;
int numgroups = %(numgroups)s;
int unshared = %(unshared)s;
PyGpuArrayObject * bottom = %(bottom)s;
PyGpuArrayObject * weights = %(weights)s;
PyGpuArrayObject * top = %(top)s;
PyGpuArrayObject * out2 = NULL;
int wdim, odim;
wdim = unshared ? 6 : 4;
odim = 4; //Can be set to 6 later for unshared backprop wrt weights
// Obtain or infer kernel width and height
// (we need to know it early to be able to handle auto-padding)
size_t kH, kW, dil_kH, dil_kW;
if (direction != 1) {
// weight is an input variable, we can just read its shape
kH = PyGpuArray_DIMS(weights)[2];
kW = PyGpuArray_DIMS(weights)[3];
kH = PyGpuArray_DIMS(weights)[wdim-2];
kW = PyGpuArray_DIMS(weights)[wdim-1];
}
else {
if (%(height)s != -1) {
......@@ -699,8 +709,10 @@ class BaseGpuCorrMM(CGpuKernelBase):
// Infer output shape and type
// The inferred shape can be negative.
long long out_dim[4];
size_t out_dim_size[4];
long long out_dim[6];
size_t out_dim_size[6];
out_dim[4] = out_dim[5] = 0; //Only used for unshared backprop wrt weights
out_dim_size[4] = out_dim_size[5] = 0; //Same
int out_typecode;
PyGpuContextObject *out_context;
switch(direction) {
......@@ -709,71 +721,131 @@ class BaseGpuCorrMM(CGpuKernelBase):
// height and width: top = (bottom + 2*pad - ((weight-1)*dil + 1)) / sample + 1
out_dim[0] = PyGpuArray_DIMS(bottom)[0];
out_dim[1] = PyGpuArray_DIMS(weights)[0];
out_dim[2] = (PyGpuArray_DIMS(bottom)[2] + 2*padH - ((PyGpuArray_DIMS(weights)[2]-1)*dilH + 1)) / dH + 1;
out_dim[3] = (PyGpuArray_DIMS(bottom)[3] + 2*padW - ((PyGpuArray_DIMS(weights)[3]-1)*dilW + 1)) / dW + 1;
out_dim[2] = (PyGpuArray_DIMS(bottom)[2] + 2*padH - ((PyGpuArray_DIMS(weights)[wdim-2]-1)*dilH + 1)) / dH + 1;
out_dim[3] = (PyGpuArray_DIMS(bottom)[3] + 2*padW - ((PyGpuArray_DIMS(weights)[wdim-1]-1)*dilW + 1)) / dW + 1;
out_typecode = bottom->ga.typecode;
out_context = bottom->context;
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
{
PyErr_Format(PyExc_ValueError,
"GpuCorrMM: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1],
PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3],
PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1],
PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3],
out_dim[0], out_dim[1], out_dim[2], out_dim[3]);
%(fail)s
if (unshared) {
PyErr_Format(PyExc_ValueError,
"GpuCorrMM: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1],
PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3],
PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1],
PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3],
PyGpuArray_DIMS(weights)[4], PyGpuArray_DIMS(weights)[5],
out_dim[0], out_dim[1], out_dim[2], out_dim[3]);
%(fail)s
}
else {
PyErr_Format(PyExc_ValueError,
"GpuCorrMM: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1],
PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3],
PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1],
PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3],
out_dim[0], out_dim[1], out_dim[2], out_dim[3]);
%(fail)s
}
}
break;
case 1: // backprop wrt. weights
// output is weights: (num_filters, num_channels, height, width)
// output is weights: (num_filters, num_channels, height, width) or
// (num_filters, top_height, top_width, num_channels, height, width) -> for unshared
// height and width: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1
out_dim[0] = PyGpuArray_DIMS(top)[1];
out_dim[1] = PyGpuArray_DIMS(bottom)[1] / numgroups;
out_dim[2] = kH; // already inferred further above
out_dim[3] = kW; // how convenient
if (unshared){
odim = 6;
out_dim[1] = PyGpuArray_DIMS(top)[2];
out_dim[2] = PyGpuArray_DIMS(top)[3];
}
out_dim[wdim-3] = PyGpuArray_DIMS(bottom)[1] / numgroups;
out_dim[wdim-2] = kH; // already inferred further above
out_dim[wdim-1] = kW; // how convenient
out_typecode = top->ga.typecode;
out_context = top->context;
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
{
PyErr_Format(PyExc_ValueError,
"GpuCorrMM backprop wrt. weights: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1],
PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3],
out_dim[0], out_dim[1], out_dim[2], out_dim[3],
PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3]);
%(fail)s
if (unshared) {
if (out_dim[0] < 0 || out_dim[1] <= 0 || out_dim[2] <= 0 || out_dim[3] < 0
|| out_dim[4] <= 0 || out_dim[5] <= 0){
PyErr_Format(PyExc_ValueError,
"GpuCorrMM backprop wrt. weights: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1],
PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3],
out_dim[0], out_dim[1], out_dim[2], out_dim[3],
out_dim[4], out_dim[5],
PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3]);
%(fail)s
}
}
else {
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
{
PyErr_Format(PyExc_ValueError,
"GpuCorrMM backprop wrt. weights: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
PyGpuArray_DIMS(bottom)[0], PyGpuArray_DIMS(bottom)[1],
PyGpuArray_DIMS(bottom)[2], PyGpuArray_DIMS(bottom)[3],
out_dim[0], out_dim[1], out_dim[2], out_dim[3],
PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3]);
%(fail)s
}
}
break;
case 2: // backprop wrt. inputs
// output is bottom: (batchsize, num_channels, height, width)
// height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
out_dim[0] = PyGpuArray_DIMS(top)[0];
out_dim[1] = PyGpuArray_DIMS(weights)[1] * numgroups;
out_dim[2] = (%(height)s != -1) ? %(height)s : (PyGpuArray_DIMS(top)[2] - 1) * dH + (PyGpuArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH;
out_dim[3] = (%(width)s != -1) ? %(width)s : (PyGpuArray_DIMS(top)[3] - 1) * dW + (PyGpuArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW;
out_dim[1] = PyGpuArray_DIMS(weights)[wdim-3] * numgroups;
out_dim[2] = (%(height)s != -1) ? %(height)s : (PyGpuArray_DIMS(top)[2] - 1) * dH + (PyGpuArray_DIMS(weights)[wdim-2]-1)*dilH + 1 - 2*padH;
out_dim[3] = (%(width)s != -1) ? %(width)s : (PyGpuArray_DIMS(top)[3] - 1) * dW + (PyGpuArray_DIMS(weights)[wdim-1]-1)*dilW + 1 - 2*padW;
out_typecode = top->ga.typecode;
out_context = top->context;
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
{
PyErr_Format(PyExc_ValueError,
"GpuCorrMM backprop wrt. inputs: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weight shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
out_dim[0], out_dim[1], out_dim[2], out_dim[3],
PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1],
PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3],
PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3]);
%(fail)s
if (unshared) {
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
{
PyErr_Format(PyExc_ValueError,
"GpuCorrMM backprop wrt. inputs: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weight shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
out_dim[0], out_dim[1], out_dim[2], out_dim[3],
PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1],
PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3],
PyGpuArray_DIMS(weights)[4], PyGpuArray_DIMS(weights)[5],
PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3]);
%(fail)s
}
}
else {
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
{
PyErr_Format(PyExc_ValueError,
"GpuCorrMM backprop wrt. inputs: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weight shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
out_dim[0], out_dim[1], out_dim[2], out_dim[3],
PyGpuArray_DIMS(weights)[0], PyGpuArray_DIMS(weights)[1],
PyGpuArray_DIMS(weights)[2], PyGpuArray_DIMS(weights)[3],
PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3]);
%(fail)s
}
}
break;
default:
......@@ -786,12 +858,24 @@ class BaseGpuCorrMM(CGpuKernelBase):
out_dim_size[2] = (size_t)out_dim[2];
out_dim_size[3] = (size_t)out_dim[3];
if (odim == 6) {
out_dim_size[4] = (size_t)out_dim[4];
out_dim_size[5] = (size_t)out_dim[5];
}
// Prepare output array
if (theano_prep_output(&%(out)s, 4, out_dim_size, out_typecode, GA_C_ORDER, out_context) != 0)
if (theano_prep_output(&%(out)s, odim, out_dim_size, out_typecode, GA_C_ORDER, out_context) != 0)
{
PyErr_Format(PyExc_RuntimeError,
"BaseGpuCorrMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld",
out_dim[0], out_dim[1], out_dim[2], out_dim[3]);
if (odim == 4) {
PyErr_Format(PyExc_RuntimeError,
"BaseGpuCorrMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld",
out_dim[0], out_dim[1], out_dim[2], out_dim[3]);
}
if (odim == 6) {
PyErr_Format(PyExc_RuntimeError,
"BaseGpuCorrMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld %%lld %%lld",
out_dim[0], out_dim[1], out_dim[2], out_dim[3], out_dim[4], out_dim[5]);
}
%(fail)s
}
if (!GpuArray_IS_C_CONTIGUOUS(&%(out)s->ga)) {
......@@ -800,7 +884,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
}
// Call GPU code
out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dilH, dilW, padH, padW, numgroups);
out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dilH, dilW, padH, padW, numgroups, unshared);
if (out2==NULL){
%(fail)s
}
......@@ -840,6 +924,8 @@ class GpuCorrMM(BaseGpuCorrMM):
divided into.
should be an int
set to 1 to disable grouped convolution
unshared
Perform unshared correlation (default: False)
Notes
-----
......@@ -859,9 +945,9 @@ class GpuCorrMM(BaseGpuCorrMM):
"""
def __init__(self, border_mode="valid",
subsample=(1, 1),
filter_dilation=(1, 1), num_groups=1):
filter_dilation=(1, 1), num_groups=1, unshared=False):
super(GpuCorrMM, self).__init__(border_mode, subsample,
filter_dilation, num_groups)
filter_dilation, num_groups, unshared)
def make_node(self, img, kern):
ctx_name = infer_context_name(img, kern)
......@@ -869,8 +955,12 @@ class GpuCorrMM(BaseGpuCorrMM):
kern = as_gpuarray_variable(kern, ctx_name)
if img.type.ndim != 4:
raise TypeError('img must be 4D tensor')
if kern.type.ndim != 4:
raise TypeError('kern must be 4D tensor')
if self.unshared:
if kern.type.ndim != 6:
raise TypeError('kern must be 6D tensor')
else:
if kern.type.ndim != 4:
raise TypeError('kern must be 4D tensor')
broadcastable = [img.type.broadcastable[0], kern.type.broadcastable[0],
False, False]
......@@ -891,12 +981,14 @@ class GpuCorrMM(BaseGpuCorrMM):
d_bottom = GpuCorrMM_gradInputs(self.border_mode,
self.subsample,
self.filter_dilation,
self.num_groups)(
self.num_groups,
self.unshared)(
weights, top, bottom.shape[-2:])
d_weights = GpuCorrMM_gradWeights(self.border_mode,
self.subsample,
self.filter_dilation,
self.num_groups)(
self.num_groups,
self.unshared)(
bottom, top, weights.shape[-2:])
return d_bottom, d_weights
......@@ -915,10 +1007,12 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM):
def __init__(self, border_mode="valid",
subsample=(1, 1),
filter_dilation=(1, 1),
num_groups=1):
num_groups=1,
unshared=False):
super(GpuCorrMM_gradWeights, self).__init__(border_mode,
subsample,
filter_dilation, num_groups)
filter_dilation, num_groups,
unshared)
def make_node(self, img, topgrad, shape=None):
ctx_name = infer_context_name(img, topgrad)
......@@ -938,8 +1032,12 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM):
assert shape[0].ndim == 0
assert shape[1].ndim == 0
broadcastable = [topgrad.type.broadcastable[1], img.type.broadcastable[1],
False, False]
if self.unshared:
broadcastable = [topgrad.type.broadcastable[0], False, False,
img.type.broadcastable[1], False, False]
else:
broadcastable = [topgrad.type.broadcastable[1], img.type.broadcastable[1],
False, False]
return Apply(self, [img, topgrad] + height_width, [GpuArrayType(dtype=img.dtype,
context_name=ctx_name,
broadcastable=broadcastable)()])
......@@ -958,11 +1056,12 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM):
d_bottom = GpuCorrMM_gradInputs(self.border_mode,
self.subsample,
self.filter_dilation,
self.num_groups)(weights,
top,
bottom.shape[-2:])
self.num_groups,
self.unshared)(weights,
top,
bottom.shape[-2:])
d_top = GpuCorrMM(
self.border_mode, self.subsample, self.filter_dilation, self.num_groups)(bottom, weights)
self.border_mode, self.subsample, self.filter_dilation, self.num_groups, self.unshared)(bottom, weights)
d_height_width = (
theano.gradient.DisconnectedType()(),
) * 2 if len(inp) == 4 else ()
......@@ -989,16 +1088,22 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
def __init__(self, border_mode="valid",
subsample=(1, 1),
filter_dilation=(1, 1),
num_groups=1):
num_groups=1,
unshared=False):
super(GpuCorrMM_gradInputs, self).__init__(border_mode, subsample,
filter_dilation, num_groups)
filter_dilation, num_groups,
unshared)
def make_node(self, kern, topgrad, shape=None):
ctx_name = infer_context_name(kern, topgrad)
kern = as_gpuarray_variable(kern, ctx_name)
topgrad = as_gpuarray_variable(topgrad, ctx_name)
if kern.type.ndim != 4:
raise TypeError('kern must be 4D tensor')
if self.unshared:
if kern.type.ndim != 6:
raise TypeError('kern must be 6D tensor')
else:
if kern.type.ndim != 4:
raise TypeError('kern must be 4D tensor')
if topgrad.type.ndim != 4:
raise TypeError('topgrad must be 4D tensor')
if shape is None:
......@@ -1014,7 +1119,7 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
broadcastable = [topgrad.type.broadcastable[0], False,
False, False]
else:
broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[1],
broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[-3],
False, False]
return Apply(self, [kern, topgrad] + height_width, [GpuArrayType(dtype=topgrad.dtype,
context_name=ctx_name,
......@@ -1034,13 +1139,15 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
d_weights = GpuCorrMM_gradWeights(self.border_mode,
self.subsample,
self.filter_dilation,
self.num_groups)(bottom,
top,
weights.shape[-2:])
self.num_groups,
self.unshared)(bottom,
top,
weights.shape[-2:])
d_top = GpuCorrMM(self.border_mode,
self.subsample,
self.filter_dilation,
self.num_groups)(bottom, weights)
self.num_groups,
self.unshared)(bottom, weights)
d_height_width = (
theano.gradient.DisconnectedType()(),
) * 2 if len(inp) == 4 else ()
......@@ -1682,7 +1789,7 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
broadcastable = [topgrad.type.broadcastable[0], False,
False, False, False]
else:
broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[1],
broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[-4],
False, False, False]
return Apply(self, [kern, topgrad] + height_width_depth,
[GpuArrayType(dtype=topgrad.dtype,
......
......@@ -349,7 +349,8 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
const size_t dilW = 1,
const size_t padH = 0,
const size_t padW = 0,
const size_t numgroups = 1)
const size_t numgroups = 1,
const size_t unshared = 0)
{
if (PyGpuArray_NDIM(bottom) != 4)
{
......@@ -368,21 +369,35 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
return NULL;
}
if (PyGpuArray_NDIM(weight) != 4)
if (PyGpuArray_NDIM(weight) != (unshared ? 6 : 4))
{
PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires weight of 4D");
PyErr_Format(PyExc_ValueError, "GpuCorrMM requires weight of %dD", unshared ? 6 : 4);
return NULL;
}
if (!GpuArray_IS_C_CONTIGUOUS(&weight->ga))
{
PyErr_Format(PyExc_ValueError,
"GpuCorrMM requires weight to be C-contiguous, "
"but strides are: %ld %ld %ld %ld\n",
PyGpuArray_STRIDES(weight)[0],
PyGpuArray_STRIDES(weight)[1],
PyGpuArray_STRIDES(weight)[2],
PyGpuArray_STRIDES(weight)[3]);
return NULL;
if (unshared) {
PyErr_Format(PyExc_ValueError,
"GpuCorrMM requires weight to be C-contiguous, "
"but strides are: %ld %ld %ld %ld %ld %ld\n",
PyGpuArray_STRIDES(weight)[0],
PyGpuArray_STRIDES(weight)[1],
PyGpuArray_STRIDES(weight)[2],
PyGpuArray_STRIDES(weight)[3],
PyGpuArray_STRIDES(weight)[4],
PyGpuArray_STRIDES(weight)[5]);
return NULL;
}
else {
PyErr_Format(PyExc_ValueError,
"GpuCorrMM requires weight to be C-contiguous, "
"but strides are: %ld %ld %ld %ld\n",
PyGpuArray_STRIDES(weight)[0],
PyGpuArray_STRIDES(weight)[1],
PyGpuArray_STRIDES(weight)[2],
PyGpuArray_STRIDES(weight)[3]);
return NULL;
}
}
if (PyGpuArray_NDIM(top) != 4)
......@@ -409,10 +424,12 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
const size_t bottomHeight = PyGpuArray_DIMS(bottom)[2];
const size_t bottomWidth = PyGpuArray_DIMS(bottom)[3];
// weights: (nFilters, nChannels, rows, columns)
// or (nFilters, out_rows, out_columns, nChannels, rows, columns) -> for unshared
const size_t nFilters = PyGpuArray_DIMS(weight)[0];
const size_t kH = PyGpuArray_DIMS(weight)[2];
const size_t kW = PyGpuArray_DIMS(weight)[3];
if (nChannels != (PyGpuArray_DIMS(weight)[1] * numgroups)) {
const size_t kH = PyGpuArray_DIMS(weight)[unshared ? 4 : 2];
const size_t kW = PyGpuArray_DIMS(weight)[unshared ? 5 : 3];
if (nChannels != PyGpuArray_DIMS(weight)[unshared ? 3 : 1] * numgroups) {
PyErr_SetString(PyExc_ValueError,
"GpuCorrMM images and kernel must have the same stack size\n");
return NULL;
......@@ -435,21 +452,56 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
const size_t topHeight = _CONV_FLOORDIV_X(topHeightNoDH, dH) + 1;
const size_t topWidth = _CONV_FLOORDIV_X(topWidthNoDW, dW) + 1;
#undef _CONV_FLOORDIV
if (batchSize != PyGpuArray_DIMS(top)[0] ||
nFilters != PyGpuArray_DIMS(top)[1] ||
topHeight != PyGpuArray_DIMS(top)[2] ||
topWidth != PyGpuArray_DIMS(top)[3]) {
PyErr_Format(PyExc_ValueError,
"GpuCorrMM shape inconsistency:\n"
" bottom shape: %ld %ld %ld %ld\n"
" weight shape: %ld %ld %ld %ld\n"
" top shape: %ld %ld %ld %ld (expected %ld %ld %ld %ld)\n",
batchSize, nChannels, bottomHeight, bottomWidth,
nFilters, nChannels / numgroups, kH, kW,
PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3],
batchSize, nFilters, topHeight, topWidth);
return NULL;
if (unshared) {
if (topHeight != PyGpuArray_DIMS(weight)[1] ||
topWidth != PyGpuArray_DIMS(weight)[2]) {
PyErr_Format(PyExc_ValueError,
"GpuCorrMM regions in kernel must match output regions:\n"
" bottom shape: %ld %ld %ld %ld\n"
" weight shape: %ld %ld %ld %ld %ld %ld"
" (expected %ld %ld %ld %ld %ld %ld)\n"
" top shape(calculated): %ld %ld %ld %ld\n",
batchSize, nChannels, bottomHeight, bottomWidth,
nFilters, PyGpuArray_DIMS(weight)[1],
PyGpuArray_DIMS(weight)[2], nChannels / numgroups, kH, kW,
nFilters, topHeight, topWidth, nChannels / numgroups, kH, kW,
batchSize, nFilters, topHeight, topWidth);
return NULL;
}
if (batchSize != PyGpuArray_DIMS(top)[0] ||
nFilters != PyGpuArray_DIMS(top)[1] ||
topHeight != PyGpuArray_DIMS(top)[2] ||
topWidth != PyGpuArray_DIMS(top)[3]) {
PyErr_Format(PyExc_ValueError,
"GpuCorrMM shape inconsistency:\n"
" bottom shape: %ld %ld %ld %ld\n"
" weight shape: %ld %ld %ld %ld %ld %ld\n"
" top shape: %ld %ld %ld %ld (expected %ld %ld %ld %ld)\n",
batchSize, nChannels, bottomHeight, bottomWidth,
nFilters, topHeight, topWidth, nChannels / numgroups, kH, kW,
PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3],
batchSize, nFilters, topHeight, topWidth);
return NULL;
}
}
else{
if (batchSize != PyGpuArray_DIMS(top)[0] ||
nFilters != PyGpuArray_DIMS(top)[1] ||
topHeight != PyGpuArray_DIMS(top)[2] ||
topWidth != PyGpuArray_DIMS(top)[3]) {
PyErr_Format(PyExc_ValueError,
"GpuCorrMM shape inconsistency:\n"
" bottom shape: %ld %ld %ld %ld\n"
" weight shape: %ld %ld %ld %ld\n"
" top shape: %ld %ld %ld %ld (expected %ld %ld %ld %ld)\n",
batchSize, nChannels, bottomHeight, bottomWidth,
nFilters, nChannels / numgroups, kH, kW,
PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3],
batchSize, nFilters, topHeight, topWidth);
return NULL;
}
}
int err = gpublas_setup(bottom->context->ctx);
......@@ -512,19 +564,37 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
return NULL;
}
// Second, gemm
for (size_t g = 0; g < numgroups; g++){
err = rgemm(cb_fortran, cb_no_trans, cb_no_trans,
N_, M_, K_, 1,
&col->ga, g * group_col_stride, N_,
&weight->ga, g * group_weight_stride, K_,
0,
&top->ga, n * batch_top_stride + g * group_top_stride, N_);
if (unshared) {
for (size_t g = 0; g < numgroups; ++g) {
for (size_t reg = 0; reg < N_; ++reg){
err = rgemm(cb_fortran, cb_no_trans, cb_no_trans,
1, M_, K_, 1,
&col->ga, g * group_col_stride + reg, N_,
&weight->ga, g * group_weight_stride + reg * K_, K_ * N_,
0,
&top->ga, n * batch_top_stride + g * group_top_stride + reg, N_);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "GpuCorrMM forward encountered an error running gemm: %d", err);
Py_DECREF(col);
return NULL;
}
}
}
}
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuCorrMM forward encountered an error running gemm: %d", err);
Py_DECREF(col);
return NULL;
else {
for (size_t g = 0; g < numgroups; ++g){
err = rgemm(cb_fortran, cb_no_trans, cb_no_trans,
N_, M_, K_, 1,
&col->ga, g * group_col_stride, N_,
&weight->ga, g * group_weight_stride, K_,
0,
&top->ga, n * batch_top_stride + g * group_top_stride, N_);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "GpuCorrMM forward encountered an error running gemm: %d", err);
Py_DECREF(col);
return NULL;
}
}
}
}
}
......@@ -557,19 +627,37 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
// Note that we accumulate into weight. We do so by setting beta = 0
// for the first iteration and beta = 1 for subsequent ones. (This
// is faster than setting weight to all zeros before the loop.)
for(size_t g = 0; g < numgroups; g++){
err = rgemm(cb_fortran, cb_trans, cb_no_trans,
K_, M_, N_, 1,
&col->ga, g * group_col_stride, N_,
&top->ga, n * batch_top_stride + g * group_top_stride, N_,
(n == 0) ? 0 : 1,
&weight->ga, g * group_weight_stride, K_);
if (unshared) {
for (size_t g = 0; g < numgroups; ++g) {
for (size_t reg = 0; reg < N_; ++reg){
err = rgemm(cb_fortran, cb_trans, cb_no_trans,
K_, M_, 1, 1,
&col->ga, g * group_col_stride + reg, N_,
&top->ga, n * batch_top_stride + g * group_top_stride + reg, N_,
(n == 0) ? 0 : 1,
&weight->ga, g * group_weight_stride + reg * K_, K_ * N_);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "GpuCorrMM grad weights encountered an error running gemm: %d", err);
Py_DECREF(col);
return NULL;
}
}
}
}
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuCorrMM grad weights encountered an error running gemm: %d", err);
Py_DECREF(col);
return NULL;
else{
for(size_t g = 0; g < numgroups; g++){
err = rgemm(cb_fortran, cb_trans, cb_no_trans,
K_, M_, N_, 1,
&col->ga, g * group_col_stride, N_,
&top->ga, n * batch_top_stride + g * group_top_stride, N_,
(n == 0) ? 0 : 1,
&weight->ga, g * group_weight_stride, K_);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "GpuCorrMM grad weights encountered an error running gemm: %d", err);
Py_DECREF(col);
return NULL;
}
}
}
}
}
......@@ -590,19 +678,37 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
// Iterate over batch
for (size_t n = 0; n < batchSize; n++) {
// gemm into columns
for(size_t g = 0; g < numgroups; g++){
err = rgemm(cb_fortran, cb_no_trans, cb_trans,
N_, K_, M_, 1,
&top->ga, n * batch_top_stride + g * group_top_stride, N_,
&weight->ga, g * group_weight_stride, K_,
0,
&col->ga, g * group_col_stride, N_);
if (unshared) {
for (size_t g = 0; g < numgroups; ++g){
for (size_t reg = 0; reg < N_; ++reg) {
err = rgemm(cb_fortran, cb_no_trans, cb_trans,
1, K_, M_, 1,
&top->ga, n * batch_top_stride + g * group_top_stride + reg, N_,
&weight->ga, g * group_weight_stride + reg * K_, K_ * N_,
0,
&col->ga, g * group_col_stride + reg, N_);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "GpuCorrMM grad inputs encountered an error running gemm: %d", err);
Py_DECREF(col);
return NULL;
}
}
}
}
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"GpuCorrMM grad inputs encountered an error running gemm: %d", err);
Py_DECREF(col);
return NULL;
else {
for (size_t g = 0; g < numgroups; ++g){
err = rgemm(cb_fortran, cb_no_trans, cb_trans,
N_, K_, M_, 1,
&top->ga, n * batch_top_stride + g * group_top_stride, N_,
&weight->ga, g * group_weight_stride, K_,
0,
&col->ga, g * group_col_stride, N_);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "GpuCorrMM grad inputs encountered an error running gemm: %d", err);
Py_DECREF(col);
return NULL;
}
}
}
// col2im back to the data
err = col2im(&col->ga, nChannels, bottomHeight, bottomWidth,
......
......@@ -3035,6 +3035,9 @@ def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
if version(raises=False) < 6000 and op.filter_dilation != (1, 1):
return None
if op.unshared:
return None
inp1 = inputs[0]
inp2 = inputs[1]
......@@ -3129,6 +3132,8 @@ def local_abstractconv_cudnn(node):
ctx = infer_context_name(*node.inputs)
if not isinstance(node.inputs[0].type, GpuArrayType):
return
if node.op.unshared:
return None
if isinstance(node.op, AbstractConv2d):
return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
elif isinstance(node.op, AbstractConv3d):
......@@ -3143,6 +3148,8 @@ def local_abstractconv_cudnn_alt(node):
if version(raises=False) < 6000 and node.op.filter_dilation != (1, 1):
return None
if node.op.unshared:
return None
inp1 = node.inputs[0]
inp2 = node.inputs[1]
......@@ -3349,6 +3356,8 @@ def local_abstractconv_gw_cudnn(node):
ctx = infer_context_name(*node.inputs)
if not isinstance(node.inputs[0].type, GpuArrayType):
return
if node.op.unshared:
return None
if isinstance(node.op, AbstractConv2d_gradWeights):
return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
elif isinstance(node.op, AbstractConv3d_gradWeights):
......@@ -3360,6 +3369,8 @@ def local_abstractconv_gi_cudnn(node):
ctx = infer_context_name(*node.inputs)
if not isinstance(node.inputs[0].type, GpuArrayType):
return
if node.op.unshared:
return None
if isinstance(node.op, AbstractConv2d_gradInputs):
return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
elif isinstance(node.op, AbstractConv3d_gradInputs):
......
......@@ -1595,12 +1595,17 @@ def local_abstractconv_gemm(node):
border_mode = node.op.border_mode
subsample = node.op.subsample
filter_dilation = node.op.filter_dilation
num_groups = node.op.num_groups
unshared = node.op.unshared
if ((border_mode == 'full') and (subsample == (1, 1)) and node.op.num_groups == 1):
flip = (slice(None),) * (kern.ndim - 2) + \
(slice(None, None, -1),) * 2
kern_axes = (1, 0) + tuple(i for i in range(2, kern.ndim))
if ((border_mode == 'full') and (subsample == (1, 1)) and num_groups == 1 and not unshared):
if not node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1]
kern = kern[flip]
# need to dimshuffle the kernel for full convolution
kern = kern.dimshuffle(1, 0, 2, 3)
kern = kern.dimshuffle(kern_axes)
# call GpuCorrMM_gradInputs
rval = GpuCorrMM_gradInputs('valid',
subsample,
......@@ -1609,13 +1614,14 @@ def local_abstractconv_gemm(node):
else:
# need to flip the kernel if necessary
if node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1]
kern = kern[flip]
# By default use GpuCorrMM
rval = GpuCorrMM(border_mode,
subsample,
filter_dilation,
node.op.num_groups)(gpu_contiguous(img),
gpu_contiguous(kern))
num_groups,
unshared)(gpu_contiguous(img),
gpu_contiguous(kern))
# call GpuCorrMM_gradWeights if good
# (the latter is faster if batchsize * kernelHeight * kernelWidth
......@@ -1628,11 +1634,12 @@ def local_abstractconv_gemm(node):
(node.op.kshp is not None) and
(None not in node.op.kshp) and
border_mode != "half" and
node.op.num_groups == 1):
num_groups == 1 and
not unshared):
# we know the kernel and output size
prod1 = node.op.kshp[0] * node.op.kshp[1]
prod1 = node.op.kshp[0] * node.op.kshp[-3]
prod2 = ((node.op.imshp[-2] - node.op.kshp[0] + 1) *
(node.op.imshp[-1] - node.op.kshp[1] + 1))
(node.op.imshp[-1] - node.op.kshp[-3] + 1))
if (None not in node.op.imshp[:1]):
# we also know batchsize and input channels
prod1 *= node.op.imshp[0]
......@@ -1666,13 +1673,19 @@ def local_abstractconv_gemm_def(node):
border_mode = node.op.border_mode
subsample = node.op.subsample
filter_dilation = node.op.filter_dilation
num_groups = node.op.num_groups
unshared = node.op.unshared
if node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1]
flip = (slice(None),) * (kern.ndim - 2) + \
(slice(None, None, -1),) * 2
kern = kern[flip]
rval = GpuCorrMM(border_mode,
subsample,
filter_dilation,
node.op.num_groups)(gpu_contiguous(img),
gpu_contiguous(kern))
num_groups,
unshared)(gpu_contiguous(img),
gpu_contiguous(kern))
return [rval]
......@@ -1690,8 +1703,9 @@ def local_abstractconv_gemm_alt(node):
subsample = node.op.subsample
filter_dilation = node.op.filter_dilation
num_groups = node.op.num_groups
unshared = node.op.unshared
if border_mode == 'full' and subsample == (1, 1) and num_groups == 1:
if border_mode == 'full' and subsample == (1, 1) and num_groups == 1 and not unshared:
if not node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1]
......@@ -1702,7 +1716,7 @@ def local_abstractconv_gemm_alt(node):
gpu_contiguous(kern), gpu_contiguous(img))
elif (border_mode == 'valid' and subsample == (1, 1) and filter_dilation == (1, 1) and
num_groups == 1):
num_groups == 1 and not unshared):
if node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1]
......@@ -1896,10 +1910,13 @@ def local_abstractconv_gradweights_gemm(node):
rval = GpuCorrMM_gradWeights(border_mode=node.op.border_mode,
subsample=node.op.subsample,
filter_dilation=node.op.filter_dilation,
num_groups=node.op.num_groups)(
num_groups=node.op.num_groups,
unshared=node.op.unshared)(
gpu_contiguous(img), gpu_contiguous(topgrad), shape)
flip = (slice(None),) * (rval.ndim - 2) + \
(slice(None, None, -1),) * 2
if node.op.filter_flip:
rval = rval[:, :, ::-1, ::-1]
rval = rval[flip]
rval = tensor.patternbroadcast(rval, node.outputs[0].broadcastable)
rval = as_gpuarray_variable(rval, context_name=ctx)
return [rval]
......@@ -1918,9 +1935,10 @@ def local_abstractconv_gemm_gradweights_alt(node):
subsample = node.op.subsample
filter_dilation = node.op.filter_dilation
num_groups = node.op.num_groups
unshared = node.op.unshared
if(border_mode == 'valid' and subsample == (1, 1) and filter_dilation == (1, 1) and
num_groups == 1):
num_groups == 1 and not unshared):
rval = GpuCorrMM(border_mode,
subsample,
filter_dilation)(
......@@ -2001,12 +2019,15 @@ def local_abstractconv_gradinputs_gemm(node):
return None
if node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1]
flip = (slice(None),) * (kern.ndim - 2) + \
(slice(None, None, -1),) * 2
kern = kern[flip]
rval = GpuCorrMM_gradInputs(border_mode=node.op.border_mode,
subsample=node.op.subsample,
filter_dilation=node.op.filter_dilation,
num_groups=node.op.num_groups)(
num_groups=node.op.num_groups,
unshared=node.op.unshared)(
gpu_contiguous(kern), gpu_contiguous(topgrad), shape)
return [rval]
......@@ -2023,8 +2044,9 @@ def local_abstractconv_gradinputs_gemm_alt(node):
subsample = node.op.subsample
filter_dilation = node.op.filter_dilation
num_groups = node.op.num_groups
unshared = node.op.unshared
if border_mode == 'valid' and subsample == (1, 1) and num_groups == 1:
if border_mode == 'valid' and subsample == (1, 1) and num_groups == 1 and not unshared:
if not node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1]
......@@ -2117,8 +2139,9 @@ class ConvMetaOptimizer(LocalMetaOptimizer):
node.op.border_mode,
node.op.subsample,
node.op.filter_dilation)
convdim = img.ndim - 2
result[kshape] = theano.tensor.as_tensor_variable(node.op.kshp[2:])
result[kshape] = theano.tensor.as_tensor_variable(node.op.kshp[-convdim:])
for(var, shape) in zip((img, top), (node.op.imshp, tshp)):
result[var] = theano.shared(np.random.random(shape).astype(var.dtype),
......
......@@ -11,7 +11,7 @@ from theano.tensor.nnet.corr import CorrMM, CorrMM_gradWeights, CorrMM_gradInput
from ..type import gpuarray_shared_constructor
from ..blas import GpuCorrMM, GpuCorrMM_gradWeights, GpuCorrMM_gradInputs
from .config import mode_with_gpu, mode_without_gpu, ref_cast
from theano.tensor.nnet.tests.test_abstract_conv import Grouped_conv_noOptim
from theano.tensor.nnet.tests.test_abstract_conv import Grouped_conv_noOptim, TestUnsharedConv
class TestCorrMM(unittest.TestCase):
......@@ -20,9 +20,13 @@ class TestCorrMM(unittest.TestCase):
border_mode='valid',
filter_dilation=(1, 1),
subsample=(1, 1),
unshared=False,
verify_grad=False):
inputs_shape = [inputs_shape[i] for i in (0, 3, 1, 2)]
filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)]
if unshared:
filters_shape = [filters_shape[i] for i in (0, 1, 2, 5, 3, 4)]
else:
filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)]
inputs_val = np.random.random(inputs_shape).astype(config.floatX)
filters_val = np.random.random(filters_shape).astype(config.floatX)
......@@ -32,13 +36,15 @@ class TestCorrMM(unittest.TestCase):
conv_ref = CorrMM(border_mode=border_mode,
filter_dilation=filter_dilation,
subsample=subsample)(ref_cast(inputs),
ref_cast(filters))
subsample=subsample,
unshared=unshared)(ref_cast(inputs),
ref_cast(filters))
f_ref = theano.function([], conv_ref, mode=mode_without_gpu)
conv = GpuCorrMM(border_mode=border_mode,
filter_dilation=filter_dilation,
subsample=subsample)(inputs, filters)
subsample=subsample,
unshared=unshared)(inputs, filters)
f = theano.function([], conv, mode=mode_with_gpu)
res_ref = f_ref()
......@@ -48,7 +54,8 @@ class TestCorrMM(unittest.TestCase):
if verify_grad:
utt.verify_grad(GpuCorrMM(border_mode=border_mode,
filter_dilation=filter_dilation,
subsample=subsample),
subsample=subsample,
unshared=unshared),
[inputs_val, filters_val], mode=mode_with_gpu)
def test_valid(self):
......@@ -57,12 +64,6 @@ class TestCorrMM(unittest.TestCase):
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 6, 12, 1),
subsample=(2, 2))
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 6, 12, 1),
subsample=(2, 2))
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 6, 12, 1),
subsample=(3, 3))
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 6, 12, 1),
subsample=(3, 3))
......@@ -117,6 +118,41 @@ class TestCorrMM(unittest.TestCase):
border_mode=border_mode,
verify_grad=True)
def test_unshared(self):
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 15, 1, 6, 12, 1),
unshared=True)
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 8, 1, 6, 12, 1),
subsample=(2, 2), unshared=True)
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 5, 1, 6, 12, 1),
subsample=(3, 3), unshared=True)
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 5, 1, 6, 12, 1),
subsample=(3, 2), unshared=True)
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 15, 1, 6, 12, 1),
subsample=(1, 2), unshared=True)
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 15, 1, 6, 12, 1),
border_mode='valid', unshared=True)
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 21, 13, 6, 12, 1),
border_mode='half', unshared=True)
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 25, 23, 6, 12, 1),
border_mode='full', unshared=True)
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 15, 1, 6, 12, 1),
border_mode=(0, 0), unshared=True)
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 17, 5, 6, 12, 1),
border_mode=(1, 2), unshared=True)
self.run_conv_valid(inputs_shape=(16, 20, 12, 1),
filters_shape=(10, 21, 5, 6, 12, 1),
border_mode=(3, 2), unshared=True)
def run_gradweight(self, inputs_shape, filters_shape, dCdH_shape,
subsample=(1, 1)):
inputs_shape = [inputs_shape[i] for i in (0, 3, 1, 2)]
......@@ -227,3 +263,12 @@ class TestGroupGpuCorr2d(Grouped_conv_noOptim):
conv_op = GpuCorrMM
conv_gradw_op = GpuCorrMM_gradWeights
conv_gradi_op = GpuCorrMM_gradInputs
flip_filter = True
is_dnn = False
class TestUnsharedGpuCorr2d(TestUnsharedConv):
    # Reuse the generic unshared-convolution test suite defined in
    # theano.tensor.nnet.tests.test_abstract_conv, parameterized here to
    # run on the GPU with the GpuCorrMM family of ops.
    mode = mode_with_gpu
    # Ops under test: forward pass and both gradient directions.
    conv2d_op = GpuCorrMM
    conv2d_gradw_op = GpuCorrMM_gradWeights
    conv2d_gradi_op = GpuCorrMM_gradInputs
......@@ -37,7 +37,7 @@ from .abstract_conv import separable_conv2d
def conv2d(input, filters, input_shape=None, filter_shape=None,
border_mode='valid', subsample=(1, 1), filter_flip=True,
image_shape=None, filter_dilation=(1, 1), num_groups=1, **kwargs):
image_shape=None, filter_dilation=(1, 1), num_groups=1, unshared=False, **kwargs):
"""
This function will build the symbolic graph for convolving a mini-batch of a
stack of 2D inputs with a set of 2D filters. The implementation is modelled
......@@ -51,18 +51,22 @@ def conv2d(input, filters, input_shape=None, filter_shape=None,
(batch size, input channels, input rows, input columns).
See the optional parameter ``input_shape``.
filters: symbolic 4D tensor
filters: symbolic 4D or 6D tensor
Set of filters used in CNN layer of shape
(output channels, input channels, filter rows, filter columns).
(output channels, input channels, filter rows, filter columns)
for normal convolution and
(output channels, output rows, output columns, input channels,
filter rows, filter columns)
for unshared convolution.
See the optional parameter ``filter_shape``.
input_shape: None, tuple/list of len 4 of int or Constant variable
input_shape: None, tuple/list of len 4 or 6 of int or Constant variable
The shape of the input parameter.
Optional, possibly used to choose an optimal implementation.
You can give ``None`` for any element of the list to specify that this
element is not known at compile time.
filter_shape: None, tuple/list of len 4 of int or Constant variable
filter_shape: None, tuple/list of len 4 or 6 of int or Constant variable
The shape of the filters parameter.
Optional, possibly used to choose an optimal implementation.
You can give ``None`` for any element of the list to specify that this
......@@ -105,6 +109,11 @@ def conv2d(input, filters, input_shape=None, filter_shape=None,
Divides the image, kernel and output tensors into num_groups
separate groups. Each which carry out convolutions separately
unshared: bool
If true, then unshared or 'locally connected' convolution will be
performed. A different filter will be used for each region of the
input.
kwargs: Any other keyword arguments are accepted for backwards
compatibility, but will be ignored.
......@@ -154,12 +163,12 @@ def conv2d(input, filters, input_shape=None, filter_shape=None,
return abstract_conv2d(input, filters, input_shape, filter_shape,
border_mode, subsample, filter_flip,
filter_dilation, num_groups)
filter_dilation, num_groups, unshared)
def conv2d_transpose(input, filters, output_shape, filter_shape=None,
border_mode='valid', input_dilation=(1, 1),
filter_flip=True, filter_dilation=(1, 1), num_groups=1):
filter_flip=True, filter_dilation=(1, 1), num_groups=1, unshared=False):
"""
This function will build the symbolic graph for applying a transposed
convolution over a mini-batch of a stack of 2D inputs with a set of 2D
......@@ -215,6 +224,12 @@ def conv2d_transpose(input, filters, output_shape, filter_shape=None,
Divides the image, kernel and output tensors into num_groups
separate groups. Each which carry out convolutions separately
unshared: bool
If true, then unshared or 'locally connected' convolution will be
performed. A different filter will be used for each region of the
input.
Grouped unshared convolution is supported.
Returns
-------
Symbolic 4D tensor
......@@ -242,4 +257,5 @@ def conv2d_transpose(input, filters, output_shape, filter_shape=None,
subsample=input_dilation,
filter_flip=filter_flip,
filter_dilation=filter_dilation,
num_groups=num_groups)
num_groups=num_groups,
unshared=unshared)
......@@ -44,9 +44,14 @@ def get_conv_output_shape(image_shape, kernel_shape,
to: batch size, number of input channels, height and width (and
possibly depth) of the image. None where undefined.
kernel_shape: tuple of int (symbolic or numeric) corresponding to the
kernel shape. Its four (or five) elements must correspond respectively
to: number of output channels, number of input channels, height and
width (and possibly depth) of the kernel. None where undefined.
kernel shape. For a normal convolution, its four (for 2D convolution)
or five (for 3D convolution) elements must correspond respectively to :
number of output channels, number of input channels, height and width
(and possibly depth) of the kernel.
For an unshared 2D convolution, its six channels must correspond to :
number of output channels, height and width of the output, number of
input channels, height and width of the kernel.
None where undefined.
border_mode: string, int (symbolic or numeric) or tuple of int (symbolic
or numeric). If it is a string, it must be 'valid', 'half' or 'full'.
If it is a tuple, its two (or three) elements respectively correspond
......@@ -56,6 +61,8 @@ def get_conv_output_shape(image_shape, kernel_shape,
possibly depth) axis.
filter_dilation: tuple of int (symbolic or numeric). Its two or three
elements correspond respectively to the dilation on height and width axis.
Note - The shape of the convolution output does not depend on the 'unshared'
or the 'num_groups' parameters.
Returns
-------
......@@ -65,7 +72,10 @@ def get_conv_output_shape(image_shape, kernel_shape,
"""
bsize, imshp = image_shape[0], image_shape[2:]
nkern, kshp = kernel_shape[0], kernel_shape[2:]
convdim = len(image_shape) - 2
nkern, kshp = kernel_shape[0], kernel_shape[-convdim:]
if filter_dilation is None:
filter_dilation = np.ones(len(subsample), dtype='int')
......@@ -139,7 +149,7 @@ def get_conv_shape_1axis(image_shape, kernel_shape, border_mode,
def get_conv_gradweights_shape(image_shape, top_shape,
border_mode, subsample,
filter_dilation=None,
num_groups=1):
num_groups=1, unshared=False):
"""
This function tries to compute the kernel shape of convolution gradWeights.
......@@ -169,6 +179,8 @@ def get_conv_gradweights_shape(image_shape, top_shape,
width axis.
num_groups: An int which specifies the number of separate groups to
be divided into.
unshared: Boolean value. If true, unshared convolution will be performed,
where a different filter is applied to each area of the input.
Returns
-------
......@@ -194,7 +206,10 @@ def get_conv_gradweights_shape(image_shape, top_shape,
out_shp = tuple(get_conv_gradweights_shape_1axis(
imshp[i], topshp[i], border_mode,
subsample[i], filter_dilation[i]) for i in range(len(subsample)))
return (nchan, nkern) + out_shp
if unshared:
return (nchan,) + top_shape[2:] + (nkern,) + out_shp
else:
return (nchan, nkern) + out_shp
def get_conv_gradweights_shape_1axis(image_shape, top_shape, border_mode,
......@@ -280,6 +295,8 @@ def get_conv_gradinputs_shape(kernel_shape, top_shape,
width axis.
num_groups: An int which specifies the number of separate groups to
be divided into.
Note - The shape of the convolution output does not depend on the 'unshared'
parameter.
Returns
-------
......@@ -289,7 +306,9 @@ def get_conv_gradinputs_shape(kernel_shape, top_shape,
"""
bsize, topshp = top_shape[0], top_shape[2:]
nkern, kshp = kernel_shape[1], kernel_shape[2:]
convdim = len(top_shape) - 2
nkern, kshp = kernel_shape[1], kernel_shape[-convdim:]
if filter_dilation is None:
filter_dilation = np.ones(len(subsample), dtype='int')
......@@ -522,7 +541,8 @@ def conv2d(input,
subsample=(1, 1),
filter_flip=True,
filter_dilation=(1, 1),
num_groups=1):
num_groups=1,
unshared=False):
"""This function will build the symbolic graph for convolving a mini-batch of a
stack of 2D inputs with a set of 2D filters. The implementation is modelled
after Convolutional Neural Networks (CNN).
......@@ -538,7 +558,8 @@ def conv2d(input,
subsample=subsample,
filter_flip=filter_flip,
filter_dilation=filter_dilation,
num_groups=num_groups)
num_groups=num_groups,
unshared=unshared)
return conv_op(input, filters)
......@@ -878,7 +899,6 @@ def conv3d(input,
version until it is released.
"""
input = as_tensor_variable(input)
filters = as_tensor_variable(filters)
conv_op = AbstractConv3d(imshp=input_shape,
......@@ -899,7 +919,8 @@ def conv2d_grad_wrt_inputs(output_grad,
subsample=(1, 1),
filter_flip=True,
filter_dilation=(1, 1),
num_groups=1):
num_groups=1,
unshared=False):
"""Compute conv output gradient w.r.t its inputs
This function builds the symbolic graph for getting the
......@@ -916,10 +937,14 @@ def conv2d_grad_wrt_inputs(output_grad,
will be upsampled or the output gradient of the convolution
whose gradient will be taken with respect to the input of the
convolution.
filters : symbolic 4D tensor
set of filters used in CNN layer of shape (output channels,
input channels, filter rows, filter columns). See the
optional parameter ``filter_shape``.
filters: symbolic 4D or 6D tensor
Set of filters used in CNN layer of shape
(output channels, input channels, filter rows, filter columns)
for normal convolution and
(output channels, output rows, output columns, input channels,
filter rows, filter columns)
for unshared convolution.
See the optional parameter ``filter_shape``.
input_shape : [None/int/Constant] * 2 + [Tensor/int/Constant] * 2
The shape of the input (upsampled) parameter.
A tuple/list of len 4, with the first two dimensions
......@@ -928,8 +953,9 @@ def conv2d_grad_wrt_inputs(output_grad,
Not Optional, since given the output_grad shape
and the subsample values, multiple input_shape may be
plausible.
filter_shape : None or [None/int/Constant] * 4
The shape of the filters parameter. None or a tuple/list of len 4.
filter_shape : None or [None/int/Constant] * (4 or 6)
The shape of the filters parameter. None or a tuple/list of len 4 or a
tuple/list of len 6 (for unshared convolution)
Optional, possibly used to choose an optimal implementation.
You can give ``None`` for any element of the list to specify that
this element is not known at compile time.
......@@ -975,6 +1001,10 @@ def conv2d_grad_wrt_inputs(output_grad,
num_groups : int
Divides the image, kernel and output tensors into num_groups
separate groups. Each which carry out convolutions separately
unshared: bool
If true, then unshared or 'locally connected' convolution will be
performed. A different filter will be used for each region of the
input.
Returns
-------
......@@ -1009,7 +1039,14 @@ def conv2d_grad_wrt_inputs(output_grad,
# checking the type of filter_shape
if filter_shape is not None:
for dim in [0, 1, 2, 3]:
if unshared:
expected_dim = 6
else:
expected_dim = 4
assert len(filter_shape) == expected_dim
for dim in range(expected_dim):
assert isinstance(filter_shape[dim], (theano.tensor.TensorConstant,
integer_types, type(None)))
......@@ -1026,7 +1063,8 @@ def conv2d_grad_wrt_inputs(output_grad,
subsample=subsample,
filter_flip=filter_flip,
filter_dilation=filter_dilation,
num_groups=num_groups)
num_groups=num_groups,
unshared=unshared)
return grad_input_op(filters, output_grad, input_shape[-2:])
......@@ -1179,7 +1217,8 @@ def conv2d_grad_wrt_weights(input,
subsample=(1, 1),
filter_flip=True,
filter_dilation=(1, 1),
num_groups=1):
num_groups=1,
unshared=False):
"""Compute conv output gradient w.r.t its weights
This function will build the symbolic graph for getting the
......@@ -1195,10 +1234,10 @@ def conv2d_grad_wrt_weights(input,
mini-batch of feature map stacks, of shape (batch size, input
channels, input rows, input columns). This is the gradient of
the output of convolution.
filter_shape : [None/int/Constant] * 2 + [Tensor/int/Constant] * 2
The shape of the filter parameter. A tuple/list of len 4, with the
first two dimensions being None or int or Constant and the last two
dimensions being Tensor or int or Constant.
filter_shape : [None/int/Constant] * (2 or 4) + [Tensor/int/Constant] * 2
The shape of the filter parameter. A tuple/list of len 4 or 6
(for unshared), with the first two dimensions being None or int or
Constant and the last two dimensions being Tensor or int or Constant.
Not Optional, since given the output_grad shape and
the input_shape, multiple filter_shape may be plausible.
input_shape : None or [None/int/Constant] * 4
......@@ -1247,13 +1286,19 @@ def conv2d_grad_wrt_weights(input,
num_groups : int
Divides the image, kernel and output tensors into num_groups
separate groups. Each which carry out convolutions separately
unshared: bool
If true, then unshared or 'locally connected' convolution will be
performed. A different filter will be used for each region of the
input.
Returns
-------
symbolic 4D tensor
symbolic 4D tensor or 6D tensor
set of feature maps generated by convolutional layer. Tensor
is of shape (batch size, output channels, output rows, output
columns)
columns) for normal convolution and
(output channels, output rows, output columns, input channels,
filter rows, filter columns) for unshared convolution
Notes
-----
......@@ -1274,7 +1319,11 @@ def conv2d_grad_wrt_weights(input,
for dim in [0, 1]:
assert isinstance(filter_shape[dim], (theano.tensor.TensorConstant,
integer_types, type(None)))
for dim in [2, 3]:
if unshared:
for dim in [2, 3]:
assert isinstance(filter_shape[dim], (theano.tensor.TensorConstant,
integer_types, type(None)))
for dim in [-2, -1]:
assert isinstance(filter_shape[dim], (theano.tensor.TensorVariable,
theano.tensor.TensorConstant,
integer_types))
......@@ -1288,7 +1337,7 @@ def conv2d_grad_wrt_weights(input,
# setting the last two dimensions of filter_shape to None, if
# the type of these dimensions is TensorVariable.
numerical_filter_shape = list(filter_shape)
for dim in [2, 3]:
for dim in [-2, -1]:
if isinstance(filter_shape[dim], theano.tensor.TensorVariable):
numerical_filter_shape[dim] = None
......@@ -1298,7 +1347,8 @@ def conv2d_grad_wrt_weights(input,
subsample=subsample,
filter_flip=filter_flip,
filter_dilation=filter_dilation,
num_groups=num_groups)
num_groups=num_groups,
unshared=unshared)
return gradWeight_op(input, output_grad, filter_shape[-2:])
......@@ -1631,7 +1681,8 @@ class BaseAbstractConv(Op):
element is not known at compile time.
imshp is defined w.r.t the forward conv.
kshp: None, tuple/list of len ``(2 + convdim)`` of int or Constant variable
kshp: None, tuple/list of len ``(2 + convdim)`` or ``(2 + 2 * convdim)``
(for unshared) of int or Constant variable
The shape of the filters parameter.
Optional, possibly used to choose an optimal implementation.
You can give ``None`` for any element of the list to specify that this
......@@ -1670,14 +1721,24 @@ class BaseAbstractConv(Op):
filter_dilation: tuple of len ``convdim``
Factor by which to subsample (stride) the input.
Also called dilation factor.
num_groups : int
Divides the image, kernel and output tensors into num_groups
separate groups. Each which carry out convolutions separately
unshared: bool
If true, then unshared or 'locally connected' convolution will be
performed. A different filter will be used for each region of the
input.
"""
check_broadcast = False
__props__ = ('convdim', 'border_mode', 'subsample', 'filter_flip',
'imshp', 'kshp', 'filter_dilation', 'num_groups')
'imshp', 'kshp', 'filter_dilation', 'num_groups', 'unshared')
def __init__(self, convdim,
imshp=None, kshp=None, border_mode="valid",
subsample=None, filter_flip=True, filter_dilation=None, num_groups=1):
subsample=None, filter_flip=True, filter_dilation=None, num_groups=1,
unshared=False):
self.convdim = convdim
if convdim not in (2, 3):
......@@ -1718,7 +1779,10 @@ class BaseAbstractConv(Op):
ValueError("imshp should be None or a tuple of "
"constant int values"),
sys.exc_info()[2])
self.kshp = tuple(kshp) if kshp else (None,) * (2 + convdim)
if kshp:
self.kshp = tuple(kshp)
else:
self.kshp = (None,) * ((2 + 2 * convdim) if unshared else (2 + convdim))
for kshp_i in self.kshp:
if kshp_i is not None:
# Components of kshp should be constant or ints
......@@ -1742,6 +1806,10 @@ class BaseAbstractConv(Op):
if num_groups < 1:
raise ValueError("num_groups must have value greater than zero")
self.num_groups = num_groups
if unshared and self.convdim != 2:
raise NotImplementedError('Unshared convolution not implemented for %dD'
% self.convdim)
self.unshared = unshared
def do_constant_folding(self, node):
# Disable constant folding since there is no implementation.
......@@ -1768,11 +1836,10 @@ class BaseAbstractConv(Op):
raise NotImplementedError(
'flops not implemented for convdim={}', self.convdim)
def conv(self, img, kern, mode="valid", dilation=1, num_groups=1):
def conv(self, img, kern, mode="valid", dilation=1, num_groups=1, unshared=False, direction="forward"):
"""
Basic slow Python 2D or 3D convolution for DebugMode
"""
if not imported_scipy_signal:
raise NotImplementedError(
"AbstractConv perform requires the python package"
......@@ -1787,18 +1854,29 @@ class BaseAbstractConv(Op):
raise ValueError(
'invalid dilation {}, expected {} values'.format(dilation,
self.convdim))
if unshared and direction == "backprop weights":
if mode != "valid":
raise ValueError('conv mode for unshared backprop wrt weights must be "valid"')
# To allow the same format for the call to 'unshared2d' for all three directions,
# the out_shape is shuffled here.
# We do a transpose in the 'perform' function to bring it to the required shape
out_shape = (img.shape[0], kern.shape[0],
kern.shape[2], kern.shape[3],
img.shape[2] - kern.shape[2] + 1,
img.shape[3] - kern.shape[3] + 1)
else:
out_shape = get_conv_output_shape(img.shape, kern.shape,
mode, [1] * self.convdim, dilation)
out_shape = get_conv_output_shape(img.shape, kern.shape,
mode, [1] * self.convdim, dilation)
out = np.zeros(out_shape, dtype=img.dtype)
dil_kern_shp = kern.shape[:-self.convdim] + tuple(
(kern.shape[-self.convdim + i] - 1) * dilation[i] + 1
for i in range(self.convdim))
dilated_kern = np.zeros(dil_kern_shp, dtype=kern.dtype)
dilated_kern[(slice(None), slice(None)) +
dilated_kern[(slice(None),) * (dilated_kern.ndim - self.convdim) +
tuple(slice(None, None, dilation[i]) for i in range(self.convdim))
] = kern
out = np.zeros(out_shape, dtype=img.dtype)
if img.shape[1] % self.num_groups != 0:
raise ValueError(
......@@ -1823,11 +1901,19 @@ class BaseAbstractConv(Op):
for g in xrange(self.num_groups):
for n in xrange(output_channel_offset):
for im0 in xrange(input_channel_offset):
# some cast generates a warning here
out[b, g * output_channel_offset + n, ...] += _convolve2d(img[b, g * input_channel_offset + im0, ...],
dilated_kern[g * output_channel_offset + n,
im0, ...], 1, val, bval, 0)
if unshared:
out[b, g * output_channel_offset + n, ...] += self.unshared2d(img[b, g * input_channel_offset + im0, ...],
dilated_kern[g * output_channel_offset + n, im0, ...],
out_shape[2:], direction)
else:
# some cast generates a warning here
out[b, g * output_channel_offset + n, ...] += _convolve2d(img[b, g * input_channel_offset + im0, ...],
dilated_kern[g * output_channel_offset + n, im0, ...],
1, val, bval, 0)
elif self.convdim == 3:
if unshared:
raise NotImplementedError('Unshared 3D convolution is not implemented')
for b in xrange(img.shape[0]):
for g in xrange(self.num_groups):
for n in xrange(output_channel_offset):
......@@ -1839,6 +1925,35 @@ class BaseAbstractConv(Op):
raise NotImplementedError('only 2D and 3D convolution are implemented')
return out
def unshared2d(self, inp, kern, out_shape, direction="forward"):
'''
Basic slow Python unshared 2d convolution.
'''
if self.convdim != 2:
raise NotImplementedError('Unshared convolution not implemented for %dD'
% self.convdim)
out = np.zeros(out_shape, dtype=inp.dtype)
if direction == "forward":
for row in xrange(out_shape[0]):
for col in xrange(out_shape[1]):
out[row, col] = np.sum(np.multiply(inp[row:row + kern.shape[2],
col:col + kern.shape[3]],
kern[row, col, ::-1, ::-1]))
elif direction == "backprop weights":
for row in xrange(out_shape[0]):
for col in xrange(out_shape[1]):
out[row, col, ...] = kern[row, col] * \
inp[row:row + out_shape[2], col:col + out_shape[3]]
elif direction == "backprop inputs":
for row in xrange(kern.shape[0]):
for col in xrange(kern.shape[1]):
out[row:row + kern.shape[2], col:col + kern.shape[3]] += inp[row, col] * \
kern[row, col, ::-1, ::-1]
else:
raise ValueError("unshared2d: invalid value '{}' for 'direction'".format(direction))
return out
class AbstractConv(BaseAbstractConv):
""" Abstract Op for the forward convolution.
......@@ -1854,14 +1969,16 @@ class AbstractConv(BaseAbstractConv):
subsample=None,
filter_flip=True,
filter_dilation=None,
num_groups=1):
num_groups=1,
unshared=False):
super(AbstractConv, self).__init__(convdim=convdim,
imshp=imshp, kshp=kshp,
border_mode=border_mode,
subsample=subsample,
filter_flip=filter_flip,
filter_dilation=filter_dilation,
num_groups=num_groups)
num_groups=num_groups,
unshared=unshared)
def make_node(self, img, kern):
# Make sure both inputs are Variables with the same Type
......@@ -1875,8 +1992,14 @@ class AbstractConv(BaseAbstractConv):
if img.type.ndim != 2 + self.convdim:
raise TypeError('img must be %dD tensor' % (2 + self.convdim))
if kern.type.ndim != 2 + self.convdim:
raise TypeError('kern must be %dD tensor' % (2 + self.convdim))
if self.unshared:
if kern.type.ndim != 2 + 2 * self.convdim:
raise TypeError('kern must be %dD tensor for unshared convolution'
% (2 + 2 * self.convdim))
else:
if kern.type.ndim != 2 + self.convdim:
raise TypeError('kern must be %dD tensor' % (2 + self.convdim))
img = assert_shape(img, self.imshp,
'AbstractConv shape mismatch: shape of '
......@@ -1894,8 +2017,12 @@ class AbstractConv(BaseAbstractConv):
img, kern = inp
img = np.asarray(img)
kern = np.asarray(kern)
dil_kernshp = tuple((kern.shape[2 + i] - 1) * self.filter_dilation[i] + 1
dil_kernshp = tuple((kern.shape[-self.convdim + i] - 1) * self.filter_dilation[i] + 1
for i in range(self.convdim))
if self.unshared and self.convdim != 2:
raise NotImplementedError('Unshared convolution not implemented for %dD'
% self.convdim)
o, = out_
mode = self.border_mode
......@@ -1922,8 +2049,34 @@ class AbstractConv(BaseAbstractConv):
for i in range(self.convdim))] = img
img = new_img
if not self.filter_flip:
kern = kern[(slice(None), slice(None)) + (slice(None, None, -1),) * self.convdim]
conv_out = self.conv(img, kern, mode="valid", dilation=self.filter_dilation, num_groups=self.num_groups)
kern = kern[(slice(None),) * (kern.ndim - self.convdim) + (slice(None, None, -1),) * self.convdim]
if self.unshared:
out_shape = get_conv_output_shape(img.shape, kern.shape,
mode, self.subsample, self.filter_dilation)
if kern.shape[1:1 + self.convdim] != out_shape[2:2 + self.convdim]:
raise ValueError('Kernel shape {} does not match '
'computed output size {}'.format(kern.shape[1:1 + self.convdim],
out_shape[2:2 + self.convdim]))
if any(self.subsample[i] > 1 for i in range(self.convdim)):
# Expand regions in kernel to correct for subsampling
out_shape = get_conv_output_shape(img.shape, kern.shape,
mode, (1,) * self.convdim, self.filter_dilation)
exp_kern_shp = kern.shape[:1] + out_shape[2:2 + self.convdim] + \
kern.shape[1 + self.convdim:]
exp_kern = np.zeros(exp_kern_shp, dtype=kern.dtype)
exp_kern[(slice(None),) +
tuple(slice(None, None, self.subsample[i]) for i in range(self.convdim)) +
(slice(None),) * (self.convdim + 1)] = kern
kern = exp_kern
# from (nFilters, out_rows, out_cols, nChannels, kH, kW)
# to (nFilters, nChannels, out_rows, out_cols, kH, kW)
axes_order = (0, 1 + self.convdim,) + tuple(range(1, 1 + self.convdim)) + \
tuple(range(2 + self.convdim, kern.ndim))
kern = kern.transpose(axes_order)
conv_out = self.conv(img, kern, mode="valid", dilation=self.filter_dilation, num_groups=self.num_groups,
unshared=self.unshared)
conv_out = conv_out[(slice(None), slice(None)) +
tuple(slice(None, None, self.subsample[i])
for i in range(self.convdim))]
......@@ -1934,6 +2087,8 @@ class AbstractConv(BaseAbstractConv):
if self.num_groups > 1:
raise NotImplementedError(
'Rop not implemented for grouped convolutions')
if self.unshared:
raise NotImplementedError('Rop not implemented for unshared convolution')
rval = None
if eval_points[0] is not None:
rval = self.make_node(eval_points[0], inputs[1]).outputs[0]
......@@ -1953,8 +2108,12 @@ class AbstractConv(BaseAbstractConv):
imshp = [imshp[i] if self.imshp[i] is None else self.imshp[i]
for i in range(2 + self.convdim)]
if self.kshp is not None:
kshp = [kshp[i] if self.kshp[i] is None else self.kshp[i]
for i in range(2 + self.convdim)]
if self.unshared:
kshp = [kshp[i] if self.kshp[i] is None else self.kshp[i]
for i in range(2 + 2 * self.convdim)]
else:
kshp = [kshp[i] if self.kshp[i] is None else self.kshp[i]
for i in range(2 + self.convdim)]
res = get_conv_output_shape(imshp, kshp, self.border_mode,
self.subsample, self.filter_dilation)
return [res]
......@@ -1973,14 +2132,16 @@ class AbstractConv2d(AbstractConv):
subsample=(1, 1),
filter_flip=True,
filter_dilation=(1, 1),
num_groups=1):
num_groups=1,
unshared=False):
super(AbstractConv2d, self).__init__(convdim=2,
imshp=imshp, kshp=kshp,
border_mode=border_mode,
subsample=subsample,
filter_flip=filter_flip,
filter_dilation=filter_dilation,
num_groups=num_groups)
num_groups=num_groups,
unshared=unshared)
def grad(self, inp, grads):
bottom, weights = inp
......@@ -1991,14 +2152,16 @@ class AbstractConv2d(AbstractConv):
self.subsample,
self.filter_flip,
self.filter_dilation,
num_groups=self.num_groups)(
num_groups=self.num_groups,
unshared=self.unshared)(
weights, top, bottom.shape[-2:], add_assert_shape=False)
d_weights = AbstractConv2d_gradWeights(self.imshp, self.kshp,
self.border_mode,
self.subsample,
self.filter_flip,
self.filter_dilation,
num_groups=self.num_groups)(
num_groups=self.num_groups,
unshared=self.unshared)(
bottom, top, weights.shape[-2:], add_assert_shape=False)
......@@ -2085,14 +2248,16 @@ class AbstractConv_gradWeights(BaseAbstractConv):
subsample=None,
filter_flip=True,
filter_dilation=None,
num_groups=1):
num_groups=1,
unshared=False):
super(AbstractConv_gradWeights, self).__init__(convdim=convdim,
imshp=imshp, kshp=kshp,
border_mode=border_mode,
subsample=subsample,
filter_flip=filter_flip,
filter_dilation=filter_dilation,
num_groups=num_groups)
num_groups=num_groups,
unshared=unshared)
# Update shape/height_width
def make_node(self, img, topgrad, shape, add_assert_shape=True):
......@@ -2115,8 +2280,12 @@ class AbstractConv_gradWeights(BaseAbstractConv):
'image does not match given imshp.')
shape = as_tensor_variable(shape)
broadcastable = [topgrad.broadcastable[1],
img.broadcastable[1]] + ([False] * self.convdim)
if self.unshared:
broadcastable = [topgrad.broadcastable[1]] + ([False] * self.convdim) + \
[img.broadcastable[1]] + ([False] * self.convdim)
else:
broadcastable = [topgrad.broadcastable[1],
img.broadcastable[1]] + ([False] * self.convdim)
output = img.type.clone(broadcastable=broadcastable)()
return Apply(self, [img, topgrad, shape], [output])
......@@ -2134,6 +2303,9 @@ class AbstractConv_gradWeights(BaseAbstractConv):
'invalid border_mode {}, which must be either '
'"valid", "full", "half", an integer or a tuple of'
' integers'.format(mode))
if self.unshared and self.convdim != 2:
raise NotImplementedError('Unshared convolution not implemented for %dD'
% self.convdim)
dil_shape = tuple((shape[i] - 1) * self.filter_dilation[i] + 1
for i in range(self.convdim))
......@@ -2166,9 +2338,7 @@ class AbstractConv_gradWeights(BaseAbstractConv):
topgrad = new_topgrad
axes_order = (1, 0) + tuple(range(2, self.convdim + 2))
flip_filters = ((slice(None), slice(None)) +
(slice(None, None, -1),) * self.convdim)
topgrad = topgrad.transpose(axes_order)[flip_filters]
topgrad = topgrad.transpose(axes_order)
img = img.transpose(axes_order)
def correct_for_groups(mat):
......@@ -2182,15 +2352,36 @@ class AbstractConv_gradWeights(BaseAbstractConv):
if self.num_groups > 1:
img = correct_for_groups(img)
kern = self.conv(img, topgrad, mode="valid", num_groups=self.num_groups)
if self.unshared:
flip_kern = ((slice(None),) * (2 + self.convdim) +
(slice(None, None, -1),) * self.convdim)
kern = self.conv(img, topgrad, mode="valid", num_groups=self.num_groups,
unshared=True, direction="backprop weights")
if any(self.subsample[i] > 1 for i in range(self.convdim)):
sub_slice = (slice(None),) * 2 + \
tuple(slice(None, None, self.subsample[i]) for i in range(0, self.convdim)) + \
(slice(None),) * self.convdim
kern = kern[sub_slice]
# from (nChannels, nFilters, out_rows, out_cols, kH, kW)
# to (nFilters, out_rows, out_cols, nChannels, kH, kW)
kern_axes = (1,) + tuple(range(2, self.convdim + 2)) + (0,) + \
tuple(range(self.convdim + 2, kern.ndim))
else:
flip_topgrad = flip_kern = ((slice(None), slice(None)) +
(slice(None, None, -1),) * self.convdim)
topgrad = topgrad[flip_topgrad]
kern = self.conv(img, topgrad, mode="valid", num_groups=self.num_groups)
kern_axes = (1, 0) + tuple(range(2, self.convdim + 2))
kern = kern.transpose(kern_axes)
if any(self.filter_dilation[i] > 1 for i in range(self.convdim)):
kern = kern[(slice(None), slice(None)) +
kern = kern[(slice(None),) * (kern.ndim - self.convdim) +
tuple(slice(None, None, self.filter_dilation[i])
for i in range(self.convdim))]
if self.filter_flip:
kern = kern.transpose(axes_order)[flip_filters]
else:
kern = kern.transpose(axes_order)
kern = kern[flip_kern]
o[0] = node.outputs[0].type.filter(kern)
def connection_pattern(self, node):
......@@ -2203,15 +2394,24 @@ class AbstractConv_gradWeights(BaseAbstractConv):
# from the shapes of inputs.
imshp = input_shapes[0]
topshp = input_shapes[1]
kshp = self.kshp[:] if self.kshp is not None else [None] * (2 + self.convdim)
if self.num_groups > 1:
fallback_kshp = ([topshp[1], imshp[1] // self.num_groups] +
if self.kshp:
kshp = self.kshp
else:
if self.unshared:
kshp = [None] * (2 + 2 * self.convdim)
else:
kshp = [None] * (2 + self.convdim)
if self.unshared:
fallback_kshp = ([topshp[1], topshp[2], topshp[3], imshp[1] // self.num_groups] +
[node.inputs[2][i] for i in range(self.convdim)])
kshp = [fallback_kshp[i] if kshp[i] is None else kshp[i]
for i in range(2 + 2 * self.convdim)]
else:
fallback_kshp = ([topshp[1], imshp[1]] +
fallback_kshp = ([topshp[1], imshp[1] // self.num_groups] +
[node.inputs[2][i] for i in range(self.convdim)])
kshp = [fallback_kshp[i] if kshp[i] is None else kshp[i]
for i in range(2 + self.convdim)]
kshp = [fallback_kshp[i] if kshp[i] is None else kshp[i]
for i in range(2 + self.convdim)]
return [kshp]
......@@ -2232,14 +2432,16 @@ class AbstractConv2d_gradWeights(AbstractConv_gradWeights):
subsample=(1, 1),
filter_flip=True,
filter_dilation=(1, 1),
num_groups=1):
num_groups=1,
unshared=False):
super(AbstractConv2d_gradWeights, self).__init__(convdim=2,
imshp=imshp, kshp=kshp,
border_mode=border_mode,
subsample=subsample,
filter_flip=filter_flip,
filter_dilation=filter_dilation,
num_groups=num_groups)
num_groups=num_groups,
unshared=unshared)
def grad(self, inp, grads):
bottom, top = inp[:2]
......@@ -2249,16 +2451,18 @@ class AbstractConv2d_gradWeights(AbstractConv_gradWeights):
self.subsample,
self.filter_flip,
self.filter_dilation,
self.num_groups)(weights,
top,
bottom.shape[-2:])
self.num_groups,
self.unshared)(weights,
top,
bottom.shape[-2:])
d_top = AbstractConv2d(self.imshp,
self.kshp,
self.border_mode,
self.subsample,
self.filter_flip,
self.filter_dilation,
self.num_groups)(bottom, weights)
self.num_groups,
self.unshared)(bottom, weights)
# Make sure that the broadcastable pattern of the inputs is used
# for the gradients, even if the grad opts are not able to infer
# that the dimensions are broadcastable.
......@@ -2350,14 +2554,16 @@ class AbstractConv_gradInputs(BaseAbstractConv):
subsample=None,
filter_flip=True,
filter_dilation=None,
num_groups=1):
num_groups=1,
unshared=False):
super(AbstractConv_gradInputs, self).__init__(convdim=convdim,
imshp=imshp, kshp=kshp,
border_mode=border_mode,
subsample=subsample,
filter_flip=filter_flip,
filter_dilation=filter_dilation,
num_groups=num_groups)
num_groups=num_groups,
unshared=unshared)
# Update shape/height_width
def make_node(self, kern, topgrad, shape, add_assert_shape=True):
......@@ -2370,10 +2576,19 @@ class AbstractConv_gradInputs(BaseAbstractConv):
broadcastable=topgrad.broadcastable)
topgrad = gtype.filter_variable(topgrad)
if kern.type.ndim != 2 + self.convdim:
raise TypeError('kern must be %dD tensor' % (2 + self.convdim))
if self.unshared:
if self.convdim != 2:
raise NotImplementedError('Unshared convolution not implemented for %dD'
% self.convdim)
elif kern.type.ndim != 2 + 2 * self.convdim:
raise TypeError('kern must be %dD tensor for unshared convolution'
% (2 + 2 * self.convdim))
else:
if kern.type.ndim != 2 + self.convdim:
raise TypeError('kern must be %dD tensor' % (2 + self.convdim))
if topgrad.type.ndim != 2 + self.convdim:
raise TypeError('topgrad must be %dD tensor' % (2 + self.convdim))
raise TypeError('topgrad must be %dD tensor' % (2 + self.convdim))
if add_assert_shape:
kern = assert_shape(kern, self.kshp,
......@@ -2386,7 +2601,7 @@ class AbstractConv_gradInputs(BaseAbstractConv):
False] + ([False] * self.convdim)
else:
broadcastable = [topgrad.type.broadcastable[0],
kern.type.broadcastable[1]] + ([False] * self.convdim)
kern.type.broadcastable[-self.convdim - 1]] + ([False] * self.convdim)
output = kern.type.clone(broadcastable=broadcastable)()
return Apply(self, [kern, topgrad, shape], [output])
......@@ -2403,9 +2618,12 @@ class AbstractConv_gradInputs(BaseAbstractConv):
'invalid border_mode {}, which must be either '
'"valid", "full", "half", an integer or a tuple of'
' integers'.format(mode))
if self.unshared and self.convdim != 2:
raise NotImplementedError('Unshared convolution not implemented for %dD'
% self.convdim)
imshp = self.imshp[:] if self.imshp is not None else [None] * (2 + self.convdim)
fallback_imshp = ([topgrad.shape[0], kern.shape[1]] +
fallback_imshp = ([topgrad.shape[0], kern.shape[-self.convdim - 1]] +
[shape[i] for i in range(self.convdim)])
imshp = [fallback_imshp[i] if imshp[i] is None else imshp[i]
for i in range(2 + self.convdim)]
......@@ -2419,8 +2637,9 @@ class AbstractConv_gradInputs(BaseAbstractConv):
'has shape {}'.format(tuple(expected_topgrad_shape),
tuple(topgrad.shape)))
dil_kernshp = tuple((kern.shape[i + 2] - 1) * self.filter_dilation[i] + 1
dil_kernshp = tuple((kern.shape[-self.convdim + i] - 1) * self.filter_dilation[i] + 1
for i in range(self.convdim))
pad = (0,) * self.convdim
if mode == "full":
pad = tuple(dil_kernshp[i] - 1 for i in range(self.convdim))
......@@ -2438,25 +2657,54 @@ class AbstractConv_gradInputs(BaseAbstractConv):
for i in range(self.convdim))] = topgrad
topgrad = new_topgrad
axes_order = (1, 0) + tuple(range(2, self.convdim + 2))
flip_filters = ((slice(None), slice(None)) +
(slice(None, None, -1),) * self.convdim)
if self.unshared:
# Expand regions in kernel to correct for subsampling
exp_kern_shp = kern.shape[:1] + topgrad.shape[2:] + kern.shape[1 + self.convdim:]
exp_kern = np.zeros(exp_kern_shp, dtype=kern.dtype)
exp_kern[(slice(None),) +
tuple(slice(None, None, self.subsample[i]) for i in range(self.convdim)) +
(slice(None),) * (self.convdim + 1)] = kern
kern = exp_kern
def correct_for_groups(mat):
mshp0 = mat.shape[0] // self.num_groups
mshp1 = mat.shape[1] * self.num_groups
mshp1 = mat.shape[-self.convdim - 1] * self.num_groups
mat = mat.reshape((self.num_groups, mshp0) + mat.shape[1:])
mat = mat.transpose((1, 0, 2) + tuple(range(3, 3 + self.convdim)))
mat = mat.reshape((mshp0, mshp1) + mat.shape[-self.convdim:])
if self.unshared:
# for 2D -> (1, 2, 3, 0, 4, 5, 6)
mat = mat.transpose(tuple(range(1, 2 + self.convdim)) + (0,) +
tuple(range(2 + self.convdim, mat.ndim)))
mat = mat.reshape((mshp0,) + mat.shape[1:1 + self.convdim] + (mshp1,) + mat.shape[-self.convdim:])
else:
mat = mat.transpose((1, 0, 2) + tuple(range(3, 3 + self.convdim)))
mat = mat.reshape((mshp0, mshp1) + mat.shape[-self.convdim:])
return mat
kern = correct_for_groups(kern)
kern = kern.transpose(axes_order)
if self.filter_flip:
topgrad = topgrad[flip_filters]
img = self.conv(topgrad, kern, mode="full", dilation=self.filter_dilation, num_groups=self.num_groups)
if self.filter_flip:
img = img[flip_filters]
if self.unshared:
# from (nFilters, out_rows, out_cols, nChannels, kH, kW)
# to (nChannels, nFilters, out_rows, out_cols, kH, kW)
axes_order = (1 + self.convdim, 0,) + tuple(range(1, 1 + self.convdim)) + \
tuple(range(2 + self.convdim, kern.ndim))
kern = kern.transpose(axes_order)
if not self.filter_flip:
kern = kern[(slice(None),) * (kern.ndim - self.convdim) +
(slice(None, None, -1),) * self.convdim]
img = self.conv(topgrad, kern, mode="full", dilation=self.filter_dilation,
num_groups=self.num_groups, unshared=True, direction="backprop inputs")
else:
axes_order = (1, 0) + tuple(range(2, 2 + self.convdim))
kern = kern.transpose(axes_order)
flip_filters = ((slice(None), slice(None)) +
(slice(None, None, -1),) * self.convdim)
if self.filter_flip:
topgrad = topgrad[flip_filters]
img = self.conv(topgrad, kern, mode="full", dilation=self.filter_dilation,
num_groups=self.num_groups)
if self.filter_flip:
img = img[flip_filters]
if any(p > 0 for p in pad):
img = img[(slice(None), slice(None)) +
tuple(slice(pad[i], img.shape[i + 2] - pad[i])
......@@ -2475,10 +2723,10 @@ class AbstractConv_gradInputs(BaseAbstractConv):
topshp = input_shapes[1]
imshp = self.imshp[:] if self.imshp is not None else [None] * (2 + self.convdim)
if self.num_groups > 1:
fallback_imshp = ([topshp[0], kshp[1] * self.num_groups] +
fallback_imshp = ([topshp[0], kshp[-self.convdim - 1] * self.num_groups] +
[node.inputs[2][i] for i in range(self.convdim)])
else:
fallback_imshp = ([topshp[0], kshp[1]] +
fallback_imshp = ([topshp[0], kshp[-self.convdim - 1]] +
[node.inputs[2][i] for i in range(self.convdim)])
imshp = [fallback_imshp[i] if imshp[i] is None else imshp[i]
for i in range(2 + self.convdim)]
......@@ -2503,14 +2751,16 @@ class AbstractConv2d_gradInputs(AbstractConv_gradInputs):
subsample=(1, 1),
filter_flip=True,
filter_dilation=(1, 1),
num_groups=1):
num_groups=1,
unshared=False):
super(AbstractConv2d_gradInputs, self).__init__(convdim=2,
imshp=imshp, kshp=kshp,
border_mode=border_mode,
subsample=subsample,
filter_flip=filter_flip,
filter_dilation=filter_dilation,
num_groups=num_groups)
num_groups=num_groups,
unshared=unshared)
def grad(self, inp, grads):
weights, top = inp[:2]
......@@ -2520,7 +2770,8 @@ class AbstractConv2d_gradInputs(AbstractConv_gradInputs):
self.subsample,
self.filter_flip,
self.filter_dilation,
self.num_groups)(
self.num_groups,
self.unshared)(
bottom, top,
weights.shape[-2:])
d_top = AbstractConv2d(self.imshp, self.kshp,
......@@ -2528,7 +2779,8 @@ class AbstractConv2d_gradInputs(AbstractConv_gradInputs):
self.subsample,
self.filter_flip,
self.filter_dilation,
self.num_groups)(bottom, weights)
self.num_groups,
self.unshared)(bottom, weights)
# Make sure that the broadcastable pattern of the inputs is used
# for the gradients, even if the grad opts are not able to infer
# that the dimensions are broadcastable.
......
......@@ -107,7 +107,8 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
const int dilW = 1,
const int padH = 0,
const int padW = 0,
const int numgroups = 1)
const int numgroups = 1,
const int unshared = 0)
{
if (PyArray_NDIM(bottom) != 4)
{
......@@ -120,9 +121,9 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
return NULL;
}
if (PyArray_NDIM(weight) != 4)
if (PyArray_NDIM(weight) != (unshared ? 6 : 4))
{
PyErr_SetString(PyExc_ValueError, "CorrMM requires weight of 4D");
PyErr_Format(PyExc_ValueError, "CorrMM requires weight of %%dD", unshared ? 6 : 4);
return NULL;
}
if (PyArray_TYPE(weight) != %(float_typenum)s)
......@@ -152,11 +153,12 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
const int nChannels = PyArray_DIMS(bottom)[1];
const int bottomHeight = PyArray_DIMS(bottom)[2];
const int bottomWidth = PyArray_DIMS(bottom)[3];
// weights: (nFilters, nChannels, rows, columns)
// normal weights: (nFilters, nChannels, rows, columns)
// unshared weights: (nFilters, topHeight, topWidth, nChannels, rows, columns)
const int nFilters = PyArray_DIMS(weight)[0];
const int kH = PyArray_DIMS(weight)[2];
const int kW = PyArray_DIMS(weight)[3];
if (nChannels != (PyArray_DIMS(weight)[1] * numgroups)) {
const int kH = PyArray_DIMS(weight)[unshared ? 4 : 2];
const int kW = PyArray_DIMS(weight)[unshared ? 5 : 3];
if (nChannels != PyArray_DIMS(weight)[unshared ? 3 : 1] * numgroups) {
PyErr_SetString(PyExc_ValueError,
"CorrMM images and kernel must have the same stack size\n");
return NULL;
......@@ -179,22 +181,57 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
const int topHeight = _CONV_FLOORDIV_X(topHeightNoDH, dH) + 1;
const int topWidth = _CONV_FLOORDIV_X(topWidthNoDW, dW) + 1;
#undef _CONV_FLOORDIV
if (batchSize != PyArray_DIMS(top)[0] ||
nFilters != PyArray_DIMS(top)[1] ||
topHeight != PyArray_DIMS(top)[2] ||
topWidth != PyArray_DIMS(top)[3]) {
PyErr_Format(PyExc_ValueError,
"CorrMM shape inconsistency:\n"
" bottom shape: %%d %%d %%d %%d\n"
" weight shape: %%d %%d %%d %%d\n"
" top shape: %%ld %%ld %%ld %%ld (expected %%d %%d %%d %%d)\n",
batchSize, nChannels, bottomHeight, bottomWidth,
nFilters, nChannels / numgroups, kH, kW,
PyArray_DIMS(top)[0], PyArray_DIMS(top)[1],
PyArray_DIMS(top)[2], PyArray_DIMS(top)[3],
batchSize, nFilters, topHeight, topWidth);
return NULL;
if (unshared) {
if (topHeight != PyArray_DIMS(weight)[1] ||
topWidth != PyArray_DIMS(weight)[2]) {
PyErr_Format(PyExc_ValueError,
"CorrMM regions in kernel must match output regions:\n"
" bottom shape: %%d %%d %%d %%d\n"
" weight shape: %%d %%ld %%ld %%d %%d %%d"
" (expected %%d %%d %%d %%d %%d %%d)\n"
" top shape(calculated): %%d %%d %%d %%d\n",
batchSize, nChannels, bottomHeight, bottomWidth,
nFilters, PyArray_DIMS(weight)[1],
PyArray_DIMS(weight)[2], nChannels / numgroups, kH, kW,
nFilters, topHeight, topWidth, nChannels / numgroups, kH, kW,
batchSize, nFilters, topHeight, topWidth);
return NULL;
}
if (batchSize != PyArray_DIMS(top)[0] ||
nFilters != PyArray_DIMS(top)[1] ||
topHeight != PyArray_DIMS(top)[2] ||
topWidth != PyArray_DIMS(top)[3]) {
PyErr_Format(PyExc_ValueError,
"CorrMM shape inconsistency:\n"
" bottom shape: %%d %%d %%d %%d\n"
" weight shape: %%d %%d %%d %%d %%d %%d\n"
" top shape: %%ld %%ld %%ld %%ld (expected %%d %%d %%d %%d)\n",
batchSize, nChannels, bottomHeight, bottomWidth,
nFilters, topHeight, topWidth, nChannels / numgroups, kH, kW,
PyArray_DIMS(top)[0], PyArray_DIMS(top)[1],
PyArray_DIMS(top)[2], PyArray_DIMS(top)[3],
batchSize, nFilters, topHeight, topWidth);
return NULL;
}
}
else {
if (batchSize != PyArray_DIMS(top)[0] ||
nFilters != PyArray_DIMS(top)[1] ||
topHeight != PyArray_DIMS(top)[2] ||
topWidth != PyArray_DIMS(top)[3]) {
PyErr_Format(PyExc_ValueError,
"CorrMM shape inconsistency:\n"
" bottom shape: %%d %%d %%d %%d\n"
" weight shape: %%d %%d %%d %%d\n"
" top shape: %%ld %%ld %%ld %%ld (expected %%d %%d %%d %%d)\n",
batchSize, nChannels, bottomHeight, bottomWidth,
nFilters, nChannels / numgroups, kH, kW,
PyArray_DIMS(top)[0], PyArray_DIMS(top)[1],
PyArray_DIMS(top)[2], PyArray_DIMS(top)[3],
batchSize, nFilters, topHeight, topWidth);
return NULL;
}
}
// Create temporary columns
int max_threads = %(omp_get_max_threads)s;
......@@ -230,8 +267,10 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
const int group_col_stride = (K_ * N_);
const int group_weight_stride = (PyArray_STRIDES(weight)[0] * nFilters / numgroups)/%(n_bytes)f;
const int M_ = nFilters / numgroups;
const int one_int = 1;
const %(c_float_type)s one = 1.0;
const %(c_float_type)s zero = 0.0;
const int ldw = (K_ * N_);
char NTrans = 'N';
char Trans = 'T';
PyArrayObject *output;
......@@ -266,15 +305,30 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
im2col((%(float_type)s*)PyArray_DATA(bottom) + n * batch_bottom_stride, nChannels,
bottomHeight,bottomWidth, kH, kW, dilH, dilW, padH, padW, dH, dW,
(%(float_type)s*)PyArray_DATA(col)+ tid * col_stride);
for ( int g = 0; g < numgroups; ++g){
// Second, gemm
%(gemm)s(&NTrans, &NTrans,
&N_, &M_, &K_,
&one,
(%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride, &N_,
(%(float_type)s*)PyArray_DATA(weight) + g * group_weight_stride, &K_,
&zero,
(%(float_type)s*)PyArray_DATA(top) + n * batch_top_stride + g * group_top_stride, &N_);
// Second, gemm
if (unshared) {
for (int g = 0; g < numgroups; ++g) {
for (int reg = 0; reg < N_; ++reg) {
%(gemv)s(&Trans, &K_, &M_,
&one,
(%(float_type)s*)PyArray_DATA(weight) + g * group_weight_stride + reg * K_, &ldw,
(%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride + reg, &N_,
&zero,
(%(float_type)s*)PyArray_DATA(top) + n * batch_top_stride + g * group_top_stride + reg, &N_);
}
}
}
else {
for ( int g = 0; g < numgroups; ++g){
// Second, gemm
%(gemm)s(&NTrans, &NTrans,
&N_, &M_, &K_,
&one,
(%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride, &N_,
(%(float_type)s*)PyArray_DATA(weight) + g * group_weight_stride, &K_,
&zero,
(%(float_type)s*)PyArray_DATA(top) + n * batch_top_stride + g * group_top_stride, &N_);
}
}
}
// Restore to previous blas threads
......@@ -316,7 +370,10 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
output = weight;
npy_intp weight_dim[2];
weight_dim[0] = (npy_intp)max_threads;
weight_dim[1] = (npy_intp)(M_ * K_ * numgroups);
if (unshared)
weight_dim[1] = (npy_intp)(M_ * N_ * K_ * numgroups);
else
weight_dim[1] = (npy_intp)(M_ * K_ * numgroups);
PyArrayObject* local_weight = (PyArrayObject*)PyArray_ZEROS(2,
weight_dim, PyArray_TYPE(weight), 0);
......@@ -341,26 +398,46 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
im2col((%(float_type)s*)PyArray_DATA(bottom) + n * batch_bottom_stride,
nChannels, bottomHeight,bottomWidth, kH, kW, dilH, dilW, padH, padW, dH, dW,
(%(float_type)s*)PyArray_DATA(col)+ tid * col_stride);
for(int g = 0; g < numgroups; ++g){
// Second, gemm
// Note that we accumulate into weight. We do so by setting beta = 0
// for the first iteration and beta = 1 for subsequent ones. (This
// is faster than setting weight to all zeros before the loop.)
%(gemm)s(&Trans, &NTrans,
&K_, &M_, &N_,
&one,
(%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride, &N_,
(%(float_type)s*)PyArray_DATA(top) + g * group_top_stride + n * batch_top_stride, &N_,
(n == 0) ? &zero : &one,
(%(float_type)s*)PyArray_DATA(local_weight) + g * group_weight_stride +
tid * weight_dim[1], &K_);
// Second, gemm
// Note that we accumulate into weight. We do so by setting beta = 0
// for the first iteration and beta = 1 for subsequent ones. (This
// is faster than setting weight to all zeros before the loop.)
if (unshared) {
for (int g = 0; g < numgroups; ++g) {
for (int reg = 0; reg < N_; ++reg) {
%(gemm)s(&Trans, &NTrans,
&K_, &M_, &one_int,
&one,
(%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride + reg, &N_,
(%(float_type)s*)PyArray_DATA(top) + g * group_top_stride + n * batch_top_stride + reg, &N_,
(n == 0) ? &zero : &one,
(%(float_type)s*)PyArray_DATA(local_weight) + g * group_weight_stride + reg * K_ +
tid * weight_dim[1], &ldw);
}
}
}
else {
for(int g = 0; g < numgroups; ++g){
// Second, gemm
// Note that we accumulate into weight. We do so by setting beta = 0
// for the first iteration and beta = 1 for subsequent ones. (This
// is faster than setting weight to all zeros before the loop.)
%(gemm)s(&Trans, &NTrans,
&K_, &M_, &N_,
&one,
(%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride, &N_,
(%(float_type)s*)PyArray_DATA(top) + g * group_top_stride + n * batch_top_stride, &N_,
(n == 0) ? &zero : &one,
(%(float_type)s*)PyArray_DATA(local_weight) + g * group_weight_stride +
tid * weight_dim[1], &K_);
}
}
}
// Restore to previous blas threads
%(blas_set_num_threads)s(blas_threads_saved);
//aggregate weights
memset((%(float_type)s*)PyArray_DATA(weight), 0, M_ * K_*sizeof(%(float_type)s));
memset((%(float_type)s*)PyArray_DATA(weight), 0, weight_dim[1]*sizeof(%(float_type)s));
/*
* Put index "j" into outer loop to get the
* correct result when openmp is used.
......@@ -416,15 +493,29 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
%(omp_flags)s
for (int n = 0; n < batchSize; ++n) {
int tid = %(omp_get_thread_num)s;
for ( int g = 0;g < numgroups; ++g){
// gemm into columns
%(gemm)s(&NTrans, &Trans,
&N_, &K_, &M_,
&one,
(%(float_type)s*)PyArray_DATA(top) + g * group_top_stride + n * batch_top_stride, &N_,
(%(float_type)s*)PyArray_DATA(weight) + g * group_weight_stride, &K_,
&zero,
(%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride, &N_);
if (unshared) {
for (int g = 0; g < numgroups; ++g){
for (int reg = 0; reg < N_; ++reg){
%(gemm)s(&NTrans, &Trans,
&one_int, &K_, &M_,
&one,
(%(float_type)s*)PyArray_DATA(top) + g * group_top_stride + n * batch_top_stride + reg, &N_,
(%(float_type)s*)PyArray_DATA(weight) + g * group_weight_stride + reg * K_, &ldw,
&zero,
(%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride + reg, &N_);
}
}
}
else {
for (int g = 0; g < numgroups; ++g) {
%(gemm)s(&NTrans, &Trans,
&N_, &K_, &M_,
&one,
(%(float_type)s*)PyArray_DATA(top) + g * group_top_stride + n * batch_top_stride, &N_,
(%(float_type)s*)PyArray_DATA(weight) + g * group_weight_stride, &K_,
&zero,
(%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride, &N_);
}
}
// col2im back to the data
col2im((%(float_type)s*)PyArray_DATA(col) + tid * col_stride, nChannels, bottomHeight, bottomWidth,
......@@ -474,4 +565,3 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
// in here output is just aliased to one of bottom, weights, or top.
return output;
}
......@@ -8,7 +8,7 @@ import theano
from theano import Apply
from theano import gof
from theano.gof import ParamsType, EnumList
from theano.scalar import int64
from theano.scalar import int64, int8
from theano.tensor import as_tensor_variable, TensorType
from theano.tensor.nnet.abstract_conv import get_conv_output_shape
from theano.tensor import blas_headers
......@@ -42,9 +42,11 @@ class BaseCorrMM(gof.OpenMPOp):
Perform dilated correlation (default: (1,1))
num_groups
Perform grouped convolutions (default: 1)
unshared
Perform unshared correlation (default: False)
"""
check_broadcast = False
__props__ = ('border_mode', 'subsample', 'filter_dilation', 'num_groups')
__props__ = ('border_mode', 'subsample', 'filter_dilation', 'num_groups', 'unshared')
_direction = None
......@@ -54,10 +56,10 @@ class BaseCorrMM(gof.OpenMPOp):
dH=int64, dW=int64,
dilH=int64, dilW=int64,
padH=int64, padW=int64,
num_groups=int64)
num_groups=int64, unshared=int8)
def __init__(self, border_mode="valid", subsample=(1, 1),
filter_dilation=(1, 1), num_groups=1, openmp=None):
filter_dilation=(1, 1), num_groups=1, unshared=False, openmp=None):
super(BaseCorrMM, self).__init__(openmp=openmp)
if isinstance(border_mode, integer_types):
if border_mode < 0:
......@@ -85,6 +87,7 @@ class BaseCorrMM(gof.OpenMPOp):
raise ValueError("filter_dilation must have two elements")
self.subsample = tuple(subsample)
self.filter_dilation = tuple(filter_dilation)
self.unshared = unshared
if not theano.config.blas.ldflags:
# Theano will use a NumPy C implementation of [sd]gemm_ instead.
......@@ -130,12 +133,13 @@ class BaseCorrMM(gof.OpenMPOp):
padW = property(lambda self: self.pad[1])
def __str__(self):
return '%s{%s, %s, %s, %s}' % (
return '%s{%s, %s, %s, %s, %s}' % (
self.__class__.__name__,
self.border_mode,
str(self.subsample),
str(self.filter_dilation),
str(self.num_groups))
str(self.num_groups),
str(self.unshared))
@staticmethod
def as_common_dtype(in1, in2):
......@@ -179,7 +183,7 @@ class BaseCorrMM(gof.OpenMPOp):
def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files
return (7, self.openmp, blas_header_version())
return (9, self.openmp, blas_header_version())
def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of
......@@ -189,12 +193,14 @@ class BaseCorrMM(gof.OpenMPOp):
assert dtype in ('float32', 'float64')
if dtype == 'float32':
sub['gemm'] = 'sgemm_'
sub['gemv'] = 'sgemv_'
sub['float_type'] = 'npy_float'
sub['float_typenum'] = 'NPY_FLOAT'
sub['n_bytes'] = 4
sub['c_float_type'] = 'float'
else:
sub['gemm'] = 'dgemm_'
sub['gemv'] = 'dgemv_'
sub['float_type'] = 'npy_double'
sub['float_typenum'] = 'NPY_DOUBLE'
sub['n_bytes'] = 8
......@@ -287,6 +293,7 @@ class BaseCorrMM(gof.OpenMPOp):
int padH = %(params)s->padH;
int padW = %(params)s->padW;
int numgroups = %(params)s->num_groups;
int unshared = %(params)s->unshared;
PyArrayObject * bottom = %(bottom)s;
PyArrayObject * weights = %(weights)s;
......@@ -310,13 +317,17 @@ class BaseCorrMM(gof.OpenMPOp):
break;
}
int wdim, odim;
wdim = unshared ? 6 : 4;
odim = 4; //Can be set to 6 later for unshared backprop wrt weights
// Obtain or infer kernel width and height
// (we need to know it early to be able to handle auto-padding)
int kH, kW, dil_kH, dil_kW;
if (direction != 1) {
// weight is an input variable, we can just read its shape
kH = PyArray_DIMS(weights)[2];
kW = PyArray_DIMS(weights)[3];
kH = PyArray_DIMS(weights)[wdim-2];
kW = PyArray_DIMS(weights)[wdim-1];
}
else {
if (%(height)s != -1) {
......@@ -370,28 +381,45 @@ class BaseCorrMM(gof.OpenMPOp):
}
// Infer output shape
npy_intp out_dim[4];
npy_intp out_dim[6];
out_dim[4] = out_dim[5] = 0; //Only used for unshared backprop wrt weights
switch(direction) {
case 0: // forward pass
// output is top: (batchsize, num_filters, height, width)
// height and width: top = (bottom + 2*pad - ((weight-1)*dil + 1)) / sample + 1
out_dim[0] = (npy_intp)PyArray_DIMS(bottom)[0];
out_dim[1] = (npy_intp)PyArray_DIMS(weights)[0];
out_dim[2] = (npy_intp)((PyArray_DIMS(bottom)[2] + 2*padH - ((PyArray_DIMS(weights)[2]-1)*dilH + 1)) / dH + 1);
out_dim[3] = (npy_intp)((PyArray_DIMS(bottom)[3] + 2*padW - ((PyArray_DIMS(weights)[3]-1)*dilW + 1)) / dW + 1);
out_dim[2] = (npy_intp)((PyArray_DIMS(bottom)[2] + 2*padH - ((PyArray_DIMS(weights)[wdim-2]-1)*dilH + 1)) / dH + 1);
out_dim[3] = (npy_intp)((PyArray_DIMS(bottom)[3] + 2*padW - ((PyArray_DIMS(weights)[wdim-1]-1)*dilW + 1)) / dW + 1);
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
{
PyErr_Format(PyExc_ValueError,
"CorrMM: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
(long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1],
(long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3],
(long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1],
(long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3],
(long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
(long int)out_dim[3]);
if (unshared) {
PyErr_Format(PyExc_ValueError,
"CorrMM: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
(long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1],
(long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3],
(long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1],
(long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3],
(long int)PyArray_DIMS(weights)[4], (long int)PyArray_DIMS(weights)[5],
(long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
(long int)out_dim[3]);
}
else {
PyErr_Format(PyExc_ValueError,
"CorrMM: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
(long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1],
(long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3],
(long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1],
(long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3],
(long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
(long int)out_dim[3]);
}
%(fail)s
}
break;
......@@ -399,46 +427,90 @@ class BaseCorrMM(gof.OpenMPOp):
// output is weights: (num_filters, num_channels, height, width)
// height and width: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1
out_dim[0] = (npy_intp)PyArray_DIMS(top)[1];
out_dim[1] = (npy_intp)PyArray_DIMS(bottom)[1] / numgroups;
out_dim[2] = (npy_intp)kH; // already inferred further above
out_dim[3] = (npy_intp)kW; // how convenient
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
{
PyErr_Format(PyExc_ValueError,
"CorrMM backprop wrt. weights: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
(long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1],
(long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3],
(long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
(long int)out_dim[3],
(long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1],
(long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]);
%(fail)s
if (unshared){
odim = 6;
out_dim[1] = (npy_intp)PyArray_DIMS(top)[2];
out_dim[2] = (npy_intp)PyArray_DIMS(top)[3];
}
out_dim[wdim-3] = (npy_intp)PyArray_DIMS(bottom)[1] / numgroups;
out_dim[wdim-2] = (npy_intp)kH; // already inferred further above
out_dim[wdim-1] = (npy_intp)kW; // how convenient
if (unshared) {
if (out_dim[0] < 0 || out_dim[1] <= 0 || out_dim[2] <= 0 || out_dim[3] < 0
|| out_dim[4] <= 0 || out_dim[5] <= 0){
PyErr_Format(PyExc_ValueError,
"CorrMM backprop wrt. weights: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
(long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1],
(long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3],
(long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
(long int)out_dim[3], (long int)out_dim[4], (long int)out_dim[5],
(long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1],
(long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]);
%(fail)s
}
}
else {
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
{
PyErr_Format(PyExc_ValueError,
"CorrMM backprop wrt. weights: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
(long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1],
(long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3],
(long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
(long int)out_dim[3],
(long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1],
(long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]);
%(fail)s
}
}
break;
case 2: // backprop wrt. inputs
// output is bottom: (batchsize, num_channels, height, width)
// height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
out_dim[0] = (npy_intp)PyArray_DIMS(top)[0];
out_dim[1] = (npy_intp)PyArray_DIMS(weights)[1] * numgroups;
out_dim[2] = (npy_intp)((%(height)s != -1) ? %(height)s : (PyArray_DIMS(top)[2] - 1) * dH + (PyArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH);
out_dim[3] = (npy_intp)((%(width)s != -1) ? %(width)s : (PyArray_DIMS(top)[3] - 1) * dW + (PyArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW);
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
{
PyErr_Format(PyExc_ValueError,
"CorrMM backprop wrt. inputs: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
(long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
(long int)out_dim[3],
(long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1],
(long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3],
(long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1],
(long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]);
%(fail)s
out_dim[1] = (npy_intp)PyArray_DIMS(weights)[wdim-3] * numgroups;
out_dim[2] = (npy_intp)((%(height)s != -1) ? %(height)s : (PyArray_DIMS(top)[2] - 1) * dH + (PyArray_DIMS(weights)[wdim-2]-1)*dilH + 1 - 2*padH);
out_dim[3] = (npy_intp)((%(width)s != -1) ? %(width)s : (PyArray_DIMS(top)[3] - 1) * dW + (PyArray_DIMS(weights)[wdim-1]-1)*dilW + 1 - 2*padW);
if (unshared) {
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
{
PyErr_Format(PyExc_ValueError,
"CorrMM backprop wrt. inputs: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
(long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
(long int)out_dim[3],
(long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1],
(long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3],
(long int)PyArray_DIMS(weights)[4], (long int)PyArray_DIMS(weights)[5],
(long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1],
(long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]);
%(fail)s
}
}
else {
if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
{
PyErr_Format(PyExc_ValueError,
"CorrMM backprop wrt. inputs: impossible output shape\\n"
" bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
" weights shape: %%ld x %%ld x %%ld x %%ld\\n"
" top shape: %%ld x %%ld x %%ld x %%ld\\n",
(long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
(long int)out_dim[3],
(long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1],
(long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3],
(long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1],
(long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]);
%(fail)s
}
}
break;
default:
......@@ -448,13 +520,19 @@ class BaseCorrMM(gof.OpenMPOp):
// Prepare output array
int typenum;
if ( !(*out
&& PyArray_NDIM(*out)==4
int failure;
failure = !(*out
&& PyArray_NDIM(*out)==odim
&& PyArray_IS_C_CONTIGUOUS(*out)
&& PyArray_DIMS(*out)[0]==out_dim[0]
&& PyArray_DIMS(*out)[1]==out_dim[1]
&& PyArray_DIMS(*out)[2]==out_dim[2]
&& PyArray_DIMS(*out)[3]==out_dim[3]))
&& PyArray_DIMS(*out)[3]==out_dim[3]);
if (odim == 6){
failure = failure || !(PyArray_DIMS(*out)[4]==out_dim[4]
&& PyArray_DIMS(*out)[5]==out_dim[5]);
}
if ( failure )
{
Py_XDECREF(*out);
if (direction != 1) {
......@@ -464,21 +542,29 @@ class BaseCorrMM(gof.OpenMPOp):
typenum = PyArray_TYPE(bottom);
}
//Change to PyArray_ZEROS which is faster than PyArray_EMPTY.
*out = (PyArrayObject*)PyArray_ZEROS(4,
*out = (PyArrayObject*)PyArray_ZEROS(odim,
out_dim,
typenum,
0);
if (NULL == *out)
{
PyErr_Format(PyExc_RuntimeError,
"BaseCorrMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld",
(long long)out_dim[0], (long long)out_dim[1], (long long)out_dim[2], (long long)out_dim[3]);
if (odim == 4) {
PyErr_Format(PyExc_RuntimeError,
"BaseCorrMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld",
(long long)out_dim[0], (long long)out_dim[1], (long long)out_dim[2], (long long)out_dim[3]);
}
if (odim == 6) {
PyErr_Format(PyExc_RuntimeError,
"BaseCorrMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld %%lld %%lld",
(long long)out_dim[0], (long long)out_dim[1], (long long)out_dim[2], (long long)out_dim[3],
(long long)out_dim[4], (long long)out_dim[5]);
}
%(fail)s
}
}
// Call corrMM code
out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dilH, dilW, padH, padW, numgroups );
out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dilH, dilW, padH, padW, numgroups, unshared);
if (out2==NULL){
%(fail)s
}
......@@ -514,6 +600,13 @@ class CorrMM(BaseCorrMM):
The filter dilation operation applied to each input image.
Should be a tuple with 2 elements.
Set to `(1, 1)` to disable filter dilation.
num_groups
Divides the image, kernel and output tensors into num_groups
separate groups, each of which carries out convolutions separately.
Should be an integer.
unshared
Boolean value. If true, then a different filter will be applied to
each region of the input image.
"""
......@@ -525,8 +618,12 @@ class CorrMM(BaseCorrMM):
img, kern = self.as_common_dtype(img, kern)
if img.type.ndim != 4:
raise TypeError('img must be 4D tensor')
if kern.type.ndim != 4:
raise TypeError('kern must be 4D tensor')
if self.unshared is True:
if kern.type.ndim != 6:
raise TypeError('kern must be 6D tensor')
else:
if kern.type.ndim != 4:
raise TypeError('kern must be 4D tensor')
broadcastable = [img.type.broadcastable[0], kern.type.broadcastable[0],
False, False]
......@@ -555,13 +652,15 @@ class CorrMM(BaseCorrMM):
d_bottom = CorrMM_gradInputs(self.border_mode,
self.subsample,
self.filter_dilation,
self.num_groups)(weights, top,
bottom.shape[-2:])
self.num_groups,
self.unshared)(weights, top,
bottom.shape[-2:])
d_weights = CorrMM_gradWeights(self.border_mode,
self.subsample,
self.filter_dilation,
self.num_groups)(bottom, top,
weights.shape[-2:])
self.num_groups,
self.unshared)(bottom, top,
weights.shape[-2:])
return d_bottom, d_weights
......@@ -595,8 +694,12 @@ class CorrMM_gradWeights(BaseCorrMM):
else:
height_width = [as_tensor_variable(shape[0]).astype('int64'), as_tensor_variable(shape[1]).astype('int64')]
broadcastable = [topgrad.type.broadcastable[1], img.type.broadcastable[1],
False, False]
if self.unshared is True:
broadcastable = [topgrad.type.broadcastable[0], False, False,
img.type.broadcastable[1], False, False]
else:
broadcastable = [topgrad.type.broadcastable[0], img.type.broadcastable[1],
False, False]
dtype = img.type.dtype
return Apply(self, [img, topgrad] + height_width,
[TensorType(dtype, broadcastable)()])
......@@ -633,7 +736,10 @@ class CorrMM_gradWeights(BaseCorrMM):
kW = 2 - imshp[1] + (topshp[1] - 1) * dW
else:
kW = imshp[1] + 2 * padW - (topshp[1] - 1) * dW
return [(nkern, ssize, kH, kW)]
if self.unshared is True:
return [(nkern, topshp[0], topshp[1], ssize, kH, kW)]
else:
return [(nkern, ssize, kH, kW)]
def c_code(self, node, nodename, inp, out_, sub):
bottom, top = inp[:2]
......@@ -649,12 +755,14 @@ class CorrMM_gradWeights(BaseCorrMM):
d_bottom = CorrMM_gradInputs(self.border_mode,
self.subsample,
self.filter_dilation,
self.num_groups)(weights, top,
bottom.shape[-2:])
self.num_groups,
self.unshared)(weights, top,
bottom.shape[-2:])
d_top = CorrMM(self.border_mode,
self.subsample,
self.filter_dilation,
self.num_groups)(bottom, weights)
self.num_groups,
self.unshared)(bottom, weights)
d_height_width = ((theano.gradient.DisconnectedType()(),) * 2
if len(inp) == 4 else ())
return (d_bottom, d_top) + d_height_width
......@@ -684,8 +792,12 @@ class CorrMM_gradInputs(BaseCorrMM):
kern = as_tensor_variable(kern)
topgrad = as_tensor_variable(topgrad)
kern, topgrad = self.as_common_dtype(kern, topgrad)
if kern.type.ndim != 4:
raise TypeError('kern must be 4D tensor')
if self.unshared is True:
if kern.type.ndim != 6:
raise TypeError('kern must be 6D tensor')
else:
if kern.type.ndim != 4:
raise TypeError('kern must be 4D tensor')
if topgrad.type.ndim != 4:
raise TypeError('topgrad must be 4D tensor')
if shape is None:
......@@ -700,7 +812,7 @@ class CorrMM_gradInputs(BaseCorrMM):
broadcastable = [topgrad.type.broadcastable[0], False,
False, False]
else:
broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[1],
broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[-3],
False, False]
dtype = kern.type.dtype
return Apply(self, [kern, topgrad] + height_width,
......@@ -719,7 +831,7 @@ class CorrMM_gradInputs(BaseCorrMM):
dH, dW = self.subsample
kshp = input_shape[0]
topshp = input_shape[1]
ssize, kshp = kshp[1], list(kshp[2:])
ssize, kshp = kshp[-3], list(kshp[-2:])
ssize = ssize * self.num_groups
bsize, topshp = topshp[0], list(topshp[2:])
height_width = node.inputs[-2:]
......@@ -762,13 +874,15 @@ class CorrMM_gradInputs(BaseCorrMM):
d_weights = CorrMM_gradWeights(self.border_mode,
self.subsample,
self.filter_dilation,
self.num_groups)(bottom,
top,
weights.shape[-2:])
self.num_groups,
self.unshared)(bottom,
top,
weights.shape[-2:])
d_top = CorrMM(self.border_mode,
self.subsample,
self.filter_dilation,
self.num_groups)(bottom, weights)
self.num_groups,
self.unshared)(bottom, weights)
d_height_width = ((theano.gradient.DisconnectedType()(),) *
2 if len(inp) == 4 else ())
return (d_weights, d_top) + d_height_width
......
......@@ -82,12 +82,14 @@ def local_abstractconv_gemm(node):
# need to flip the kernel if necessary
if node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1]
flip = (slice(None),) * (kern.ndim - 2) + \
(slice(None, None, -1),) * 2
kern = kern[flip]
rval = CorrMM(border_mode=node.op.border_mode,
subsample=node.op.subsample,
filter_dilation=node.op.filter_dilation,
num_groups=node.op.num_groups)(img, kern)
num_groups=node.op.num_groups,
unshared=node.op.unshared)(img, kern)
copy_stack_trace(node.outputs[0], rval)
return [rval]
......@@ -134,12 +136,15 @@ def local_abstractconv_gradweight_gemm(node):
rval = CorrMM_gradWeights(border_mode=node.op.border_mode,
subsample=node.op.subsample,
filter_dilation=node.op.filter_dilation,
num_groups=node.op.num_groups)(img, topgrad, shape)
num_groups=node.op.num_groups,
unshared=node.op.unshared)(img, topgrad, shape)
copy_stack_trace(node.outputs[0], rval)
# need to flip the kernel if necessary
if node.op.filter_flip:
rval = rval[:, :, ::-1, ::-1]
flip = (slice(None),) * (rval.ndim - 2) + \
(slice(None, None, -1),) * 2
rval = rval[flip]
rval = theano.tensor.patternbroadcast(rval, node.outputs[0].broadcastable)
copy_stack_trace(node.outputs[0], rval)
......@@ -189,12 +194,14 @@ def local_abstractconv_gradinputs_gemm(node):
# need to flip the kernel if necessary
if node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1]
flip = (slice(None),) * (kern.ndim - 2) + \
(slice(None, None, -1),) * 2
kern = kern[flip]
rval = CorrMM_gradInputs(border_mode=node.op.border_mode,
subsample=node.op.subsample,
filter_dilation=node.op.filter_dilation,
num_groups=node.op.num_groups)(kern, topgrad,
shape)
num_groups=node.op.num_groups,
unshared=node.op.unshared)(kern, topgrad, shape)
copy_stack_trace(node.outputs[0], rval)
return [rval]
......@@ -242,7 +249,7 @@ def local_conv2d_cpu(node):
if not node.op.filter_flip:
# Not tested yet
return None
if node.op.num_groups > 1:
if node.op.num_groups > 1 or node.op.unshared:
return None
rval = conv2d(img, kern,
......@@ -270,7 +277,7 @@ def local_conv2d_gradweight_cpu(node):
if not node.op.filter_flip:
# Not tested yet
return
if node.op.num_groups > 1:
if node.op.num_groups > 1 or node.op.unshared:
return None
if node.op.border_mode == 'valid' and \
......@@ -370,7 +377,7 @@ def local_conv2d_gradinputs_cpu(node):
if not node.op.filter_flip:
# Not tested yet
return None
if node.op.num_groups > 1:
if node.op.num_groups > 1 or node.op.unshared:
return None
# Conv 3d implementation, needed when subsample > 2
......
......@@ -1744,3 +1744,154 @@ class Separable_conv(unittest.TestCase):
fun = theano.function([x_sym, dfilter_sym, pfilter_sym], sep_op, mode='FAST_RUN')
top = fun(x[:, :, :3, :3, :3], depthwise_filter, pointwise_filter)
utt.assert_allclose(top, precomp_output)
class TestUnsharedConv(unittest.TestCase):
    """Tests for unshared (locally-connected) 2D convolution.

    An unshared convolution applies a *different* filter at every output
    position.  The kernel therefore carries two extra spatial axes; from the
    indexing below (``kern[:, i, j, ...]`` compared against output position
    ``[:, :, i, j]``) its layout is
    ``(nkern, out_rows, out_cols, nchan_per_group, kH, kW)``.

    Each test checks the unshared op against the ordinary shared-kernel op
    applied one output position at a time, and optionally verifies the
    gradient numerically.
    """

    # Op classes under test; subclasses (e.g. the CorrMM-based test case)
    # override the ``*_op`` attributes to assert which op ends up in the
    # compiled graph.
    conv2d = theano.tensor.nnet.abstract_conv.AbstractConv2d
    conv2d_gradw = theano.tensor.nnet.abstract_conv.AbstractConv2d_gradWeights
    conv2d_gradi = theano.tensor.nnet.abstract_conv.AbstractConv2d_gradInputs
    conv2d_op = theano.tensor.nnet.abstract_conv.AbstractConv2d
    conv2d_gradw_op = theano.tensor.nnet.abstract_conv.AbstractConv2d_gradWeights
    conv2d_gradi_op = theano.tensor.nnet.abstract_conv.AbstractConv2d_gradInputs

    # No graph optimization by default, so the abstract ops stay in the graph.
    mode = theano.compile.mode.Mode(optimizer='None')

    def setUp(self):
        """Define one configuration per test iteration (zipped positionally)."""
        self.img_shape = [(2, 2, 4, 4), (3, 2, 4, 2), (3, 3, 5, 3), (3, 4, 4, 4)]
        self.kern_shape = [(2, 2, 2, 2, 3, 3), (2, 4, 2, 2, 4, 2), (3, 2, 1, 1, 3, 3), (4, 3, 3, 2, 4, 2)]
        self.topgrad_shape = [(2, 2, 2, 2), (3, 2, 4, 2), (3, 3, 2, 1), (3, 4, 3, 3)]
        self.border_mode = ['valid', 'full', 'valid', 'full']
        self.subsample = [(1, 1), (2, 2), (2, 1), (3, 2)]
        self.filter_dilation = (1, 1)
        self.num_groups = [1, 1, 3, 2]
        # self.verify_flags = np.random.choice([True, False], 4, [0.5, 0.5])
        # Above line can be used instead if speed is a concern
        self.verify_flags = [True] * 4
        self.ref_mode = 'FAST_RUN'
        if theano.config.cxx == "":
            raise SkipTest("CorrMM needs cxx")

    def test_fwd(self):
        """Forward pass: unshared output must match the shared op applied
        per output position with that position's own filter slice."""
        tensor6 = theano.tensor.TensorType(theano.config.floatX, (False,) * 6)
        img_sym = theano.tensor.tensor4('img')
        kern_sym = tensor6('kern')
        ref_kern_sym = theano.tensor.tensor4('ref_kern')

        for imshp, kshp, mode, sub, groups, verify in zip(self.img_shape, self.kern_shape, self.border_mode,
                                                          self.subsample, self.num_groups, self.verify_flags):
            img = np.random.random(imshp).astype(theano.config.floatX)
            kern = np.random.random(kshp).astype(theano.config.floatX)

            unshared_conv_op = self.conv2d(border_mode=mode, subsample=sub,
                                           filter_dilation=self.filter_dilation,
                                           num_groups=groups, unshared=True)
            unshared_out_sym = unshared_conv_op(img_sym, kern_sym)
            unshared_func = theano.function([img_sym, kern_sym], unshared_out_sym, mode=self.mode)
            # Make sure the expected op is actually the one compiled.
            assert any([isinstance(node.op, self.conv2d_op)
                        for node in unshared_func.maker.fgraph.toposort()])
            unshared_output = unshared_func(img, kern)

            # Shared-kernel shape: drop the two per-position axes.
            single_kshp = kshp[:1] + kshp[3:]

            ref_conv_op = self.conv2d(border_mode=mode, subsample=sub,
                                      filter_dilation=self.filter_dilation,
                                      num_groups=groups, unshared=False)
            ref_out_sym = ref_conv_op(img_sym, ref_kern_sym)
            ref_func = theano.function([img_sym, ref_kern_sym], ref_out_sym, mode=self.mode)

            # Compare position (i, j) of the unshared output against a full
            # shared convolution run with that position's filter.
            for i in range(0, kshp[1]):
                for j in range(0, kshp[2]):
                    single_kern = kern[:, i, j, ...].reshape(single_kshp)
                    ref_val = ref_func(img, single_kern)
                    utt.assert_allclose(ref_val[:, :, i, j], unshared_output[:, :, i, j])

            if verify:
                utt.verify_grad(unshared_conv_op, [img, kern], mode=self.mode, eps=1)

    def test_gradweight(self):
        """Gradient w.r.t. weights: slice (i, j) of the unshared weight
        gradient must equal the shared gradient computed from a topgrad that
        is zero everywhere except at output position (i, j)."""
        img_sym = theano.tensor.tensor4('img')
        top_sym = theano.tensor.tensor4('top')

        for imshp, kshp, topshp, mode, sub, groups, verify in zip(self.img_shape, self.kern_shape, self.topgrad_shape,
                                                                  self.border_mode, self.subsample, self.num_groups,
                                                                  self.verify_flags):
            img = np.random.random(imshp).astype(theano.config.floatX)
            top = np.random.random(topshp).astype(theano.config.floatX)

            unshared_conv_op = self.conv2d_gradw(border_mode=mode, subsample=sub,
                                                 filter_dilation=self.filter_dilation,
                                                 num_groups=groups, unshared=True)
            unshared_out_sym = unshared_conv_op(img_sym, top_sym, tensor.as_tensor_variable(kshp[-2:]))
            unshared_func = theano.function([img_sym, top_sym], unshared_out_sym, mode=self.mode)
            assert any([isinstance(node.op, self.conv2d_gradw_op)
                        for node in unshared_func.maker.fgraph.toposort()])
            unshared_output = unshared_func(img, top)

            single_kshp = kshp[:1] + kshp[3:]

            ref_conv_op = self.conv2d_gradw(border_mode=mode, subsample=sub,
                                            filter_dilation=self.filter_dilation,
                                            num_groups=groups, unshared=False)
            ref_out_sym = ref_conv_op(img_sym, top_sym, tensor.as_tensor_variable(single_kshp[-2:]))
            ref_func = theano.function([img_sym, top_sym], ref_out_sym, mode=self.mode)

            for i in range(0, topshp[2]):
                for j in range(0, topshp[3]):
                    # Isolate the contribution of output position (i, j).
                    top_single = np.zeros_like(top)
                    top_single[:, :, i, j] = top[:, :, i, j]
                    ref_output = ref_func(img, top_single)
                    utt.assert_allclose(unshared_output[:, i, j, ...], ref_output)

            def conv_gradweight(inputs_val, output_val):
                # Close over the op and kernel shape for verify_grad.
                return unshared_conv_op(inputs_val, output_val, tensor.as_tensor_variable(kshp[-2:]))

            if verify:
                utt.verify_grad(conv_gradweight, [img, top], mode=self.mode, eps=1)

    def test_gradinput(self):
        """Gradient w.r.t. inputs: the unshared input gradient must equal the
        sum over output positions of the shared gradient using, at each
        position, that position's filter slice and isolated topgrad."""
        tensor6 = theano.tensor.TensorType(theano.config.floatX, (False,) * 6)
        kern_sym = tensor6('kern')
        top_sym = theano.tensor.tensor4('top')
        ref_kern_sym = theano.tensor.tensor4('ref_kern')

        for imshp, kshp, topshp, mode, sub, groups, verify in zip(self.img_shape, self.kern_shape, self.topgrad_shape,
                                                                  self.border_mode, self.subsample, self.num_groups,
                                                                  self.verify_flags):
            single_kshp = kshp[:1] + kshp[3:]

            kern = np.random.random(kshp).astype(theano.config.floatX)
            top = np.random.random(topshp).astype(theano.config.floatX)

            unshared_conv_op = self.conv2d_gradi(border_mode=mode, subsample=sub,
                                                 filter_dilation=self.filter_dilation,
                                                 num_groups=groups, unshared=True)
            unshared_out_sym = unshared_conv_op(kern_sym, top_sym, tensor.as_tensor_variable(imshp[-2:]))
            unshared_func = theano.function([kern_sym, top_sym], unshared_out_sym, mode=self.mode)
            assert any([isinstance(node.op, self.conv2d_gradi_op)
                        for node in unshared_func.maker.fgraph.toposort()])
            unshared_output = unshared_func(kern, top)

            ref_conv_op = self.conv2d_gradi(border_mode=mode, subsample=sub,
                                            filter_dilation=self.filter_dilation,
                                            num_groups=groups, unshared=False)
            ref_out_sym = ref_conv_op(ref_kern_sym, top_sym, tensor.as_tensor_variable(imshp[-2:]))
            ref_func = theano.function([ref_kern_sym, top_sym], ref_out_sym, mode=self.mode)

            # Accumulate per-position shared gradients; their sum is the
            # unshared input gradient.
            ref_output = np.zeros(imshp)
            for i in range(0, topshp[2]):
                for j in range(0, topshp[3]):
                    single_kern = kern[:, i, j, ...].reshape(single_kshp)
                    top_single = np.zeros_like(top)
                    top_single[:, :, i, j] = top[:, :, i, j]
                    ref_output += ref_func(single_kern, top_single)
            utt.assert_allclose(ref_output, unshared_output)

            def conv_gradinputs(filters_val, output_val):
                # Close over the op and image shape for verify_grad.
                return unshared_conv_op(filters_val, output_val, tensor.as_tensor_variable(imshp[-2:]))

            if verify:
                utt.verify_grad(conv_gradinputs, [kern, top], mode=self.mode, eps=1)
......@@ -10,7 +10,7 @@ import theano
import theano.tensor as T
from theano.tests import unittest_tools as utt
from theano.tensor.nnet import corr, conv
from theano.tensor.nnet.tests.test_abstract_conv import Grouped_conv_noOptim
from theano.tensor.nnet.tests.test_abstract_conv import Grouped_conv_noOptim, TestUnsharedConv
class TestCorr2D(utt.InferShapeTester):
......@@ -452,6 +452,16 @@ class TestGroupCorr2d(Grouped_conv_noOptim):
utt.assert_allclose(gconv_output, conv_output)
class TestUnsharedCorr2d(TestUnsharedConv):
    """Run the unshared-convolution test suite against the CorrMM CPU ops.

    Unlike the base class, graph optimization is left enabled so the abstract
    conv ops are rewritten to CorrMM; the ``*_op`` attributes below make the
    toposort assertions check for the CorrMM ops instead.
    """

    # Under FAST_COMPILE the conv rewrites would not run; force FAST_RUN
    # (without the GPU backend) in that case.  ``mode=None`` means "use the
    # configured default mode".
    if theano.config.mode == "FAST_COMPILE":
        mode = theano.compile.get_mode("FAST_RUN").excluding('gpuarray')
    else:
        mode = None

    conv2d_op = corr.CorrMM
    conv2d_gradw_op = corr.CorrMM_gradWeights
    conv2d_gradi_op = corr.CorrMM_gradInputs
if __name__ == '__main__':
t = TestCorr2D('setUp')
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论