Merge pull request #5991 from affanv14/group

Implement Grouped Convolutions

Merge pull request #5991 from affanv14/group
c2e14ce1 · abergeron · GitHub · 110729fb · 99758e6d · c2e14ce1
--- a/theano/gpuarray/blas.py
+++ b/theano/gpuarray/blas.py
@@ -496,13 +496,16 @@ class BaseGpuCorrMM(CGpuKernelBase):
        Perform subsampling of the output (default: (1, 1)).
    filter_dilation
        Perform subsampling of the input, also known as dilation (default: (1, 1)).
+    num_groups :
+        Divides the image, kernel and output tensors into num_groups
+        separate groups, each of which is convolved independently (default: 1).
    """
    check_broadcast = False
-    __props__ = ('border_mode', 'subsample', 'filter_dilation')
+    __props__ = ('border_mode', 'subsample', 'filter_dilation', 'num_groups')
    _f16_ok = True

    def __init__(self, border_mode="valid", subsample=(1, 1),
-                 filter_dilation=(1, 1)):
+                 filter_dilation=(1, 1), num_groups=1):
        if isinstance(border_mode, integer_types):
            border_mode = (border_mode, border_mode)
        if isinstance(border_mode, tuple):
@@ -521,6 +524,9 @@ class BaseGpuCorrMM(CGpuKernelBase):
            raise ValueError("filter_dilation must have two elements")
        self.subsample = tuple(subsample)
        self.filter_dilation = tuple(filter_dilation)
+        if num_groups < 1:
+            raise ValueError("Number of groups should be greater than 0")
+        self.num_groups = num_groups
        CGpuKernelBase.__init__(self, ['corr_gemm.c'])

    @property
@@ -530,11 +536,17 @@ class BaseGpuCorrMM(CGpuKernelBase):
        return (0, 0)

    def __str__(self):
-        return '%s{%s, %s, %s}' % (
+        return '%s{%s, %s, %s, %s}' % (
            self.__class__.__name__,
            self.border_mode,
            str(self.subsample),
-            str(self.filter_dilation))
+            str(self.filter_dilation),
+            str(self.num_groups))
+
+    def __setstate__(self, d):
+        self.__dict__.update(d)
+        if not hasattr(self, 'num_groups'):
+            self.num_groups = 1

    def flops(self, inp, outp):
        """
@@ -562,7 +574,7 @@ class BaseGpuCorrMM(CGpuKernelBase):

    def c_code_cache_version(self):
        # Raise this whenever modifying the C code (including the file).
-        return (8,)
+        return (9,)

    def c_code_helper(self, bottom, weights, top, direction, sub, height=None, width=None):
        """
@@ -609,6 +621,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
        """
        dH, dW = self.subsample
        dilH, dilW = self.filter_dilation
+        numgroups = self.num_groups
        if self.border_mode == "half":
            padH = padW = -1
        elif self.border_mode == "full":
@@ -669,6 +682,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
    size_t dilW = %(dilW)s;
    int padH = %(padH)s;
    int padW = %(padW)s;
+    int numgroups = %(numgroups)s;

    PyGpuArrayObject * bottom = %(bottom)s;
    PyGpuArrayObject * weights = %(weights)s;
@@ -768,7 +782,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
        // output is weights: (num_filters, num_channels, height, width)
        // height and width: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1
        out_dim[0] = PyGpuArray_DIMS(top)[1];
-        out_dim[1] = PyGpuArray_DIMS(bottom)[1];
+        out_dim[1] = PyGpuArray_DIMS(bottom)[1] / numgroups;
        out_dim[2] = kH;  // already inferred further above
        out_dim[3] = kW;  // how convenient
        out_typecode = top->ga.typecode;
@@ -792,7 +806,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
        // output is bottom: (batchsize, num_channels, height, width)
        // height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
        out_dim[0] = PyGpuArray_DIMS(top)[0];
-        out_dim[1] = PyGpuArray_DIMS(weights)[1];
+        out_dim[1] = PyGpuArray_DIMS(weights)[1] * numgroups;
        out_dim[2] = (%(height)s != -1) ? %(height)s : (PyGpuArray_DIMS(top)[2] - 1) * dH + (PyGpuArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH;
        out_dim[3] = (%(width)s != -1) ? %(width)s : (PyGpuArray_DIMS(top)[3] - 1) * dW + (PyGpuArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW;
        out_typecode = top->ga.typecode;
@@ -836,7 +850,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
    }

    // Call GPU code
-    out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dilH, dilW, padH, padW);
+    out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dilH, dilW, padH, padW, numgroups);
    if (out2==NULL){
       %(fail)s
    }
@@ -873,6 +887,11 @@ class GpuCorrMM(BaseGpuCorrMM):
        The filter dilation operation applied to each input image.
        Should be a tuple with 2 elements.
        Set to `(1, 1)` to disable filter dilation.
+    num_groups
+        The number of distinct groups the image and kernel must be
+        divided into.
+        Should be an int.
+        Set to `1` to disable grouped convolution.

    Notes
    -----
@@ -892,9 +911,9 @@ class GpuCorrMM(BaseGpuCorrMM):
    """
    def __init__(self, border_mode="valid",
                 subsample=(1, 1),
-                 filter_dilation=(1, 1)):
+                 filter_dilation=(1, 1), num_groups=1):
        super(GpuCorrMM, self).__init__(border_mode, subsample,
-                                        filter_dilation)
+                                        filter_dilation, num_groups)

    def make_node(self, img, kern):
        ctx_name = infer_context_name(img, kern)
@@ -923,11 +942,13 @@ class GpuCorrMM(BaseGpuCorrMM):
        top = gpu_contiguous(top)
        d_bottom = GpuCorrMM_gradInputs(self.border_mode,
                                        self.subsample,
-                                        self.filter_dilation)(
+                                        self.filter_dilation,
+                                        self.num_groups)(
            weights, top, bottom.shape[-2:])
        d_weights = GpuCorrMM_gradWeights(self.border_mode,
                                          self.subsample,
-                                          self.filter_dilation)(
+                                          self.filter_dilation,
+                                          self.num_groups)(
            bottom, top, weights.shape[-2:])
        return d_bottom, d_weights

@@ -945,10 +966,11 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM):

    def __init__(self, border_mode="valid",
                 subsample=(1, 1),
-                 filter_dilation=(1, 1)):
+                 filter_dilation=(1, 1),
+                 num_groups=1):
        super(GpuCorrMM_gradWeights, self).__init__(border_mode,
                                                    subsample,
-                                                    filter_dilation)
+                                                    filter_dilation, num_groups)

    def make_node(self, img, topgrad, shape=None):
        ctx_name = infer_context_name(img, topgrad)
@@ -987,11 +1009,12 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM):
        weights = gpu_contiguous(weights)
        d_bottom = GpuCorrMM_gradInputs(self.border_mode,
                                        self.subsample,
-                                        self.filter_dilation)(weights,
-                                                              top,
-                                                              bottom.shape[-2:])
+                                        self.filter_dilation,
+                                        self.num_groups)(weights,
+                                                         top,
+                                                         bottom.shape[-2:])
        d_top = GpuCorrMM(
-            self.border_mode, self.subsample, self.filter_dilation)(bottom, weights)
+            self.border_mode, self.subsample, self.filter_dilation, self.num_groups)(bottom, weights)
        d_height_width = (
            theano.gradient.DisconnectedType()(),
            ) * 2 if len(inp) == 4 else ()
@@ -1017,9 +1040,10 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):

    def __init__(self, border_mode="valid",
                 subsample=(1, 1),
-                 filter_dilation=(1, 1)):
+                 filter_dilation=(1, 1),
+                 num_groups=1):
        super(GpuCorrMM_gradInputs, self).__init__(border_mode, subsample,
-                                                   filter_dilation)
+                                                   filter_dilation, num_groups)

    def make_node(self, kern, topgrad, shape=None):
        ctx_name = infer_context_name(kern, topgrad)
@@ -1038,8 +1062,12 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
            assert shape[0].ndim == 0
            assert shape[1].ndim == 0

-        broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[1],
-                         False, False]
+        if self.num_groups > 1:
+            broadcastable = [topgrad.type.broadcastable[0], False,
+                             False, False]
+        else:
+            broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[1],
+                             False, False]
        return Apply(self, [kern, topgrad] + height_width, [GpuArrayType(dtype=topgrad.dtype,
                                                                         context_name=ctx_name,
                                                                         broadcastable=broadcastable)()])
@@ -1057,12 +1085,14 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
        bottom = gpu_contiguous(bottom)
        d_weights = GpuCorrMM_gradWeights(self.border_mode,
                                          self.subsample,
-                                          self.filter_dilation)(bottom,
-                                                                top,
-                                                                weights.shape[-2:])
+                                          self.filter_dilation,
+                                          self.num_groups)(bottom,
+                                                           top,
+                                                           weights.shape[-2:])
        d_top = GpuCorrMM(self.border_mode,
                          self.subsample,
-                          self.filter_dilation)(bottom, weights)
+                          self.filter_dilation,
+                          self.num_groups)(bottom, weights)
        d_height_width = (
            theano.gradient.DisconnectedType()(),
            ) * 2 if len(inp) == 4 else ()

--- a/theano/gpuarray/corr_gemm.c
+++ b/theano/gpuarray/corr_gemm.c
@@ -348,7 +348,8 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
                         const size_t dilH = 1,
                         const size_t dilW = 1,
                         const size_t padH = 0,
-                         const size_t padW = 0)
+                         const size_t padW = 0,
+                         const size_t numgroups = 1)
 {
    if (PyGpuArray_NDIM(bottom) != 4)
    {
@@ -411,7 +412,7 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
    const size_t nFilters = PyGpuArray_DIMS(weight)[0];
    const size_t kH = PyGpuArray_DIMS(weight)[2];
    const size_t kW = PyGpuArray_DIMS(weight)[3];
-    if (nChannels != PyGpuArray_DIMS(weight)[1]) {
+    if (nChannels != (PyGpuArray_DIMS(weight)[1] * numgroups)) {
        PyErr_SetString(PyExc_ValueError,
                "GpuCorrMM images and kernel must have the same stack size\n");
        return NULL;
@@ -469,11 +470,15 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
    }

    // Define some useful variables
-    const size_t bottom_stride = PyGpuArray_STRIDES(bottom)[0] / gpuarray_get_elsize(bottom->ga.typecode);
-    const size_t top_stride = PyGpuArray_STRIDES(top)[0] / gpuarray_get_elsize(top->ga.typecode);
-    const size_t K_ = col_dim[0];
+    const size_t batch_bottom_stride = PyGpuArray_STRIDES(bottom)[0] / gpuarray_get_elsize(bottom->ga.typecode);
+    const size_t batch_top_stride = PyGpuArray_STRIDES(top)[0] / gpuarray_get_elsize(top->ga.typecode);
+    const size_t group_bottom_stride = (PyGpuArray_STRIDES(bottom)[1] * nChannels / numgroups) / gpuarray_get_elsize(bottom->ga.typecode);
+    const size_t group_top_stride = (PyGpuArray_STRIDES(top)[1] * nFilters / numgroups) / gpuarray_get_elsize(top->ga.typecode);
+    const size_t group_weight_stride = (PyGpuArray_STRIDES(weight)[0] * nFilters / numgroups) / gpuarray_get_elsize(weight->ga.typecode);
+    const size_t K_ = col_dim[0] / numgroups;
    const size_t N_ = col_dim[1];
-    const size_t M_ = nFilters;
+    const size_t group_col_stride = (K_ * N_);
+    const size_t M_ = nFilters / numgroups;

    PyGpuArrayObject *output;
    if (direction == 0) {  // forward pass
@@ -493,21 +498,23 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
        // Iterate over batch
        for (size_t n = 0; n < batchSize; n++) {
            // First, im2col
-	  err = im2col(&bottom->ga, n * bottom_stride,
-		       nChannels, bottomHeight,
-		       bottomWidth, kH, kW, dilH, dilW,
-		       padH, padW, dH, dW, &col->ga);
+            err = im2col(&bottom->ga, n * batch_bottom_stride,
+                         nChannels, bottomHeight,
+                         bottomWidth, kH, kW, dilH, dilW,
+                         padH, padW, dH, dW, &col->ga);
            if (err != GA_NO_ERROR) {
                Py_DECREF(col);
                return NULL;
            }
            // Second, gemm
-            err = rgemm(cb_fortran, cb_no_trans, cb_no_trans,
-                        N_, M_, K_, 1,
-                        &col->ga, 0, N_,
-                        &weight->ga, 0, K_,
-                        0,
-                        &top->ga, n * top_stride, N_);
+            for (size_t g = 0; g < numgroups; g++){
+                err = rgemm(cb_fortran, cb_no_trans, cb_no_trans,
+                            N_, M_, K_, 1,
+                            &col->ga, g * group_col_stride, N_,
+                            &weight->ga, g * group_weight_stride, K_,
+                            0,
+                            &top->ga, n * batch_top_stride + g * group_top_stride, N_);
+            }
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorrMM forward encountered an error running gemm: %d", err);
@@ -533,7 +540,7 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
        // Iterate over batch
        for (size_t n = 0; n < batchSize; n++) {
            // First, im2col
-            err = im2col(&bottom->ga, n * bottom_stride,
+            err = im2col(&bottom->ga, n * batch_bottom_stride,
                         nChannels, bottomHeight,
                         bottomWidth, kH, kW, dilH, dilW,
                         padH, padW, dH, dW, &col->ga);
@@ -545,12 +552,14 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
            // Note that we accumulate into weight. We do so by setting beta = 0
            // for the first iteration and beta = 1 for subsequent ones. (This
            // is faster than setting weight to all zeros before the loop.)
-            err = rgemm(cb_fortran, cb_trans, cb_no_trans,
-                        K_, M_, N_, 1,
-                        &col->ga, 0, N_,
-                        &top->ga, n * top_stride, N_,
-                        (n == 0) ? 0 : 1,
-                        &weight->ga, 0, K_);
+            for(size_t g = 0; g < numgroups; g++){ 
+                err = rgemm(cb_fortran, cb_trans, cb_no_trans,
+                            K_, M_, N_, 1,
+                            &col->ga, g * group_col_stride, N_,
+                            &top->ga, n * batch_top_stride + g * group_top_stride, N_,
+                            (n == 0) ? 0 : 1,
+                            &weight->ga, g * group_weight_stride, K_);
+            }
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorrMM grad weights encountered an error running gemm: %d", err);
@@ -575,13 +584,15 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
        // full convolution: gemm, then col2im
        // Iterate over batch
        for (size_t n = 0; n < batchSize; n++) {
-          // gemm into columns
-          err = rgemm(cb_fortran, cb_no_trans, cb_trans,
-                      N_, K_, M_, 1,
-                      &top->ga, n * top_stride, N_,
-                      &weight->ga, 0, K_,
-                      0,
-                      &col->ga, 0, N_);
+            // gemm into columns
+            for(size_t g = 0; g < numgroups; g++){
+              err = rgemm(cb_fortran, cb_no_trans, cb_trans,
+                          N_, K_, M_, 1,
+                          &top->ga, n * batch_top_stride + g * group_top_stride, N_,
+                          &weight->ga, g * group_weight_stride, K_,
+                          0,
+                          &col->ga, g * group_col_stride, N_);
+            }
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorrMM grad inputs encountered an error running gemm: %d", err);
@@ -591,7 +602,7 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
            // col2im back to the data
            err = col2im(&col->ga, nChannels, bottomHeight, bottomWidth,
                         kH, kW, dilH, dilW, padH, padW,
-                         dH, dW, &bottom->ga, n * bottom_stride);
+                         dH, dW, &bottom->ga, n * batch_bottom_stride);
            if (err != GA_NO_ERROR) {
                Py_DECREF(col);
                return NULL;

--- a/theano/gpuarray/dnn.py
+++ b/theano/gpuarray/dnn.py
@@ -503,18 +503,22 @@ class GpuDnnConv(DnnBase):
    algo : {'small', 'none', 'large', 'fft', 'fft_tiling', 'winograd', 'guess_once',
            'guess_on_shape_change', 'time_once', 'time_on_shape_change'}
        Default is the value of :attr:`config.dnn.conv.algo_fwd`.
+    num_groups :
+        Divides the image, kernel and output tensors into num_groups
+        separate groups, each of which is convolved independently (default: 1).

    """
    _f16_ok = True
-    __props__ = ('algo', 'inplace')
+    __props__ = ('algo', 'inplace', 'num_groups')

    check_input = False
    params_type = ParamsType(conv_algo=cudnn.cudnnConvolutionFwdAlgo_t,
                             choose_algo=bool_t, choose_once=bool_t, choose_time=bool_t,
                             inplace=bool_t,
-                             handle=handle_type)
+                             handle=handle_type,
+                             num_groups=int_t)

-    def __init__(self, algo=None, inplace=False):
+    def __init__(self, algo=None, inplace=False, num_groups=1):
        DnnBase.__init__(self, ["dnn_conv_base.c", "dnn_fwd.c"],
                         "APPLY_SPECIFIC(conv_fwd)")

@@ -534,6 +538,7 @@ class GpuDnnConv(DnnBase):
        self.choose_algo = self.algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME
        self.choose_once = self.algo in DNN_CONV_ALGO_CHOOSE_ONCE
        self.choose_time = self.algo in DNN_CONV_ALGO_CHOOSE_TIME
+        self.num_groups = num_groups

    def __setstate__(self, d):
        self.__dict__.update(d)
@@ -544,6 +549,8 @@ class GpuDnnConv(DnnBase):
                self.algo = config.dnn.conv.algo_fwd
        if not hasattr(self, 'inplace'):
            self.inplace = False
+        if not hasattr(self, 'num_groups'):
+            self.num_groups = 1

    def make_node(self, img, kern, output, desc, alpha=None, beta=None):
        ctx_name = infer_context_name(img, kern, output)
@@ -567,6 +574,8 @@ class GpuDnnConv(DnnBase):
                                                    SUPPORTED_DNN_CONV_ALGO_RUNTIME):
            raise ValueError("convolution algo %s can't be used for "
                             "3d convolutions", (self.algo,))
+        if img.type.ndim == 5 and self.num_groups != 1:
+            raise ValueError("Grouped convolutions not implemented for 3D convolutions")

        if (not isinstance(desc.type, CDataType) or
                desc.type.ctype != 'cudnnConvolutionDescriptor_t'):
@@ -584,8 +593,8 @@ class GpuDnnConv(DnnBase):

        top = gpu_contiguous(top)

-        d_img = GpuDnnConvGradI()(kerns, top, empty_like(img), desc)
-        d_kerns = GpuDnnConvGradW()(img, top, empty_like(kerns), desc)
+        d_img = GpuDnnConvGradI(num_groups=self.num_groups)(kerns, top, empty_like(img), desc)
+        d_kerns = GpuDnnConvGradW(num_groups=self.num_groups)(img, top, empty_like(kerns), desc)
        d_alpha = grad_not_implemented(self, 4, alpha)
        d_beta = grad_not_implemented(self, 5, beta)

@@ -637,18 +646,22 @@ class GpuDnnConvGradW(DnnBase):
    algo : {'none', 'deterministic', 'fft', 'small', 'guess_once',
            'guess_on_shape_change', 'time_once', 'time_on_shape_change'}
        Default is the value of :attr:`config.dnn.conv.algo_bwd_filter`.
+    num_groups :
+        Divides the image, kernel and output tensors into num_groups
+        separate groups, each of which is convolved independently (default: 1).

    """
    _f16_ok = True
-    __props__ = ('algo', 'inplace')
+    __props__ = ('algo', 'inplace', 'num_groups')

    check_input = False
    params_type = ParamsType(conv_algo=cudnn.cudnnConvolutionBwdFilterAlgo_t,
                             choose_algo=bool_t, choose_once=bool_t, choose_time=bool_t,
                             inplace=bool_t,
-                             handle=handle_type)
+                             handle=handle_type,
+                             num_groups=int_t)

-    def __init__(self, inplace=False, algo=None):
+    def __init__(self, inplace=False, algo=None, num_groups=1):
        DnnBase.__init__(self, ["dnn_conv_base.c", "dnn_gw.c"],
                         "APPLY_SPECIFIC(conv_gw)")
        self.inplace = bool(inplace)
@@ -666,6 +679,7 @@ class GpuDnnConvGradW(DnnBase):
        self.choose_algo = self.algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME
        self.choose_once = self.algo in DNN_CONV_ALGO_CHOOSE_ONCE
        self.choose_time = self.algo in DNN_CONV_ALGO_CHOOSE_TIME
+        self.num_groups = num_groups

    def __setstate__(self, d):
        self.__dict__.update(d)
@@ -673,6 +687,8 @@ class GpuDnnConvGradW(DnnBase):
            self.inplace = False
        if not hasattr(self, 'algo'):
            self.algo = config.dnn.conv.algo_bwd_filter
+        if not hasattr(self, 'num_groups'):
+            self.num_groups = 1

    def grad(self, inp, grads):
        img, top, output, desc, alpha, beta = inp
@@ -680,8 +696,8 @@ class GpuDnnConvGradW(DnnBase):

        kerns = gpu_contiguous(kerns)

-        d_img = GpuDnnConvGradI()(kerns, top, empty_like(img), desc)
-        d_top = GpuDnnConv()(img, kerns, empty_like(top), desc)
+        d_img = GpuDnnConvGradI(num_groups=self.num_groups)(kerns, top, empty_like(img), desc)
+        d_top = GpuDnnConv(num_groups=self.num_groups)(img, kerns, empty_like(top), desc)
        d_alpha = grad_not_implemented(self, 4, alpha)
        d_beta = grad_not_implemented(self, 5, beta)

@@ -766,18 +782,22 @@ class GpuDnnConvGradI(DnnBase):
    algo : {'none', 'deterministic', 'fft', 'fft_tiling', 'winograd', 'guess_once',
            'guess_on_shape_change', 'time_once', 'time_on_shape_change'}
        Default is the value of :attr:`config.dnn.conv.algo_bwd_data`.
+    num_groups :
+        Divides the image, kernel and output tensors into num_groups
+        separate groups, each of which is convolved independently (default: 1).

    """
    _f16_ok = True
-    __props__ = ('algo', 'inplace',)
+    __props__ = ('algo', 'inplace', 'num_groups')

    check_input = False
    params_type = ParamsType(conv_algo=cudnn.cudnnConvolutionBwdDataAlgo_t,
                             choose_algo=bool_t, choose_once=bool_t, choose_time=bool_t,
                             inplace=bool_t,
-                             handle=handle_type)
+                             handle=handle_type,
+                             num_groups=int_t)

-    def __init__(self, inplace=False, algo=None):
+    def __init__(self, inplace=False, algo=None, num_groups=1):
        DnnBase.__init__(self, ["dnn_conv_base.c", "dnn_gi.c"],
                         "APPLY_SPECIFIC(conv_gi)")
        self.inplace = bool(inplace)
@@ -795,6 +815,7 @@ class GpuDnnConvGradI(DnnBase):
        self.choose_algo = self.algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME
        self.choose_once = self.algo in DNN_CONV_ALGO_CHOOSE_ONCE
        self.choose_time = self.algo in DNN_CONV_ALGO_CHOOSE_TIME
+        self.num_groups = num_groups

    def __setstate__(self, d):
        self.__dict__.update(d)
@@ -802,6 +823,8 @@ class GpuDnnConvGradI(DnnBase):
            self.algo = config.dnn.conv.algo_bwd_data
        if not hasattr(self, 'inplace'):
            self.inplace = False
+        if not hasattr(self, 'num_groups'):
+            self.num_groups = 1

    def grad(self, inp, grads):
        kerns, top, output, desc, alpha, beta = inp
@@ -809,8 +832,8 @@ class GpuDnnConvGradI(DnnBase):

        img = gpu_contiguous(img)

-        d_kerns = GpuDnnConvGradW()(img, top, empty_like(kerns), desc)
-        d_top = GpuDnnConv()(img, kerns, empty_like(top), desc)
+        d_kerns = GpuDnnConvGradW(num_groups=self.num_groups)(img, top, empty_like(kerns), desc)
+        d_top = GpuDnnConv(num_groups=self.num_groups)(img, kerns, empty_like(top), desc)
        d_alpha = grad_not_implemented(self, 4, alpha)
        d_beta = grad_not_implemented(self, 5, beta)

@@ -859,7 +882,7 @@ class GpuDnnConvGradI(DnnBase):

 def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), dilation=(1, 1),
             conv_mode='conv', direction_hint=None, workmem=None,
-             algo=None, precision=None):
+             algo=None, precision=None, num_groups=1):
    """
    GPU convolution using cuDNN from NVIDIA.

@@ -902,6 +925,9 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), dilation=(1, 1),
        should be done. Possible values are 'as_input', 'float16', 'float32'
        and 'float64'. Default is the value of
        :attr:`config.dnn.conv.precision`.
+    num_groups :
+        Divides the image, kernel and output tensors into num_groups
+        separate groups, each of which is convolved independently (default: 1).


    .. warning:: The cuDNN library only works with GPUs that have a compute
@@ -977,7 +1003,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), dilation=(1, 1),
                                    filter_dilation=dilation)
    out_shp = assert_conv_shape(out_shp)
    out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
-    return GpuDnnConv(algo=algo)(img, kerns, out, desc)
+    return GpuDnnConv(algo=algo, num_groups=num_groups)(img, kerns, out, desc)


 def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1, 1, 1),
@@ -1101,7 +1127,8 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1


 def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid',
-                   subsample=(1, 1), dilation=(1, 1), conv_mode='conv', precision=None):
+                   subsample=(1, 1), dilation=(1, 1), conv_mode='conv',
+                   precision=None, algo=None, num_groups=1):
    """
    TODO: document this
    """
@@ -1116,7 +1143,7 @@ def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid',
    desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, dilation=dilation,
                          conv_mode=conv_mode, precision=precision)(kerns_shp)
    out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*kerns_shp)
-    return GpuDnnConvGradW()(img, topgrad, out, desc)
+    return GpuDnnConvGradW(algo=algo, num_groups=num_groups)(img, topgrad, out, desc)


 def dnn_gradweight3d(img, topgrad, kerns_shp, border_mode='valid',
@@ -1129,7 +1156,8 @@ def dnn_gradweight3d(img, topgrad, kerns_shp, border_mode='valid',


 def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid',
-                  subsample=(1, 1), dilation=(1, 1), conv_mode='conv', precision=None):
+                  subsample=(1, 1), dilation=(1, 1), conv_mode='conv',
+                  precision=None, algo=None, num_groups=1):
    """
    TODO: document this
    """
@@ -1144,7 +1172,7 @@ def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid',
    desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, dilation=dilation,
                          conv_mode=conv_mode, precision=precision)(kerns.shape)
    out = GpuAllocEmpty(dtype=kerns.dtype, context_name=ctx_name)(*img_shp)
-    return GpuDnnConvGradI()(kerns, topgrad, out, desc)
+    return GpuDnnConvGradI(algo=algo, num_groups=num_groups)(kerns, topgrad, out, desc)


 def dnn_gradinput3d(kerns, topgrad, img_shp, border_mode='valid',
@@ -2736,7 +2764,8 @@ def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
                        subsample=op.subsample,
                        dilation=op.filter_dilation,
                        direction_hint='forward!',
-                        conv_mode=conv_mode)
+                        conv_mode=conv_mode,
+                        num_groups=op.num_groups)
    elif isinstance(op, AbstractConv2d_gradWeights):
        shape = (inp2.shape[1], inp1.shape[1],
                 inputs[2][0], inputs[2][1])
@@ -2744,7 +2773,8 @@ def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
                              border_mode=op.border_mode,
                              subsample=op.subsample,
                              dilation=op.filter_dilation,
-                              conv_mode=conv_mode)
+                              conv_mode=conv_mode,
+                              num_groups=op.num_groups)
    elif isinstance(op, AbstractConv2d_gradInputs):
        shape = (inp2.shape[0], inp1.shape[1],
                 inputs[2][0], inputs[2][1])
@@ -2752,7 +2782,8 @@ def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
                             border_mode=op.border_mode,
                             subsample=op.subsample,
                             dilation=op.filter_dilation,
-                             conv_mode=conv_mode)
+                             conv_mode=conv_mode,
+                             num_groups=op.num_groups)
    return [rval]


@@ -2837,17 +2868,17 @@ def local_abstractconv_gi_cudnn(node):

 @inplace_allocempty(GpuDnnConv, 2)
 def local_dnn_conv_inplace(node, inputs):
-    return [GpuDnnConv(algo=node.op.algo, inplace=True)(*inputs)]
+    return [GpuDnnConv(algo=node.op.algo, inplace=True, num_groups=node.op.num_groups)(*inputs)]


 @inplace_allocempty(GpuDnnConvGradW, 2)
 def local_dnn_convgw_inplace(node, inputs):
-    return [GpuDnnConvGradW(algo=node.op.algo, inplace=True)(*inputs)]
+    return [GpuDnnConvGradW(algo=node.op.algo, inplace=True, num_groups=node.op.num_groups)(*inputs)]


 @inplace_allocempty(GpuDnnConvGradI, 2)
 def local_dnn_convgi_inplace(node, inputs):
-    return [GpuDnnConvGradI(algo=node.op.algo, inplace=True)(*inputs)]
+    return [GpuDnnConvGradI(algo=node.op.algo, inplace=True, num_groups=node.op.num_groups)(*inputs)]

 optdb.register('local_dnna_conv_inplace',
               tensor.opt.in2out(local_dnn_conv_inplace,
@@ -2860,19 +2891,19 @@ optdb.register('local_dnna_conv_inplace',
 @register_opt('cudnn')
 @alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5)
 def local_dnn_conv_alpha_merge(node, *inputs):
-    return [GpuDnnConv(algo=node.op.algo)(*inputs)]
+    return [GpuDnnConv(algo=node.op.algo, num_groups=node.op.num_groups)(*inputs)]


 @register_opt('cudnn')
 @alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5)
 def local_dnn_convw_alpha_merge(node, *inputs):
-    return [GpuDnnConvGradW(algo=node.op.algo)(*inputs)]
+    return [GpuDnnConvGradW(algo=node.op.algo, num_groups=node.op.num_groups)(*inputs)]


 @register_opt('cudnn')
 @alpha_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5)
 def local_dnn_convi_alpha_merge(node, *inputs):
-    return [GpuDnnConvGradI(algo=node.op.algo)(*inputs)]
+    return [GpuDnnConvGradI(algo=node.op.algo, num_groups=node.op.num_groups)(*inputs)]


 @register_opt('cudnn')

--- a/theano/gpuarray/dnn_base.c
+++ b/theano/gpuarray/dnn_base.c
 #section support_code

 static int
-c_set_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) {
+c_set_tensor_for_conv(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc, size_t groups) {
  cudnnDataType_t dt;
  size_t ds;
  switch (var->ga.typecode) {
@@ -42,7 +42,8 @@ c_set_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) {
    strs[i] = 1;
    dims[i] = 1;
  }
-
+  // Channel split for grouped convolution; a no-op when groups == 1.
+  dims[1] = dims[1] / groups;
  cudnnStatus_t err = cudnnSetTensorNdDescriptor(desc, dt, nd < 3 ? 3 : nd,
                                                 dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
@@ -54,6 +55,11 @@ c_set_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) {
  return 0;
 }

+static int
+c_set_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) {
+ return c_set_tensor_for_conv(var, desc, 1);
+}
+
 static int c_make_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t *desc) {
  cudnnStatus_t err;
  err = cudnnCreateTensorDescriptor(desc);
@@ -71,7 +77,7 @@ static int c_make_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t *desc)
 }

 static int
-c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) {
+c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc, size_t groups) {
  cudnnDataType_t dt;
  cudnnStatus_t err;

@@ -111,6 +117,7 @@ c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) {
  /* Filters can't be less than 3d so we pad */
  for (unsigned int i = nd; i < 3; i++)
    dims[i] = 1;
+  dims[0] = dims[0] / groups;

  if (nd < 3)
    nd = 3;
@@ -135,7 +142,7 @@ static int c_make_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t *desc) {
                 cudnnGetErrorString(err));
    return -1;
  }
-  if (c_set_filter(var, *desc) != 0) {
+  if (c_set_filter(var, *desc, 1) != 0) {
    cudnnDestroyFilterDescriptor(*desc);
    return -1;
  }

--- a/theano/gpuarray/dnn_fwd.c
+++ b/theano/gpuarray/dnn_fwd.c
@@ -29,7 +29,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
  float af = alpha, bf = beta;
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;

-  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
+  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1] * params->num_groups) {
    PyErr_SetString(PyExc_ValueError,
 		    "images and kernel must have the same stack size");
    return 1;
@@ -72,12 +72,15 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
    return 0;
  }

-  if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
+  if (c_set_tensor_for_conv(input, APPLY_SPECIFIC(input), params->num_groups) == -1)
    return 1;
-  if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
+  if (c_set_filter(kerns, APPLY_SPECIFIC(kerns), params->num_groups) == -1)
    return 1;
-  if (c_set_tensorNd(*output, APPLY_SPECIFIC(output)) == -1)
+  if (c_set_tensor_for_conv(*output, APPLY_SPECIFIC(output), params->num_groups) == -1)
    return 1;
+  size_t input_offset = PyGpuArray_STRIDE(input, 0) / params->num_groups;
+  size_t kern_offset = PyGpuArray_STRIDE(kerns, 0) * PyGpuArray_DIM(kerns, 0) / params->num_groups;
+  size_t output_offset = PyGpuArray_STRIDE(*output, 0) / params->num_groups;

  cudnnConvolutionFwdAlgo_t algo = params->conv_algo;

@@ -281,15 +284,17 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
    cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
    cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);

+    for ( int g = 0; g < params->num_groups; g++) {
    err = cudnnConvolutionForward(
      params->handle,
      alpha_p,
-      APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
-      APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
+      APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input) + input_offset * g,
+      APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns) + kern_offset * g,
      desc, algo,
      worksize == 0 ? NULL : *(void **)workspace, worksize,
      beta_p,
-      APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output));
+      APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output) + output_offset * g);
+    }

    if (worksize != 0)
      gpudata_release(workspace);

--- a/theano/gpuarray/dnn_gi.c
+++ b/theano/gpuarray/dnn_gi.c
@@ -28,7 +28,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
  float af = alpha, bf = beta;
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;

-  if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1]) {
+  if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1] * params->num_groups) {
    PyErr_SetString(PyExc_ValueError, "images and kernel must have the same "
                    "stack size");
    return 1;
@@ -71,12 +71,15 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
    return 0;
  }

-  if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
+  if (c_set_tensor_for_conv(output, APPLY_SPECIFIC(output), params->num_groups) == -1)
    return 1;
-  if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
+  if (c_set_filter(kerns, APPLY_SPECIFIC(kerns), params->num_groups) == -1)
    return 1;
-  if (c_set_tensorNd(*input, APPLY_SPECIFIC(input)) == -1)
+  if (c_set_tensor_for_conv(*input, APPLY_SPECIFIC(input), params->num_groups) == -1)
    return 1;
+  size_t input_offset = PyGpuArray_STRIDE(*input, 0) / params->num_groups;
+  size_t kern_offset = PyGpuArray_STRIDE(kerns, 0) * PyGpuArray_DIM(kerns, 0) / params->num_groups;
+  size_t output_offset = PyGpuArray_STRIDE(output, 0) / params->num_groups;

  cudnnConvolutionBwdDataAlgo_t algo = params->conv_algo;

@@ -93,7 +96,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
  }
  if (PyGpuArray_NDIM(im) == 4) {
    if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
-        (PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
+        (PyGpuArray_DIMS(output)[1] / params->num_groups != expected_output_dims[1]) ||
        (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
        (PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) {
      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ld"
@@ -286,14 +289,17 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
  cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_wait((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);

-  err = cudnnConvolutionBackwardData(
-    params->handle,
-    alpha_p,
-    APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
-    APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
-    desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize,
-    beta_p,
-    APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input));
+  for ( int g = 0; g < params->num_groups; g++)
+  {
+    err = cudnnConvolutionBackwardData(
+      params->handle,
+      alpha_p,
+      APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns) + kern_offset * g,
+      APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output) + output_offset * g,
+      desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize,
+      beta_p,
+      APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input) + input_offset * g);
+  }

  if (worksize != 0)
    gpudata_release(workspace);

--- a/theano/gpuarray/dnn_gw.c
+++ b/theano/gpuarray/dnn_gw.c
@@ -28,7 +28,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
  float af = alpha, bf = beta;
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;

-  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1]) {
+  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1] * params->num_groups) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuDnnConv images and kernel must have the same stack size");
    return 1;
@@ -71,13 +71,17 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
    return 0;
  }

-  if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
+  if (c_set_tensor_for_conv(input, APPLY_SPECIFIC(input), params->num_groups) == -1)
    return 1;
-  if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
+  if (c_set_tensor_for_conv(output, APPLY_SPECIFIC(output), params->num_groups) == -1)
    return 1;
-  if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1)
+  if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns), params->num_groups) == -1)
    return 1;

+  size_t input_offset = PyGpuArray_STRIDE(input, 0) / params->num_groups;
+  size_t kern_offset = PyGpuArray_STRIDE(*kerns, 0) * PyGpuArray_DIM(*kerns, 0) / params->num_groups;
+  size_t output_offset = PyGpuArray_STRIDE(output, 0) / params->num_groups;
+
  cudnnConvolutionBwdFilterAlgo_t algo = params->conv_algo;

  cuda_enter(c->ctx);
@@ -93,7 +97,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
  }
  if (PyGpuArray_NDIM(input) == 4) {
    if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
-        (PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
+        (PyGpuArray_DIMS(output)[1] / params->num_groups != expected_output_dims[1]) ||
        (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
        (PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) {
      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%dx%ld"
@@ -273,14 +277,18 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
  cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);

-  err = cudnnConvolutionBackwardFilter(
-    params->handle,
-    alpha_p,
-    APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
-    APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
-    desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize,
-    beta_p,
-    APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns));
+  for ( int g = 0; g < params->num_groups; g++)
+  {
+
+    err = cudnnConvolutionBackwardFilter(
+      params->handle,
+      alpha_p,
+      APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input) + input_offset * g ,
+      APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output) + output_offset * g,
+      desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize,
+      beta_p,
+      APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns) + kern_offset * g);
+  }

  if (worksize != 0)
    gpudata_release(workspace);

--- a/theano/gpuarray/opt.py
+++ b/theano/gpuarray/opt.py
@@ -1533,7 +1533,8 @@ def local_abstractconv_gemm(node):
    border_mode = node.op.border_mode
    subsample = node.op.subsample
    filter_dilation = node.op.filter_dilation
-    if ((border_mode == 'full') and (subsample == (1, 1))):
+
+    if ((border_mode == 'full') and (subsample == (1, 1)) and node.op.num_groups == 1):
        if not node.op.filter_flip:
            kern = kern[:, :, ::-1, ::-1]
        # need to dimshuffle the kernel for full convolution
@@ -1550,8 +1551,9 @@ def local_abstractconv_gemm(node):
        # By default use GpuCorrMM
        rval = GpuCorrMM(border_mode,
                         subsample,
-                         filter_dilation)(gpu_contiguous(img),
-                                          gpu_contiguous(kern))
+                         filter_dilation,
+                         node.op.num_groups)(gpu_contiguous(img),
+                                             gpu_contiguous(kern))

        # call GpuCorrMM_gradWeights if good
        # (the latter is faster if batchsize * kernelHeight * kernelWidth
@@ -1669,7 +1671,8 @@ def local_abstractconv_gradweights_gemm(node):

    rval = GpuCorrMM_gradWeights(border_mode=node.op.border_mode,
                                 subsample=node.op.subsample,
-                                 filter_dilation=node.op.filter_dilation)(
+                                 filter_dilation=node.op.filter_dilation,
+                                 num_groups=node.op.num_groups)(
        gpu_contiguous(img), gpu_contiguous(topgrad), shape)
    if node.op.filter_flip:
        rval = rval[:, :, ::-1, ::-1]
@@ -1713,7 +1716,8 @@ def local_abstractconv_gradinputs_gemm(node):

    rval = GpuCorrMM_gradInputs(border_mode=node.op.border_mode,
                                subsample=node.op.subsample,
-                                filter_dilation=node.op.filter_dilation)(
+                                filter_dilation=node.op.filter_dilation,
+                                num_groups=node.op.num_groups)(
        gpu_contiguous(kern), gpu_contiguous(topgrad), shape)
    return [rval]


--- a/theano/gpuarray/tests/test_dnn.py
+++ b/theano/gpuarray/tests/test_dnn.py
@@ -25,6 +25,7 @@ from . import test_nnet
 from .rnn_support import Model, GRU, LSTM, WrapperLayer

 from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_FWD
+from theano.tensor.nnet.tests.test_abstract_conv import Grouped_conv_noOptim

 try:
    import pygpu
@@ -2263,3 +2264,37 @@ def test_dnn_rnn_lstm_grad_c():
                                           (i + 1) * len(cudnn_grads_layer)]
        for j, g in enumerate(cudnn_grads_layer):
            utt.assert_allclose(ref_grads_layer[j], g)
+
+
+def dconv2d(border_mode, subsample, filter_dilation, num_groups):
+    def dconv(img, kern):
+        return dnn.dnn_conv(img, kern, border_mode=border_mode, subsample=subsample, dilation=filter_dilation,
+                            conv_mode='conv', direction_hint='forward', workmem=None,
+                            algo=None, precision=None, num_groups=num_groups)
+    return dconv
+
+
+def dconv2dw(border_mode, subsample, filter_dilation, num_groups):
+    def dconvw(img, topgrad, kshp):
+        return dnn.dnn_gradweight(img, topgrad, kshp, border_mode=border_mode, subsample=subsample, dilation=filter_dilation,
+                                  conv_mode='conv', precision=None, algo=None, num_groups=num_groups)
+    return dconvw
+
+
+def dconv2di(border_mode, subsample, filter_dilation, num_groups):
+    def dconvi(kern, topgrad, imshp):
+        return dnn.dnn_gradinput(kern, topgrad, imshp, border_mode=border_mode, subsample=subsample, dilation=filter_dilation,
+                                 conv_mode='conv', precision=None, algo=None, num_groups=num_groups)
+    return dconvi
+
+
+class Cudnn_grouped_conv(Grouped_conv_noOptim):
+    mode = mode_with_gpu
+    conv2d = staticmethod(dconv2d)
+    conv2d_gradw = staticmethod(dconv2dw)
+    conv2d_gradi = staticmethod(dconv2di)
+    conv2d_op = dnn.GpuDnnConv
+    conv2d_gradw_op = dnn.GpuDnnConvGradW
+    conv2d_gradi_op = dnn.GpuDnnConvGradI
+    flip_filter = False
+    is_dnn = True
--- a/theano/gpuarray/tests/test_gemmcorr.py
+++ b/theano/gpuarray/tests/test_gemmcorr.py
@@ -11,6 +11,7 @@ from theano.tensor.nnet.corr import CorrMM, CorrMM_gradWeights, CorrMM_gradInput
 from ..type import gpuarray_shared_constructor
 from ..blas import GpuCorrMM, GpuCorrMM_gradWeights, GpuCorrMM_gradInputs
 from .config import mode_with_gpu, mode_without_gpu, ref_cast
+from theano.tensor.nnet.tests.test_abstract_conv import Grouped_conv_noOptim


 class TestCorrMM(unittest.TestCase):
@@ -219,3 +220,15 @@ class TestCorrMM(unittest.TestCase):
                            verify_grad=False)
        self.run_gradinput(inputs_shape=(1, 1024, 3, 1),
                           filters_shape=(1, 1, 1, 1024))
+
+
+class TestGroupGpuCorr2d(Grouped_conv_noOptim):
+    mode = theano.compile.get_mode("FAST_RUN")
+    conv2d = GpuCorrMM
+    conv2d_gradw = GpuCorrMM_gradWeights
+    conv2d_gradi = GpuCorrMM_gradInputs
+    conv2d_op = GpuCorrMM
+    conv2d_gradw_op = GpuCorrMM_gradWeights
+    conv2d_gradi_op = GpuCorrMM_gradInputs
+    flip_filter = True
+    is_dnn = False
--- a/theano/tensor/nnet/__init__.py
+++ b/theano/tensor/nnet/__init__.py
@@ -39,7 +39,7 @@ from .abstract_conv import conv3d

 def conv2d(input, filters, input_shape=None, filter_shape=None,
           border_mode='valid', subsample=(1, 1), filter_flip=True,
-           image_shape=None, filter_dilation=(1, 1), **kwargs):
+           image_shape=None, filter_dilation=(1, 1), num_groups=1, **kwargs):
    """
    This function will build the symbolic graph for convolving a mini-batch of a
    stack of 2D inputs with a set of 2D filters. The implementation is modelled
@@ -103,6 +103,10 @@ def conv2d(input, filters, input_shape=None, filter_shape=None,
        Factor by which to subsample (stride) the input.
        Also called dilation elsewhere.

+    num_groups : int
+        Divides the image, kernel and output tensors into num_groups
+        separate groups, each of which carries out convolutions separately.
+
    kwargs: Any other keyword arguments are accepted for backwards
            compatibility, but will be ignored.

@@ -152,12 +156,12 @@ def conv2d(input, filters, input_shape=None, filter_shape=None,

    return abstract_conv2d(input, filters, input_shape, filter_shape,
                           border_mode, subsample, filter_flip,
-                           filter_dilation)
+                           filter_dilation, num_groups)


 def conv2d_transpose(input, filters, output_shape, filter_shape=None,
                     border_mode='valid', input_dilation=(1, 1),
-                     filter_flip=True, filter_dilation=(1, 1)):
+                     filter_flip=True, filter_dilation=(1, 1), num_groups=1):
    """
    This function will build the symbolic graph for applying a transposed
    convolution over a mini-batch of a stack of 2D inputs with a set of 2D
@@ -209,6 +213,10 @@ def conv2d_transpose(input, filters, output_shape, filter_shape=None,
        Factor by which to subsample (stride) the input.
        Also called dilation elsewhere.

+    num_groups : int
+        Divides the image, kernel and output tensors into num_groups
+        separate groups, each of which carries out convolutions separately.
+
    Returns
    -------
    Symbolic 4D tensor
@@ -235,4 +243,5 @@ def conv2d_transpose(input, filters, output_shape, filter_shape=None,
                                  border_mode=border_mode,
                                  subsample=input_dilation,
                                  filter_flip=filter_flip,
-                                  filter_dilation=filter_dilation)
+                                  filter_dilation=filter_dilation,
+                                  num_groups=num_groups)
--- a/theano/tensor/nnet/abstract_conv.py
+++ b/theano/tensor/nnet/abstract_conv.py
@@ -66,7 +66,6 @@ def get_conv_output_shape(image_shape, kernel_shape,
    """
    bsize, imshp = image_shape[0], image_shape[2:]
    nkern, kshp = kernel_shape[0], kernel_shape[2:]
-
    if filter_dilation is None:
        filter_dilation = np.ones(len(subsample), dtype='int')

@@ -139,7 +138,8 @@ def get_conv_shape_1axis(image_shape, kernel_shape, border_mode,

 def get_conv_gradweights_shape(image_shape, top_shape,
                               border_mode, subsample,
-                               filter_dilation=None):
+                               filter_dilation=None,
+                               num_groups=1):
    """
    This function tries to compute the kernel shape of convolution gradWeights.

@@ -167,6 +167,8 @@ def get_conv_gradweights_shape(image_shape, top_shape,
    filter_dilation: tuple of int (symbolic or numeric). Its two or three
        elements correspond respectively to the dilation on height and
        width axis.
+    num_groups: An int which specifies the number of separate groups to
+        be divided into.

    Returns
    -------
@@ -181,6 +183,9 @@ def get_conv_gradweights_shape(image_shape, top_shape,

    if filter_dilation is None:
        filter_dilation = np.ones(len(subsample), dtype='int')
+    if num_groups > 1:
+        assert len(subsample) == 2
+        nchan = nchan // num_groups

    if isinstance(border_mode, tuple):
        out_shp = tuple(get_conv_gradweights_shape_1axis(
@@ -245,7 +250,8 @@ def get_conv_gradweights_shape_1axis(image_shape, top_shape, border_mode,

 def get_conv_gradinputs_shape(kernel_shape, top_shape,
                              border_mode, subsample,
-                              filter_dilation=None):
+                              filter_dilation=None,
+                              num_groups=1):
    """
    This function tries to compute the image shape of convolution gradInputs.

@@ -273,6 +279,8 @@ def get_conv_gradinputs_shape(kernel_shape, top_shape,
    filter_dilation: tuple of int (symbolic or numeric). Its two or three
        elements correspond respectively to the dilation on height and
        width axis.
+    num_groups: An int which specifies the number of separate groups to
+        be divided into.

    Returns
    -------
@@ -286,6 +294,9 @@ def get_conv_gradinputs_shape(kernel_shape, top_shape,

    if filter_dilation is None:
        filter_dilation = np.ones(len(subsample), dtype='int')
+    if num_groups > 1:
+        assert len(subsample) == 2
+        nkern = nkern * num_groups

    if isinstance(border_mode, tuple):
        out_shp = tuple(get_conv_gradinputs_shape_1axis(
@@ -512,7 +523,8 @@ def conv2d(input,
           border_mode='valid',
           subsample=(1, 1),
           filter_flip=True,
-           filter_dilation=(1, 1)):
+           filter_dilation=(1, 1),
+           num_groups=1):
    """This function will build the symbolic graph for convolving a mini-batch of a
    stack of 2D inputs with a set of 2D filters. The implementation is modelled
    after Convolutional Neural Networks (CNN).
@@ -527,7 +539,8 @@ def conv2d(input,
                             border_mode=border_mode,
                             subsample=subsample,
                             filter_flip=filter_flip,
-                             filter_dilation=filter_dilation)
+                             filter_dilation=filter_dilation,
+                             num_groups=num_groups)
    return conv_op(input, filters)


@@ -637,7 +650,8 @@ def conv2d_grad_wrt_inputs(output_grad,
                           border_mode='valid',
                           subsample=(1, 1),
                           filter_flip=True,
-                           filter_dilation=(1, 1)):
+                           filter_dilation=(1, 1),
+                           num_groups=1):
    """Compute conv output gradient w.r.t its inputs

    This function builds the symbolic graph for getting the
@@ -710,6 +724,9 @@ def conv2d_grad_wrt_inputs(output_grad,
    filter_dilation : tuple of len 2
        The filter dilation used in the forward pass.
        Also known as input striding.
+    num_groups : int
+        Divides the image, kernel and output tensors into num_groups
+        separate groups, each of which carries out convolutions separately.

    Returns
    -------
@@ -760,7 +777,8 @@ def conv2d_grad_wrt_inputs(output_grad,
                                              border_mode=border_mode,
                                              subsample=subsample,
                                              filter_flip=filter_flip,
-                                              filter_dilation=filter_dilation)
+                                              filter_dilation=filter_dilation,
+                                              num_groups=num_groups)

    return grad_input_op(filters, output_grad, input_shape[-2:])

@@ -907,7 +925,8 @@ def conv2d_grad_wrt_weights(input,
                            border_mode='valid',
                            subsample=(1, 1),
                            filter_flip=True,
-                            filter_dilation=(1, 1)):
+                            filter_dilation=(1, 1),
+                            num_groups=1):
    """Compute conv output gradient w.r.t its weights

    This function will build the symbolic graph for getting the
@@ -972,6 +991,9 @@ def conv2d_grad_wrt_weights(input,
    filter_dilation : tuple of len 2
        The filter dilation used in the forward pass.
        Also known as input striding.
+    num_groups : int
+        Divides the image, kernel and output tensors into num_groups
+        separate groups, each of which carries out convolutions separately.

    Returns
    -------
@@ -1022,7 +1044,8 @@ def conv2d_grad_wrt_weights(input,
                                               border_mode=border_mode,
                                               subsample=subsample,
                                               filter_flip=filter_flip,
-                                               filter_dilation=filter_dilation)
+                                               filter_dilation=filter_dilation,
+                                               num_groups=num_groups)

    return gradWeight_op(input, output_grad, filter_shape[-2:])

@@ -1392,11 +1415,11 @@ class BaseAbstractConv(Op):
    """
    check_broadcast = False
    __props__ = ('convdim', 'border_mode', 'subsample', 'filter_flip',
-                 'imshp', 'kshp', 'filter_dilation')
+                 'imshp', 'kshp', 'filter_dilation', 'num_groups')

    def __init__(self, convdim,
                 imshp=None, kshp=None, border_mode="valid",
-                 subsample=None, filter_flip=True, filter_dilation=None):
+                 subsample=None, filter_flip=True, filter_dilation=None, num_groups=1):

        self.convdim = convdim
        if convdim not in (2, 3):
@@ -1458,6 +1481,11 @@ class BaseAbstractConv(Op):
        if len(filter_dilation) != convdim:
            raise ValueError("filter_dilation must have {} elements".format(convdim))
        self.filter_dilation = tuple(filter_dilation)
+        if num_groups < 1:
+            raise ValueError("num_groups must have value greater than zero")
+        elif num_groups > 1 and convdim == 3:
+            raise ValueError("grouped convolution not supported for 3D convolutions")
+        self.num_groups = num_groups

    def do_constant_folding(self, node):
        # Disable constant folding since there is no implementation.
@@ -1471,20 +1499,20 @@ class BaseAbstractConv(Op):
            # flops for any direction, sampling, padding, and border mode
            inputs, filters = inp
            outputs, = outp
-            assert inputs[1] == filters[1]
+            assert inputs[1] == (filters[1] * self.num_groups)
            # nb mul and add by output pixel
            flops = filters[2] * filters[3] * 2
            # nb flops by output image
            flops *= outputs[2] * outputs[3]
            # nb patch multiplied
-            flops *= inputs[1] * filters[0] * inputs[0]
+            flops *= inputs[1] * filters[0] * inputs[0] / self.num_groups
            return flops
        else:
            # TODO implement for convdim == 3
            raise NotImplementedError(
                'flops not implemented for convdim={}', self.convdim)

-    def conv(self, img, kern, mode="valid", dilation=1):
+    def conv(self, img, kern, mode="valid", dilation=1, num_groups=1):
        """
        Basic slow Python 2D or 3D convolution for DebugMode
        """
@@ -1517,18 +1545,31 @@ class BaseAbstractConv(Op):
                     ] = kern

        if self.convdim == 2:
+            if img.shape[1] % self.num_groups != 0:
+                raise ValueError(
+                    'number of input channels must be divible by num_groups')
+            if kern.shape[0] % self.num_groups != 0:
+                raise ValueError(
+                    'number of filters must be divisible by num_groups')
+            if img.shape[1] // num_groups != kern.shape[1]:
+                raise ValueError(
+                    'the number of input channels in the kernel should '
+                    'specify the number of channels of 1 group')
            val = _valfrommode(mode)
            bval = _bvalfromboundary('fill')
+            input_channel_offset = img.shape[1] // self.num_groups
+            output_channel_offset = kern.shape[0] // self.num_groups

            with warnings.catch_warnings():
                warnings.simplefilter('ignore', np.ComplexWarning)
                for b in xrange(img.shape[0]):
-                    for n in xrange(kern.shape[0]):
-                        for im0 in xrange(img.shape[1]):
-                            # some cast generates a warning here
-                            out[b, n, ...] += _convolve2d(img[b, im0, ...],
-                                                          dilated_kern[n, im0, ...],
-                                                          1, val, bval, 0)
+                    for g in xrange(self.num_groups):
+                        for n in xrange(output_channel_offset):
+                            for im0 in xrange(input_channel_offset):
+                                # some cast generates a warning here
+                                out[b, g * output_channel_offset + n, ...] += _convolve2d(img[b, g * input_channel_offset + im0, ...],
+                                                                                          dilated_kern[g * output_channel_offset + n,
+                                                                                          im0, ...], 1, val, bval, 0)
        elif self.convdim == 3:
            for b in xrange(img.shape[0]):
                for n in xrange(kern.shape[0]):
@@ -1554,13 +1595,15 @@ class AbstractConv(BaseAbstractConv):
                 border_mode="valid",
                 subsample=None,
                 filter_flip=True,
-                 filter_dilation=None):
+                 filter_dilation=None,
+                 num_groups=1):
        super(AbstractConv, self).__init__(convdim=convdim,
                                           imshp=imshp, kshp=kshp,
                                           border_mode=border_mode,
                                           subsample=subsample,
                                           filter_flip=filter_flip,
-                                           filter_dilation=filter_dilation)
+                                           filter_dilation=filter_dilation,
+                                           num_groups=num_groups)

    def make_node(self, img, kern):
        # Make sure both inputs are Variables with the same Type
@@ -1622,7 +1665,7 @@ class AbstractConv(BaseAbstractConv):
            img = new_img
        if not self.filter_flip:
            kern = kern[(slice(None), slice(None)) + (slice(None, None, -1),) * self.convdim]
-        conv_out = self.conv(img, kern, mode="valid", dilation=self.filter_dilation)
+        conv_out = self.conv(img, kern, mode="valid", dilation=self.filter_dilation, num_groups=self.num_groups)
        conv_out = conv_out[(slice(None), slice(None)) +
                            tuple(slice(None, None, self.subsample[i])
                                  for i in range(self.convdim))]
@@ -1630,6 +1673,9 @@ class AbstractConv(BaseAbstractConv):
        o[0] = node.outputs[0].type.filter(conv_out)

    def R_op(self, inputs, eval_points):
+        if self.num_groups > 1:
+            raise NotImplementedError(
+                'Rop not implemented for grouped convolutions')
        rval = None
        if eval_points[0] is not None:
            rval = self.make_node(eval_points[0], inputs[1]).outputs[0]
@@ -1668,13 +1714,15 @@ class AbstractConv2d(AbstractConv):
                 border_mode="valid",
                 subsample=(1, 1),
                 filter_flip=True,
-                 filter_dilation=(1, 1)):
+                 filter_dilation=(1, 1),
+                 num_groups=1):
        super(AbstractConv2d, self).__init__(convdim=2,
                                             imshp=imshp, kshp=kshp,
                                             border_mode=border_mode,
                                             subsample=subsample,
                                             filter_flip=filter_flip,
-                                             filter_dilation=filter_dilation)
+                                             filter_dilation=filter_dilation,
+                                             num_groups=num_groups)

    def grad(self, inp, grads):
        bottom, weights = inp
@@ -1684,13 +1732,15 @@ class AbstractConv2d(AbstractConv):
                                             self.border_mode,
                                             self.subsample,
                                             self.filter_flip,
-                                             self.filter_dilation)(
+                                             self.filter_dilation,
+                                             num_groups=self.num_groups)(
            weights, top, bottom.shape[-2:], add_assert_shape=False)
        d_weights = AbstractConv2d_gradWeights(self.imshp, self.kshp,
                                               self.border_mode,
                                               self.subsample,
                                               self.filter_flip,
-                                               self.filter_dilation)(
+                                               self.filter_dilation,
+                                               num_groups=self.num_groups)(

            bottom, top, weights.shape[-2:], add_assert_shape=False)

@@ -1772,13 +1822,15 @@ class AbstractConv_gradWeights(BaseAbstractConv):
                 border_mode="valid",
                 subsample=None,
                 filter_flip=True,
-                 filter_dilation=None):
+                 filter_dilation=None,
+                 num_groups=1):
        super(AbstractConv_gradWeights, self).__init__(convdim=convdim,
                                                       imshp=imshp, kshp=kshp,
                                                       border_mode=border_mode,
                                                       subsample=subsample,
                                                       filter_flip=filter_flip,
-                                                       filter_dilation=filter_dilation)
+                                                       filter_dilation=filter_dilation,
+                                                       num_groups=num_groups)

    # Update shape/height_width
    def make_node(self, img, topgrad, shape, add_assert_shape=True):
@@ -1856,7 +1908,19 @@ class AbstractConv_gradWeights(BaseAbstractConv):
                        (slice(None, None, -1),) * self.convdim)
        topgrad = topgrad.transpose(axes_order)[flip_filters]
        img = img.transpose(axes_order)
-        kern = self.conv(img, topgrad, mode="valid")
+
+        def correct_for_groups(mat):  # move the group factor from axis 0 to axis 1: (g*n, c, ...) -> (n, g*c, ...)
+            mshp0 = mat.shape[0] // self.num_groups
+            mshp1 = mat.shape[1] * self.num_groups
+            mat = mat.reshape((self.num_groups, mshp0) + mat.shape[1:])  # split axis 0 into (group, row)
+            mat = mat.transpose((1, 0, 2) + tuple(range(3, 3 + self.convdim)))  # rank-generic, not fixed to 2d
+            mat = mat.reshape((mshp0, mshp1) + mat.shape[-self.convdim:])  # fold group into axis 1
+            return mat
+
+        if self.num_groups > 1:
+            img = correct_for_groups(img)
+
+        kern = self.conv(img, topgrad, mode="valid", num_groups=self.num_groups)
        if any(self.filter_dilation[i] > 1 for i in range(self.convdim)):
            kern = kern[(slice(None), slice(None)) +
                        tuple(slice(None, None, self.filter_dilation[i])
@@ -1878,8 +1942,12 @@ class AbstractConv_gradWeights(BaseAbstractConv):
        imshp = input_shapes[0]
        topshp = input_shapes[1]
        kshp = self.kshp[:] if self.kshp is not None else [None] * (2 + self.convdim)
-        fallback_kshp = ([topshp[1], imshp[1]] +
-                         [node.inputs[2][i] for i in range(self.convdim)])
+        if self.num_groups > 1:
+            fallback_kshp = ([topshp[1], imshp[1] // self.num_groups] +
+                             [node.inputs[2][i] for i in range(self.convdim)])
+        else:
+            fallback_kshp = ([topshp[1], imshp[1]] +
+                             [node.inputs[2][i] for i in range(self.convdim)])
        kshp = [fallback_kshp[i] if kshp[i] is None else kshp[i]
                for i in range(2 + self.convdim)]
        return [kshp]
@@ -1901,13 +1969,15 @@ class AbstractConv2d_gradWeights(AbstractConv_gradWeights):
                 border_mode="valid",
                 subsample=(1, 1),
                 filter_flip=True,
-                 filter_dilation=(1, 1)):
+                 filter_dilation=(1, 1),
+                 num_groups=1):
        super(AbstractConv2d_gradWeights, self).__init__(convdim=2,
                                                         imshp=imshp, kshp=kshp,
                                                         border_mode=border_mode,
                                                         subsample=subsample,
                                                         filter_flip=filter_flip,
-                                                         filter_dilation=filter_dilation)
+                                                         filter_dilation=filter_dilation,
+                                                         num_groups=num_groups)

    def grad(self, inp, grads):
        bottom, top = inp[:2]
@@ -1916,15 +1986,17 @@ class AbstractConv2d_gradWeights(AbstractConv_gradWeights):
                                             self.border_mode,
                                             self.subsample,
                                             self.filter_flip,
-                                             self.filter_dilation)(weights,
-                                                                   top,
-                                                                   bottom.shape[-2:])
+                                             self.filter_dilation,
+                                             self.num_groups)(weights,
+                                                              top,
+                                                              bottom.shape[-2:])
        d_top = AbstractConv2d(self.imshp,
                               self.kshp,
                               self.border_mode,
                               self.subsample,
                               self.filter_flip,
-                               self.filter_dilation)(bottom, weights)
+                               self.filter_dilation,
+                               self.num_groups)(bottom, weights)
        # Make sure that the broadcastable pattern of the inputs is used
        # for the gradients, even if the grad opts are not able to infer
        # that the dimensions are broadcastable.
@@ -2011,13 +2083,15 @@ class AbstractConv_gradInputs(BaseAbstractConv):
                 border_mode="valid",
                 subsample=None,
                 filter_flip=True,
-                 filter_dilation=None):
+                 filter_dilation=None,
+                 num_groups=1):
        super(AbstractConv_gradInputs, self).__init__(convdim=convdim,
                                                      imshp=imshp, kshp=kshp,
                                                      border_mode=border_mode,
                                                      subsample=subsample,
                                                      filter_flip=filter_flip,
-                                                      filter_dilation=filter_dilation)
+                                                      filter_dilation=filter_dilation,
+                                                      num_groups=num_groups)

    # Update shape/height_width
    def make_node(self, kern, topgrad, shape, add_assert_shape=True):
@@ -2041,8 +2115,12 @@ class AbstractConv_gradInputs(BaseAbstractConv):
                                'filters does not match given kshp.')

        shape = as_tensor_variable(shape)
-        broadcastable = [topgrad.type.broadcastable[0],
-                         kern.type.broadcastable[1]] + ([False] * self.convdim)
+        if self.num_groups > 1:
+            broadcastable = [topgrad.type.broadcastable[0],
+                             False] + ([False] * self.convdim)
+        else:
+            broadcastable = [topgrad.type.broadcastable[0],
+                             kern.type.broadcastable[1]] + ([False] * self.convdim)
        output = kern.type.clone(broadcastable=broadcastable)()
        return Apply(self, [kern, topgrad, shape], [output])

@@ -2097,10 +2175,20 @@ class AbstractConv_gradInputs(BaseAbstractConv):
        axes_order = (1, 0) + tuple(range(2, self.convdim + 2))
        flip_filters = ((slice(None), slice(None)) +
                        (slice(None, None, -1),) * self.convdim)
+
+        def correct_for_groups(mat):  # move the group factor from axis 0 to axis 1: (g*n, c, ...) -> (n, g*c, ...)
+            mshp0 = mat.shape[0] // self.num_groups
+            mshp1 = mat.shape[1] * self.num_groups
+            mat = mat.reshape((self.num_groups, mshp0) + mat.shape[1:])  # split axis 0 into (group, row)
+            mat = mat.transpose((1, 0, 2) + tuple(range(3, 3 + self.convdim)))  # rank-generic: called even when num_groups == 1
+            mat = mat.reshape((mshp0, mshp1) + mat.shape[-self.convdim:])  # fold group into axis 1
+            return mat
+        kern = correct_for_groups(kern)
        kern = kern.transpose(axes_order)
+
        if self.filter_flip:
            topgrad = topgrad[flip_filters]
-        img = self.conv(topgrad, kern, mode="full", dilation=self.filter_dilation)
+        img = self.conv(topgrad, kern, mode="full", dilation=self.filter_dilation, num_groups=self.num_groups)
        if self.filter_flip:
            img = img[flip_filters]
        if any(p > 0 for p in pad):
@@ -2120,8 +2208,12 @@ class AbstractConv_gradInputs(BaseAbstractConv):
        kshp = input_shapes[0]
        topshp = input_shapes[1]
        imshp = self.imshp[:] if self.imshp is not None else [None] * (2 + self.convdim)
-        fallback_imshp = ([topshp[0], kshp[1]] +
-                          [node.inputs[2][i] for i in range(self.convdim)])
+        if self.num_groups > 1:
+            fallback_imshp = ([topshp[0], kshp[1] * self.num_groups] +
+                              [node.inputs[2][i] for i in range(self.convdim)])
+        else:
+            fallback_imshp = ([topshp[0], kshp[1]] +
+                              [node.inputs[2][i] for i in range(self.convdim)])
        imshp = [fallback_imshp[i] if imshp[i] is None else imshp[i]
                 for i in range(2 + self.convdim)]
        return [imshp]
@@ -2144,13 +2236,15 @@ class AbstractConv2d_gradInputs(AbstractConv_gradInputs):
                 border_mode="valid",
                 subsample=(1, 1),
                 filter_flip=True,
-                 filter_dilation=(1, 1)):
+                 filter_dilation=(1, 1),
+                 num_groups=1):
        super(AbstractConv2d_gradInputs, self).__init__(convdim=2,
                                                        imshp=imshp, kshp=kshp,
                                                        border_mode=border_mode,
                                                        subsample=subsample,
                                                        filter_flip=filter_flip,
-                                                        filter_dilation=filter_dilation)
+                                                        filter_dilation=filter_dilation,
+                                                        num_groups=num_groups)

    def grad(self, inp, grads):
        weights, top = inp[:2]
@@ -2159,14 +2253,16 @@ class AbstractConv2d_gradInputs(AbstractConv_gradInputs):
                                               self.border_mode,
                                               self.subsample,
                                               self.filter_flip,
-                                               self.filter_dilation)(
+                                               self.filter_dilation,
+                                               self.num_groups)(
                                                   bottom, top,
                                                   weights.shape[-2:])
        d_top = AbstractConv2d(self.imshp, self.kshp,
                               self.border_mode,
                               self.subsample,
                               self.filter_flip,
-                               self.filter_dilation)(bottom, weights)
+                               self.filter_dilation,
+                               self.num_groups)(bottom, weights)
        # Make sure that the broadcastable pattern of the inputs is used
        # for the gradients, even if the grad opts are not able to infer
        # that the dimensions are broadcastable.

--- a/theano/tensor/nnet/corr.py
+++ b/theano/tensor/nnet/corr.py
@@ -40,9 +40,11 @@ class BaseCorrMM(gof.OpenMPOp):
        Perform subsampling of the output (default: (1, 1)).
    filter_dilation
        Perform dilated correlation (default: (1,1))
+    num_groups
+        Perform grouped convolutions (default: 1)
    """
    check_broadcast = False
-    __props__ = ('border_mode', 'subsample', 'filter_dilation')
+    __props__ = ('border_mode', 'subsample', 'filter_dilation', 'num_groups')

    _direction = None

@@ -51,10 +53,11 @@ class BaseCorrMM(gof.OpenMPOp):
                                                ('DIRECTION_BACKPROP_INPUTS', 'backprop inputs')),  # 2
                             dH=int64, dW=int64,
                             dilH=int64, dilW=int64,
-                             padH=int64, padW=int64)
+                             padH=int64, padW=int64,
+                             num_groups=int64)

    def __init__(self, border_mode="valid", subsample=(1, 1),
-                 filter_dilation=(1, 1), openmp=None):
+                 filter_dilation=(1, 1), num_groups=1, openmp=None):
        super(BaseCorrMM, self).__init__(openmp=openmp)
        if isinstance(border_mode, integer_types):
            if border_mode < 0:
@@ -97,6 +100,9 @@ class BaseCorrMM(gof.OpenMPOp):
        if self._direction not in ["forward", "backprop weights", "backprop inputs"]:
            raise ValueError("_direction must be one of 'forward', "
                             "'backprop weights', 'backprop inputs'")
+        if num_groups < 1:
+            raise ValueError("Number of groups should be greater than 0")
+        self.num_groups = num_groups

    @property
    def pad(self):
@@ -124,11 +130,12 @@ class BaseCorrMM(gof.OpenMPOp):
    padW = property(lambda self: self.pad[1])

    def __str__(self):
-        return '%s{%s, %s, %s}' % (
+        return '%s{%s, %s, %s, %s}' % (
            self.__class__.__name__,
            self.border_mode,
            str(self.subsample),
-            str(self.filter_dilation))
+            str(self.filter_dilation),
+            str(self.num_groups))

    @staticmethod
    def as_common_dtype(in1, in2):
@@ -138,6 +145,11 @@ class BaseCorrMM(gof.OpenMPOp):
        dtype = theano.scalar.upcast(in1.dtype, in2.dtype)
        return in1.astype(dtype), in2.astype(dtype)

+    def __setstate__(self, d):  # unpickling hook: restore attributes from the saved state dict `d`
+        self.__dict__.update(d)
+        if not hasattr(self, 'num_groups'):  # state was saved without num_groups (attribute added by this change)
+            self.num_groups = 1  # fall back to the constructor default
+
    def c_support_code(self):
        ccodes = blas_headers.blas_header_text()
        if self.blas_type == 'openblas':
@@ -167,7 +179,7 @@ class BaseCorrMM(gof.OpenMPOp):

    def c_code_cache_version(self):
        # raise this whenever modifying any of the support_code_files
-        return (6, self.openmp, blas_header_version())
+        return (7, self.openmp, blas_header_version())

    def c_support_code_apply(self, node, nodename):
        # REMEMBER TO RAISE c_code_cache_version when changing any of
@@ -274,6 +286,7 @@ class BaseCorrMM(gof.OpenMPOp):
    int dilW = %(params)s->dilW;
    int padH = %(params)s->padH;
    int padW = %(params)s->padW;
+    int numgroups = %(params)s->num_groups;

    PyArrayObject * bottom = %(bottom)s;
    PyArrayObject * weights = %(weights)s;
@@ -386,7 +399,7 @@ class BaseCorrMM(gof.OpenMPOp):
        // output is weights: (num_filters, num_channels, height, width)
        // height and width: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1
        out_dim[0] = (npy_intp)PyArray_DIMS(top)[1];
-        out_dim[1] = (npy_intp)PyArray_DIMS(bottom)[1];
+        out_dim[1] = (npy_intp)PyArray_DIMS(bottom)[1] / numgroups;
        out_dim[2] = (npy_intp)kH;  // already inferred further above
        out_dim[3] = (npy_intp)kW;  // how convenient
        if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
@@ -409,7 +422,7 @@ class BaseCorrMM(gof.OpenMPOp):
        // output is bottom: (batchsize, num_channels, height, width)
        // height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
        out_dim[0] = (npy_intp)PyArray_DIMS(top)[0];
-        out_dim[1] = (npy_intp)PyArray_DIMS(weights)[1];
+        out_dim[1] = (npy_intp)PyArray_DIMS(weights)[1] * numgroups;
        out_dim[2] = (npy_intp)((%(height)s != -1) ? %(height)s : (PyArray_DIMS(top)[2] - 1) * dH + (PyArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH);
        out_dim[3] = (npy_intp)((%(width)s != -1) ? %(width)s : (PyArray_DIMS(top)[3] - 1) * dW + (PyArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW);
        if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
@@ -465,7 +478,7 @@ class BaseCorrMM(gof.OpenMPOp):
    }

    // Call corrMM code
-    out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dilH, dilW, padH, padW);
+    out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dilH, dilW, padH, padW, numgroups );
    if (out2==NULL){
       %(fail)s
    }
@@ -541,12 +554,14 @@ class CorrMM(BaseCorrMM):
        top, = grads
        d_bottom = CorrMM_gradInputs(self.border_mode,
                                     self.subsample,
-                                     self.filter_dilation)(weights, top,
-                                                           bottom.shape[-2:])
+                                     self.filter_dilation,
+                                     self.num_groups)(weights, top,
+                                                      bottom.shape[-2:])
        d_weights = CorrMM_gradWeights(self.border_mode,
                                       self.subsample,
-                                       self.filter_dilation)(bottom, top,
-                                                             weights.shape[-2:])
+                                       self.filter_dilation,
+                                       self.num_groups)(bottom, top,
+                                                        weights.shape[-2:])
        return d_bottom, d_weights


@@ -600,6 +615,7 @@ class CorrMM_gradWeights(BaseCorrMM):
        imshp = input_shape[0]
        topshp = input_shape[1]
        ssize, imshp = imshp[1], list(imshp[2:])
+        ssize = ssize // self.num_groups
        nkern, topshp = topshp[1], list(topshp[2:])
        height_width = node.inputs[-2:]
        if ((dH != 1) or (padH == -1)):
@@ -632,11 +648,13 @@ class CorrMM_gradWeights(BaseCorrMM):
        weights, = grads
        d_bottom = CorrMM_gradInputs(self.border_mode,
                                     self.subsample,
-                                     self.filter_dilation)(weights, top,
-                                                           bottom.shape[-2:])
+                                     self.filter_dilation,
+                                     self.num_groups)(weights, top,
+                                                      bottom.shape[-2:])
        d_top = CorrMM(self.border_mode,
                       self.subsample,
-                       self.filter_dilation)(bottom, weights)
+                       self.filter_dilation,
+                       self.num_groups)(bottom, weights)
        d_height_width = ((theano.gradient.DisconnectedType()(),) * 2
                          if len(inp) == 4 else ())
        return (d_bottom, d_top) + d_height_width
@@ -678,8 +696,12 @@ class CorrMM_gradInputs(BaseCorrMM):
            height_width = [as_tensor_variable(shape[0]).astype('int64'),
                            as_tensor_variable(shape[1]).astype('int64')]

-        broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[1],
-                         False, False]
+        if self.num_groups > 1:
+            broadcastable = [topgrad.type.broadcastable[0], False,
+                             False, False]
+        else:
+            broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[1],
+                             False, False]
        dtype = kern.type.dtype
        return Apply(self, [kern, topgrad] + height_width,
                     [TensorType(dtype, broadcastable)()])
@@ -698,6 +720,7 @@ class CorrMM_gradInputs(BaseCorrMM):
        kshp = input_shape[0]
        topshp = input_shape[1]
        ssize, kshp = kshp[1], list(kshp[2:])
+        ssize = ssize * self.num_groups
        bsize, topshp = topshp[0], list(topshp[2:])
        height_width = node.inputs[-2:]
        if padH == -1:
@@ -738,12 +761,14 @@ class CorrMM_gradInputs(BaseCorrMM):
        bottom, = grads
        d_weights = CorrMM_gradWeights(self.border_mode,
                                       self.subsample,
-                                       self.filter_dilation)(bottom,
-                                                             top,
-                                                             weights.shape[-2:])
+                                       self.filter_dilation,
+                                       self.num_groups)(bottom,
+                                                        top,
+                                                        weights.shape[-2:])
        d_top = CorrMM(self.border_mode,
                       self.subsample,
-                       self.filter_dilation)(bottom, weights)
+                       self.filter_dilation,
+                       self.num_groups)(bottom, weights)
        d_height_width = ((theano.gradient.DisconnectedType()(),) *
                          2 if len(inp) == 4 else ())
        return (d_weights, d_top) + d_height_width

--- a/theano/tensor/nnet/corr_gemm.c
+++ b/theano/tensor/nnet/corr_gemm.c
@@ -106,7 +106,8 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
                      const int dilH = 1,
                      const int dilW = 1,
                      const int padH = 0,
-                      const int padW = 0)
+                      const int padW = 0,
+                      const int numgroups = 1)
 {
    if (PyArray_NDIM(bottom) != 4)
    {
@@ -155,7 +156,7 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
    const int nFilters = PyArray_DIMS(weight)[0];
    const int kH = PyArray_DIMS(weight)[2];
    const int kW = PyArray_DIMS(weight)[3];
-    if (nChannels != PyArray_DIMS(weight)[1]) {
+    if (nChannels != (PyArray_DIMS(weight)[1] * numgroups)) {
        PyErr_SetString(PyExc_ValueError,
                "CorrMM images and kernel must have the same stack size\n");
        return NULL;
@@ -214,12 +215,16 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
    }

    // Define some useful variables
-    const int bottom_stride = PyArray_STRIDES(bottom)[0]/%(n_bytes)f;
-    const int top_stride = PyArray_STRIDES(top)[0]/%(n_bytes)f;
-    const int K_ = col_dim[1];
+    const int batch_bottom_stride = PyArray_STRIDES(bottom)[0]/%(n_bytes)f;
+    const int group_bottom_stride = (PyArray_STRIDES(bottom)[1] * nChannels / numgroups)/%(n_bytes)f;
+    const int batch_top_stride = PyArray_STRIDES(top)[0]/%(n_bytes)f;
+    const int group_top_stride = (PyArray_STRIDES(top)[1] * nFilters / numgroups)/%(n_bytes)f;
+    const int K_ = col_dim[1] / numgroups;
    const int N_ = col_dim[2];
-    const int col_stride = (K_ * N_);
-    const int M_ = nFilters;
+    const int col_stride = (K_ * N_ * numgroups);
+    const int group_col_stride = (K_ * N_);
+    const int group_weight_stride = (PyArray_STRIDES(weight)[0] * nFilters / numgroups)/%(n_bytes)f;
+    const int M_ = nFilters / numgroups;
    const %(c_float_type)s one = 1.0;
    const %(c_float_type)s zero = 0.0;
    char NTrans = 'N';
@@ -253,17 +258,19 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
        for (int n = 0; n < batchSize; ++n) {
            int tid = %(omp_get_thread_num)s;
            // First, im2col
-            im2col((%(float_type)s*)PyArray_DATA(bottom) + n * bottom_stride, nChannels, bottomHeight,
-                   bottomWidth, kH, kW, dilH, dilW, padH, padW, dH, dW,
+            im2col((%(float_type)s*)PyArray_DATA(bottom) + n * batch_bottom_stride, nChannels,
+                   bottomHeight,bottomWidth, kH, kW, dilH, dilW, padH, padW, dH, dW,
                   (%(float_type)s*)PyArray_DATA(col)+ tid * col_stride);
-            // Second, gemm
-            %(gemm)s(&NTrans, &NTrans,
-                   &N_, &M_, &K_,
-                   &one,
-                   (%(float_type)s*)PyArray_DATA(col)+ tid * col_stride, &N_,
-                   (%(float_type)s*)PyArray_DATA(weight), &K_,
-                   &zero,
-                   (%(float_type)s*)PyArray_DATA(top) + n * top_stride, &N_);
+            for ( int g = 0; g < numgroups; ++g){
+                // Second, gemm
+                %(gemm)s(&NTrans, &NTrans,
+                       &N_, &M_, &K_,
+                       &one,
+                       (%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride, &N_,
+                       (%(float_type)s*)PyArray_DATA(weight) + g * group_weight_stride, &K_,
+                       &zero,
+                       (%(float_type)s*)PyArray_DATA(top) + n * batch_top_stride + g * group_top_stride, &N_);
+            }
        }
        // Restore to previous blas threads
        %(blas_set_num_threads)s(blas_threads_saved);
@@ -304,7 +311,7 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
        output = weight;
        npy_intp weight_dim[2];
        weight_dim[0] = (npy_intp)max_threads;
-        weight_dim[1] = (npy_intp)(M_ * K_);
+        weight_dim[1] = (npy_intp)(M_ * K_ * numgroups);
        PyArrayObject* local_weight = (PyArrayObject*)PyArray_ZEROS(2,
                                   weight_dim, PyArray_TYPE(weight), 0);

@@ -326,21 +333,23 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
        for (int n = 0; n < batchSize; ++n) {
            int tid = %(omp_get_thread_num)s;
            // First, im2col
-            im2col((%(float_type)s*)PyArray_DATA(bottom) + n * bottom_stride, nChannels, bottomHeight,
-                   bottomWidth, kH, kW, dilH, dilW, padH, padW, dH, dW,
+            im2col((%(float_type)s*)PyArray_DATA(bottom) + n * batch_bottom_stride,
+                   nChannels, bottomHeight,bottomWidth, kH, kW, dilH, dilW, padH, padW, dH, dW,
                   (%(float_type)s*)PyArray_DATA(col)+ tid * col_stride);
-            // Second, gemm
-            // Note that we accumulate into weight. We do so by setting beta = 0
-            // for the first iteration and beta = 1 for subsequent ones. (This
-            // is faster than setting weight to all zeros before the loop.)
-            %(gemm)s(&Trans, &NTrans,
-                   &K_, &M_, &N_,
-                   &one,
-                   (%(float_type)s*)PyArray_DATA(col) + tid * col_stride, &N_,
-                   (%(float_type)s*)PyArray_DATA(top) + n * top_stride, &N_,
-                   (n == 0) ? &zero : &one,
-                   (%(float_type)s*)PyArray_DATA(local_weight) + 
-                   tid * weight_dim[1], &K_);
+            for(int g = 0; g < numgroups; ++g){
+                // Second, gemm
+                // Note that we accumulate into weight. We do so by setting beta = 0
+                // for the first iteration and beta = 1 for subsequent ones. (This
+                // is faster than setting weight to all zeros before the loop.)
+                %(gemm)s(&Trans, &NTrans,
+                       &K_, &M_, &N_,
+                       &one,
+                       (%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride, &N_,
+                       (%(float_type)s*)PyArray_DATA(top) + g * group_top_stride  + n * batch_top_stride, &N_,
+                       (n == 0) ? &zero : &one,
+                       (%(float_type)s*)PyArray_DATA(local_weight) + g * group_weight_stride + 
+                       tid * weight_dim[1], &K_);
+            }
        }
        // Restore to previous blas threads
        %(blas_set_num_threads)s(blas_threads_saved);
@@ -401,19 +410,21 @@ PyArrayObject* corrMM(PyArrayObject* bottom,
        %(blas_set_num_threads)s(1);
        %(omp_flags)s
        for (int n = 0; n < batchSize; ++n) {
-            // gemm into columns
            int tid = %(omp_get_thread_num)s;
-            %(gemm)s(&NTrans, &Trans,
-                   &N_, &K_, &M_,
-                   &one,
-                   (%(float_type)s*)PyArray_DATA(top) + n * top_stride, &N_,
-                   (%(float_type)s*)PyArray_DATA(weight), &K_,
-                   &zero,
-                   (%(float_type)s*)PyArray_DATA(col) + tid * col_stride, &N_);
+            for ( int g = 0;g < numgroups; ++g){
+                // gemm into columns
+                %(gemm)s(&NTrans, &Trans,
+                       &N_, &K_, &M_,
+                       &one,
+                       (%(float_type)s*)PyArray_DATA(top) + g * group_top_stride + n * batch_top_stride, &N_,
+                       (%(float_type)s*)PyArray_DATA(weight) + g * group_weight_stride, &K_,
+                       &zero,
+                       (%(float_type)s*)PyArray_DATA(col) + tid * col_stride + g * group_col_stride, &N_);
+            }
            // col2im back to the data
            col2im((%(float_type)s*)PyArray_DATA(col) + tid * col_stride, nChannels, bottomHeight, bottomWidth,
                   kH, kW, dilH, dilW, padH, padW,
-                   dH, dW, (%(float_type)s*)PyArray_DATA(bottom) + n * bottom_stride);
+                   dH, dW, (%(float_type)s*)PyArray_DATA(bottom) + n * batch_bottom_stride);
        }
        // Restore to previous blas threads
        %(blas_set_num_threads)s(blas_threads_saved);

--- a/theano/tensor/nnet/opt.py
+++ b/theano/tensor/nnet/opt.py
@@ -88,7 +88,9 @@ def local_abstractconv_gemm(node):
        kern = kern[:, :, ::-1, ::-1]
    rval = CorrMM(border_mode=node.op.border_mode,
                  subsample=node.op.subsample,
-                  filter_dilation=node.op.filter_dilation)(img, kern)
+                  filter_dilation=node.op.filter_dilation,
+                  num_groups=node.op.num_groups)(img, kern)
+
    copy_stack_trace(node.outputs[0], rval)

    return [rval]
@@ -133,7 +135,8 @@ def local_abstractconv_gradweight_gemm(node):

    rval = CorrMM_gradWeights(border_mode=node.op.border_mode,
                              subsample=node.op.subsample,
-                              filter_dilation=node.op.filter_dilation)(img, topgrad, shape)
+                              filter_dilation=node.op.filter_dilation,
+                              num_groups=node.op.num_groups)(img, topgrad, shape)
    copy_stack_trace(node.outputs[0], rval)

    # need to flip the kernel if necessary
@@ -190,8 +193,9 @@ def local_abstractconv_gradinputs_gemm(node):
        kern = kern[:, :, ::-1, ::-1]
    rval = CorrMM_gradInputs(border_mode=node.op.border_mode,
                             subsample=node.op.subsample,
-                             filter_dilation=node.op.filter_dilation)(kern, topgrad,
-                                                                      shape)
+                             filter_dilation=node.op.filter_dilation,
+                             num_groups=node.op.num_groups)(kern, topgrad,
+                                                            shape)
    copy_stack_trace(node.outputs[0], rval)

    return [rval]
@@ -238,6 +242,8 @@ def local_conv2d_cpu(node):
    if not node.op.filter_flip:
        # Not tested yet
        return None
+    if node.op.num_groups > 1:
+        return None

    rval = conv2d(img, kern,
                  node.op.imshp, node.op.kshp,
@@ -295,6 +301,8 @@ def local_conv2d_gradweight_cpu(node):
    if not node.op.filter_flip:
        # Not tested yet
        return
+    if node.op.num_groups > 1:
+        return None

    if node.op.border_mode == 'valid' and \
            (node.op.subsample != (1, 1)):
@@ -447,6 +455,8 @@ def local_conv2d_gradinputs_cpu(node):
    if not node.op.filter_flip:
        # Not tested yet
        return None
+    if node.op.num_groups > 1:
+        return None

    # Conv 3d implementation, needed when subsample > 2
    if node.op.border_mode == 'valid' and node.op.subsample != (1, 1):

--- a/theano/tensor/nnet/tests/test_abstract_conv.py
+++ b/theano/tensor/nnet/tests/test_abstract_conv.py
@@ -1699,3 +1699,158 @@ class TestConv2dGrads(unittest.TestCase):
                                                                                                  )
                        f_new = theano.function([self.x, self.output_grad_wrt], conv_wrt_w_out)
                        utt.assert_allclose(f_new(input_val, out_grad_val), f_old(input_val, filter_val, out_grad_val))
+
+
+class Grouped_conv_noOptim(unittest.TestCase):
+    """Check grouped 2D convolution ops against per-group reference results.
+
+    The forward op and both gradient ops are built with ``num_groups`` and
+    compared with the concatenation of ungrouped ``conv2d_corr*`` results
+    computed on each group's slice. Subclasses override the class attributes
+    below to exercise another implementation.
+    """
+    # Op classes instantiated by the tests.
+    conv2d = theano.tensor.nnet.abstract_conv.AbstractConv2d
+    conv2d_gradw = theano.tensor.nnet.abstract_conv.AbstractConv2d_gradWeights
+    conv2d_gradi = theano.tensor.nnet.abstract_conv.AbstractConv2d_gradInputs
+    # Op classes that must appear in the compiled graph.
+    conv2d_op = theano.tensor.nnet.abstract_conv.AbstractConv2d
+    conv2d_gradw_op = theano.tensor.nnet.abstract_conv.AbstractConv2d_gradWeights
+    conv2d_gradi_op = theano.tensor.nnet.abstract_conv.AbstractConv2d_gradInputs
+    mode = theano.Mode(optimizer=None)
+    # When True, the filters (or the computed filter gradient) are reversed with [:, :, ::-1, ::-1].
+    flip_filter = False
+    # When True, the gradient ops receive the full 4D shape instead of its last two dims.
+    is_dnn = False
+
+    def setUp(self):
+        """Skip when ``theano.config.cxx`` is empty, else define the test cases."""
+        if theano.config.cxx == "":
+            raise SkipTest("CorrMM needs cxx")
+        self.border_mode = 'valid'
+        self.subsample = (1, 1)
+        self.filter_dilation = (1, 1)
+        self.ref_mode = 'FAST_RUN'
+        # The lists below are zipped together by the tests: one entry per case.
+        self.num_groups = [3, 2, 4, 4]
+        self.img_shape = [(5, 6, 5, 5), (4, 4, 7, 5), (3, 8, 5, 3), (2, 4, 7, 7)]
+        self.kern_shape = [(6, 2, 3, 3), (6, 2, 5, 3), (4, 2, 3, 3), (4, 1, 3, 5)]
+        self.top_shape = [(5, 6, 3, 3), (4, 6, 3, 3), (3, 4, 3, 1), (2, 4, 5, 3)]
+
+    def test_fwd(self):
+        """Compare the grouped forward op with per-group references and verify its gradient."""
+        img_sym = theano.tensor.tensor4('img')
+        kern_sym = theano.tensor.tensor4('kern')
+        for imshp, kshp, groups in zip(self.img_shape, self.kern_shape, self.num_groups):
+            img = np.random.random(imshp).astype(theano.config.floatX)
+            kern = np.random.random(kshp).astype(theano.config.floatX)
+            # The reference splits the image along axis 1 and the kernel along axis 0.
+            split_imgs = np.split(img, groups, axis=1)
+            split_kern = np.split(kern, groups, axis=0)
+
+            grouped_conv_op = self.conv2d(border_mode=self.border_mode,
+                                          subsample=self.subsample,
+                                          filter_dilation=self.filter_dilation,
+                                          num_groups=groups)
+            grouped_kern = kern_sym[:, :, ::-1, ::-1] if self.flip_filter else kern_sym
+            grouped_conv_output = grouped_conv_op(img_sym, grouped_kern)
+            grouped_func = theano.function([img_sym, kern_sym], grouped_conv_output, mode=self.mode)
+            # The expected op class must be present in the compiled graph.
+            assert any(isinstance(node.op, self.conv2d_op)
+                       for node in grouped_func.maker.fgraph.toposort())
+            grouped_output = grouped_func(img, kern)
+
+            ref_conv_op = conv2d_corr(img_sym, kern_sym,
+                                      border_mode=self.border_mode,
+                                      subsample=self.subsample,
+                                      filter_dilation=self.filter_dilation)
+            ref_func = theano.function([img_sym, kern_sym], ref_conv_op, mode=self.ref_mode)
+            ref_concat_output = np.concatenate(
+                [ref_func(img_arr, kern_arr) for img_arr, kern_arr in zip(split_imgs, split_kern)],
+                axis=1)
+
+            utt.assert_allclose(grouped_output, ref_concat_output)
+
+            utt.verify_grad(grouped_conv_op, [img, kern], mode=self.mode, eps=1)
+
+    def test_gradweights(self):
+        """Compare the grouped weight-gradient op with per-group references and verify its gradient."""
+        img_sym = theano.tensor.tensor4('img')
+        top_sym = theano.tensor.tensor4('top')
+        for imshp, kshp, tshp, groups in zip(self.img_shape, self.kern_shape, self.top_shape, self.num_groups):
+            img = np.random.random(imshp).astype(theano.config.floatX)
+            top = np.random.random(tshp).astype(theano.config.floatX)
+            # The reference splits both the image and the top gradient along axis 1.
+            split_imgs = np.split(img, groups, axis=1)
+            split_top = np.split(top, groups, axis=1)
+            # Shape argument of the op: the full kernel shape if is_dnn, else its last two dims.
+            kshp_sym = tensor.as_tensor_variable(kshp if self.is_dnn else kshp[-2:])
+
+            grouped_convgrad_op = self.conv2d_gradw(border_mode=self.border_mode,
+                                                    subsample=self.subsample,
+                                                    filter_dilation=self.filter_dilation,
+                                                    num_groups=groups)
+            grouped_conv_output = grouped_convgrad_op(img_sym, top_sym, kshp_sym)
+            if self.flip_filter:
+                grouped_conv_output = grouped_conv_output[:, :, ::-1, ::-1]
+            grouped_func = theano.function([img_sym, top_sym], grouped_conv_output, mode=self.mode)
+            assert any(isinstance(node.op, self.conv2d_gradw_op)
+                       for node in grouped_func.maker.fgraph.toposort())
+            grouped_output = grouped_func(img, top)
+
+            ref_conv_op = conv2d_corr_gw(img_sym, top_sym, kshp,
+                                         border_mode=self.border_mode,
+                                         subsample=self.subsample,
+                                         filter_dilation=self.filter_dilation)
+            ref_func = theano.function([img_sym, top_sym], ref_conv_op, mode=self.ref_mode)
+            ref_concat_output = np.concatenate(
+                [ref_func(img_arr, top_arr) for img_arr, top_arr in zip(split_imgs, split_top)],
+                axis=0)
+
+            utt.assert_allclose(grouped_output, ref_concat_output)
+
+            def conv_gradweight(inputs_val, output_val):
+                return grouped_convgrad_op(inputs_val, output_val, kshp_sym)
+
+            utt.verify_grad(conv_gradweight, [img, top], mode=self.mode, eps=1)
+
+    def test_gradinputs(self):
+        """Compare the grouped input-gradient op with per-group references and verify its gradient."""
+        kern_sym = theano.tensor.tensor4('kern')
+        top_sym = theano.tensor.tensor4('top')
+        for imshp, kshp, tshp, groups in zip(self.img_shape, self.kern_shape, self.top_shape, self.num_groups):
+            kern = np.random.random(kshp).astype(theano.config.floatX)
+            top = np.random.random(tshp).astype(theano.config.floatX)
+            # The reference splits the kernel along axis 0 and the top gradient along axis 1.
+            split_kerns = np.split(kern, groups, axis=0)
+            split_top = np.split(top, groups, axis=1)
+            # One shape argument for the flipped and unflipped paths alike, so a
+            # subclass setting both flip_filter and is_dnn still passes the full shape.
+            imshp_sym = tensor.as_tensor_variable(imshp if self.is_dnn else imshp[-2:])
+
+            grouped_convgrad_op = self.conv2d_gradi(border_mode=self.border_mode,
+                                                    subsample=self.subsample,
+                                                    filter_dilation=self.filter_dilation,
+                                                    num_groups=groups)
+            grouped_kern = kern_sym[:, :, ::-1, ::-1] if self.flip_filter else kern_sym
+            grouped_conv_output = grouped_convgrad_op(grouped_kern, top_sym, imshp_sym)
+            grouped_func = theano.function([kern_sym, top_sym], grouped_conv_output, mode=self.mode)
+            assert any(isinstance(node.op, self.conv2d_gradi_op)
+                       for node in grouped_func.maker.fgraph.toposort())
+            grouped_output = grouped_func(kern, top)
+
+            ref_conv_op = conv2d_corr_gi(kern_sym, top_sym, imshp,
+                                         border_mode=self.border_mode,
+                                         subsample=self.subsample,
+                                         filter_dilation=self.filter_dilation)
+            ref_func = theano.function([kern_sym, top_sym], ref_conv_op, mode=self.ref_mode)
+            ref_concat_output = np.concatenate(
+                [ref_func(kern_arr, top_arr) for kern_arr, top_arr in zip(split_kerns, split_top)],
+                axis=1)
+
+            utt.assert_allclose(grouped_output, ref_concat_output)
+
+            def conv_gradinputs(filters_val, output_val):
+                return grouped_convgrad_op(filters_val, output_val, imshp_sym)
+
+            utt.verify_grad(conv_gradinputs, [kern, top], mode=self.mode, eps=1)
--- a/theano/tensor/nnet/tests/test_corr.py
+++ b/theano/tensor/nnet/tests/test_corr.py
@@ -10,6 +10,7 @@ import theano
 import theano.tensor as T
 from theano.tests import unittest_tools as utt
 from theano.tensor.nnet import corr, conv
+from theano.tensor.nnet.tests.test_abstract_conv import Grouped_conv_noOptim


 class TestCorr2D(utt.InferShapeTester):
@@ -416,6 +417,49 @@ class TestCorr2D(utt.InferShapeTester):
        self.validate((3, 2, 7, 5), (5, 2, 2, 3), 2, non_contiguous=True)


+class TestGroupCorr2d(Grouped_conv_noOptim):
+    """Run the grouped-convolution test suite on the CorrMM ops."""
+    # FAST_COMPILE is replaced by FAST_RUN; with any other configured mode,
+    # ``mode`` stays None.
+    mode = (theano.compile.get_mode("FAST_RUN")
+            if theano.config.mode == "FAST_COMPILE" else None)
+    conv2d = conv2d_op = corr.CorrMM
+    conv2d_gradw = conv2d_gradw_op = corr.CorrMM_gradWeights
+    conv2d_gradi = conv2d_gradi_op = corr.CorrMM_gradInputs
+    flip_filter = True
+    is_dnn = False
+
+    def test_graph(self):
+        """Compare one grouped CorrMM with per-group CorrMMs concatenated."""
+        n_groups = 3
+        img_sym = T.tensor4('bottom')
+        filt_sym = T.tensor4('kern')
+        # Draw the image values first, then the filter values.
+        img_val = np.random.rand(3, 6, 5, 5).astype(theano.config.floatX)
+        filt_val = np.random.rand(9, 2, 3, 3).astype(theano.config.floatX)
+
+        # Single op handling all the groups at once.
+        grouped_out = self.conv2d(num_groups=n_groups)(img_sym, filt_sym)
+        grouped_fn = theano.function([img_sym, filt_sym], grouped_out, mode=self.mode)
+
+        # Reference graph: one ungrouped op per slice of the image (axis 1)
+        # and of the filters (axis 0), joined along axis 1.
+        filt_step = filt_sym.shape[0] // n_groups
+        chan_step = img_sym.shape[1] // n_groups
+        per_group = []
+        for idx in range(n_groups):
+            img_part = img_sym[:, idx * chan_step:(idx + 1) * chan_step, :, :]
+            filt_part = filt_sym[idx * filt_step:(idx + 1) * filt_step, :, :, :]
+            per_group.append(self.conv2d()(img_part, filt_part))
+        reference_fn = theano.function([img_sym, filt_sym],
+                                       T.concatenate(per_group, axis=1),
+                                       mode=self.mode)
+
+        # Both graphs must agree on the same inputs.
+        utt.assert_allclose(grouped_fn(img_val, filt_val),
+                            reference_fn(img_val, filt_val))
+
+
 if __name__ == '__main__':

    t = TestCorr2D('setUp')