add support for grouped convolution in GpuCorrMM

1765cf40 · Ubuntu · Mohammed Affan · 816cdaf6 · 1765cf40 · 1765cf40
--- a/theano/gpuarray/blas.py
+++ b/theano/gpuarray/blas.py
@@ -489,11 +489,11 @@ class BaseGpuCorrMM(CGpuKernelBase):
        Perform subsampling of the input, also known as dilation (default: (1, 1)).
    """
    check_broadcast = False
-    __props__ = ('border_mode', 'subsample', 'filter_dilation')
+    __props__ = ('border_mode', 'subsample', 'filter_dilation', 'num_groups')
    _f16_ok = True
    def __init__(self, border_mode="valid", subsample=(1, 1),
-                 filter_dilation=(1, 1)):
+                 filter_dilation=(1, 1), num_groups=1):
        if isinstance(border_mode, integer_types):
            border_mode = (border_mode, border_mode)
        if isinstance(border_mode, tuple):
@@ -512,6 +512,9 @@ class BaseGpuCorrMM(CGpuKernelBase):
            raise ValueError("filter_dilation must have two elements")
        self.subsample = tuple(subsample)
        self.filter_dilation = tuple(filter_dilation)
+        if num_groups < 1:
+            raise ValueError("Number of groups should be greater than 0")
+        self.num_groups = num_groups
        CGpuKernelBase.__init__(self, ['corr_gemm.c'])
    @property
@@ -521,11 +524,12 @@ class BaseGpuCorrMM(CGpuKernelBase):
        return (0, 0)
    def __str__(self):
-        return '%s{%s, %s, %s}' % (
+        return '%s{%s, %s, %s, %s}' % (
            self.__class__.__name__,
            self.border_mode,
            str(self.subsample),
-            str(self.filter_dilation))
+            str(self.filter_dilation),
+            str(self.num_groups))
    def flops(self, inp, outp):
        """
@@ -600,6 +604,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
        """
        dH, dW = self.subsample
        dilH, dilW = self.filter_dilation
+        numgroups = self.num_groups
        if self.border_mode == "half":
            padH = padW = -1
        elif self.border_mode == "full":
@@ -660,6 +665,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
    size_t dilW = %(dilW)s;
    int padH = %(padH)s;
    int padW = %(padW)s;
+    int numgroups = %(numgroups)s;
    PyGpuArrayObject * bottom = %(bottom)s;
    PyGpuArrayObject * weights = %(weights)s;
@@ -759,7 +765,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
        // output is weights: (num_filters, num_channels, height, width)
        // height and width: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1
        out_dim[0] = PyGpuArray_DIMS(top)[1];
-        out_dim[1] = PyGpuArray_DIMS(bottom)[1];
+        out_dim[1] = PyGpuArray_DIMS(bottom)[1] / numgroups;
        out_dim[2] = kH;  // already inferred further above
        out_dim[3] = kW;  // how convenient
        out_typecode = top->ga.typecode;
@@ -783,7 +789,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
        // output is bottom: (batchsize, num_channels, height, width)
        // height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
        out_dim[0] = PyGpuArray_DIMS(top)[0];
-        out_dim[1] = PyGpuArray_DIMS(weights)[1];
+        out_dim[1] = PyGpuArray_DIMS(weights)[1] * numgroups;
        out_dim[2] = (%(height)s != -1) ? %(height)s : (PyGpuArray_DIMS(top)[2] - 1) * dH + (PyGpuArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH;
        out_dim[3] = (%(width)s != -1) ? %(width)s : (PyGpuArray_DIMS(top)[3] - 1) * dW + (PyGpuArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW;
        out_typecode = top->ga.typecode;
@@ -827,7 +833,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
    }
    // Call GPU code
-    out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dilH, dilW, padH, padW);
+    out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dilH, dilW, padH, padW, numgroups);
    if (out2==NULL){
       %(fail)s
    }
@@ -883,9 +889,9 @@ class GpuCorrMM(BaseGpuCorrMM):
    """
    def __init__(self, border_mode="valid",
                 subsample=(1, 1),
-                 filter_dilation=(1, 1)):
+                 filter_dilation=(1, 1), num_groups=1):
        super(GpuCorrMM, self).__init__(border_mode, subsample,
-                                        filter_dilation)
+                                        filter_dilation, num_groups)
    def make_node(self, img, kern):
        ctx_name = infer_context_name(img, kern)
@@ -914,11 +920,13 @@ class GpuCorrMM(BaseGpuCorrMM):
        top = gpu_contiguous(top)
        d_bottom = GpuCorrMM_gradInputs(self.border_mode,
                                        self.subsample,
-                                        self.filter_dilation)(
+                                        self.filter_dilation,
+                                        self.num_groups)(
            weights, top, bottom.shape[-2:])
        d_weights = GpuCorrMM_gradWeights(self.border_mode,
                                          self.subsample,
-                                          self.filter_dilation)(
+                                          self.filter_dilation,
+                                          self.num_groups)(
            bottom, top, weights.shape[-2:])
        return d_bottom, d_weights
@@ -936,10 +944,11 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM):
    def __init__(self, border_mode="valid",
                 subsample=(1, 1),
-                 filter_dilation=(1, 1)):
+                 filter_dilation=(1, 1),
+                 num_groups=1):
        super(GpuCorrMM_gradWeights, self).__init__(border_mode,
                                                    subsample,
-                                                    filter_dilation)
+                                                    filter_dilation, num_groups)
    def make_node(self, img, topgrad, shape=None):
        ctx_name = infer_context_name(img, topgrad)
@@ -978,11 +987,12 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM):
        weights = gpu_contiguous(weights)
        d_bottom = GpuCorrMM_gradInputs(self.border_mode,
                                        self.subsample,
-                                        self.filter_dilation)(weights,
+                                        self.filter_dilation,
-                                                              top,
+                                        self.num_groups)(weights,
-                                                              bottom.shape[-2:])
+                                                         top,
+                                                         bottom.shape[-2:])
        d_top = GpuCorrMM(
-            self.border_mode, self.subsample, self.filter_dilation)(bottom, weights)
+            self.border_mode, self.subsample, self.filter_dilation, self.num_groups)(bottom, weights)
        d_height_width = (
            theano.gradient.DisconnectedType()(),
            ) * 2 if len(inp) == 4 else ()
@@ -1008,9 +1018,10 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
    def __init__(self, border_mode="valid",
                 subsample=(1, 1),
-                 filter_dilation=(1, 1)):
+                 filter_dilation=(1, 1),
+                 num_groups=1):
        super(GpuCorrMM_gradInputs, self).__init__(border_mode, subsample,
-                                                   filter_dilation)
+                                                   filter_dilation, num_groups)
    def make_node(self, kern, topgrad, shape=None):
        ctx_name = infer_context_name(kern, topgrad)
@@ -1029,8 +1040,12 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
            assert shape[0].ndim == 0
            assert shape[1].ndim == 0
-        broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[1],
+        if self.num_groups > 1:
-                         False, False]
+            broadcastable = [topgrad.type.broadcastable[0], False,
+                             False, False]
+        else:
+            broadcastable = [topgrad.type.broadcastable[0], kern.type.broadcastable[1],
+                             False, False]
        return Apply(self, [kern, topgrad] + height_width, [GpuArrayType(dtype=topgrad.dtype,
                                                                         context_name=ctx_name,
                                                                         broadcastable=broadcastable)()])
@@ -1048,12 +1063,14 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
        bottom = gpu_contiguous(bottom)
        d_weights = GpuCorrMM_gradWeights(self.border_mode,
                                          self.subsample,
-                                          self.filter_dilation)(bottom,
+                                          self.filter_dilation,
-                                                                top,
+                                          self.num_groups)(bottom,
-                                                                weights.shape[-2:])
+                                                           top,
+                                                           weights.shape[-2:])
        d_top = GpuCorrMM(self.border_mode,
                          self.subsample,
-                          self.filter_dilation)(bottom, weights)
+                          self.filter_dilation,
+                          self.num_groups)(bottom, weights)
        d_height_width = (
            theano.gradient.DisconnectedType()(),
            ) * 2 if len(inp) == 4 else ()

--- a/theano/gpuarray/corr_gemm.c
+++ b/theano/gpuarray/corr_gemm.c
@@ -348,7 +348,8 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
                         const size_t dilH = 1,
                         const size_t dilW = 1,
                         const size_t padH = 0,
-                         const size_t padW = 0)
+                         const size_t padW = 0,
+                         const size_t numgroups = 1)
 {
    if (PyGpuArray_NDIM(bottom) != 4)
    {
@@ -411,8 +412,15 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
    const size_t nFilters = PyGpuArray_DIMS(weight)[0];
    const size_t kH = PyGpuArray_DIMS(weight)[2];
    const size_t kW = PyGpuArray_DIMS(weight)[3];
-    if (nChannels != PyGpuArray_DIMS(weight)[1]) {
+    if (nChannels != (PyGpuArray_DIMS(weight)[1] * numgroups)) {
        PyErr_SetString(PyExc_ValueError,
                "GpuCorrMM images and kernel must have the same stack size\n");
        return NULL;
    }
+    // Each group owns nFilters / numgroups filters; reject a remainder so no
+    // filter is silently dropped by the integer division further down.
+    if ((nFilters % numgroups) != 0) {
+        PyErr_SetString(PyExc_ValueError,
+                "GpuCorrMM the number of filters must be divisible by the number of groups\n");
+        return NULL;
+    }
@@ -469,11 +470,15 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
    }
    // Define some useful variables
-    const size_t bottom_stride = PyGpuArray_STRIDES(bottom)[0] / gpuarray_get_elsize(bottom->ga.typecode);
+    const size_t batch_bottom_stride = PyGpuArray_STRIDES(bottom)[0] / gpuarray_get_elsize(bottom->ga.typecode);
-    const size_t top_stride = PyGpuArray_STRIDES(top)[0] / gpuarray_get_elsize(top->ga.typecode);
+    const size_t batch_top_stride = PyGpuArray_STRIDES(top)[0] / gpuarray_get_elsize(top->ga.typecode);
-    const size_t K_ = col_dim[0];
+    const size_t group_bottom_stride = (PyGpuArray_STRIDES(bottom)[1] * nChannels / numgroups) / gpuarray_get_elsize(bottom->ga.typecode);
+    const size_t group_top_stride = (PyGpuArray_STRIDES(top)[1] * nFilters / numgroups) / gpuarray_get_elsize(top->ga.typecode);
+    const size_t group_weight_stride = (PyGpuArray_STRIDES(weight)[0] * nFilters / numgroups) / gpuarray_get_elsize(weight->ga.typecode);
+    const size_t K_ = col_dim[0] / numgroups;
    const size_t N_ = col_dim[1];
-    const size_t M_ = nFilters;
+    const size_t group_col_stride = (K_ * N_);
+    const size_t M_ = nFilters / numgroups;
    PyGpuArrayObject *output;
    if (direction == 0) {  // forward pass
@@ -493,21 +498,23 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
        // Iterate over batch
        for (size_t n = 0; n < batchSize; n++) {
            // First, im2col
-	  err = im2col(&bottom->ga, n * bottom_stride,
+            err = im2col(&bottom->ga, n * batch_bottom_stride,
-		       nChannels, bottomHeight,
+                         nChannels, bottomHeight,
-		       bottomWidth, kH, kW, dilH, dilW,
+                         bottomWidth, kH, kW, dilH, dilW,
-		       padH, padW, dH, dW, &col->ga);
+                         padH, padW, dH, dW, &col->ga);
            if (err != GA_NO_ERROR) {
                Py_DECREF(col);
                return NULL;
            }
            // Second, gemm
-            err = rgemm(cb_fortran, cb_no_trans, cb_no_trans,
-                        N_, M_, K_, 1,
-                        &col->ga, 0, N_,
-                        &weight->ga, 0, K_,
-                        0,
-                        &top->ga, n * top_stride, N_);
+            for (size_t g = 0; g < numgroups; g++) {
+                err = rgemm(cb_fortran, cb_no_trans, cb_no_trans, N_, M_, K_, 1,
+                            &col->ga, g * group_col_stride, N_,
+                            &weight->ga, g * group_weight_stride, K_,
+                            0,
+                            &top->ga, n * batch_top_stride + g * group_top_stride, N_);
+                if (err != GA_NO_ERROR) break;  // keep the first failure for the check below
+            }
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorrMM forward encountered an error running gemm: %d", err);
@@ -533,7 +540,7 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
        // Iterate over batch
        for (size_t n = 0; n < batchSize; n++) {
            // First, im2col
-            err = im2col(&bottom->ga, n * bottom_stride,
+            err = im2col(&bottom->ga, n * batch_bottom_stride,
                         nChannels, bottomHeight,
                         bottomWidth, kH, kW, dilH, dilW,
                         padH, padW, dH, dW, &col->ga);
@@ -545,12 +552,14 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
            // Note that we accumulate into weight. We do so by setting beta = 0
            // for the first iteration and beta = 1 for subsequent ones. (This
            // is faster than setting weight to all zeros before the loop.)
-            err = rgemm(cb_fortran, cb_trans, cb_no_trans,
-                        K_, M_, N_, 1,
-                        &col->ga, 0, N_,
-                        &top->ga, n * top_stride, N_,
-                        (n == 0) ? 0 : 1,
-                        &weight->ga, 0, K_);
+            for (size_t g = 0; g < numgroups; g++) {
+                err = rgemm(cb_fortran, cb_trans, cb_no_trans, K_, M_, N_, 1,
+                            &col->ga, g * group_col_stride, N_,
+                            &top->ga, n * batch_top_stride + g * group_top_stride, N_,
+                            (n == 0) ? 0 : 1,
+                            &weight->ga, g * group_weight_stride, K_);
+                if (err != GA_NO_ERROR) break;  // keep the first failure for the check below
+            }
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorrMM grad weights encountered an error running gemm: %d", err);
@@ -575,13 +584,15 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
        // full convolution: gemm, then col2im
        // Iterate over batch
        for (size_t n = 0; n < batchSize; n++) {
-          // gemm into columns
-          err = rgemm(cb_fortran, cb_no_trans, cb_trans,
-                      N_, K_, M_, 1,
-                      &top->ga, n * top_stride, N_,
-                      &weight->ga, 0, K_,
-                      0,
-                      &col->ga, 0, N_);
+            // gemm into columns, one gemm per group
+            for (size_t g = 0; g < numgroups; g++) {
+                err = rgemm(cb_fortran, cb_no_trans, cb_trans, N_, K_, M_, 1,
+                            &top->ga, n * batch_top_stride + g * group_top_stride, N_,
+                            &weight->ga, g * group_weight_stride, K_,
+                            0,
+                            &col->ga, g * group_col_stride, N_);
+                if (err != GA_NO_ERROR) break;  // keep the first failure for the check below
+            }
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorrMM grad inputs encountered an error running gemm: %d", err);
@@ -591,7 +602,7 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
            // col2im back to the data
            err = col2im(&col->ga, nChannels, bottomHeight, bottomWidth,
                         kH, kW, dilH, dilW, padH, padW,
-                         dH, dW, &bottom->ga, n * bottom_stride);
+                         dH, dW, &bottom->ga, n * batch_bottom_stride);
            if (err != GA_NO_ERROR) {
                Py_DECREF(col);
                return NULL;

--- a/theano/gpuarray/opt.py
+++ b/theano/gpuarray/opt.py
@@ -1509,7 +1509,8 @@ def local_abstractconv_gemm(node):
    border_mode = node.op.border_mode
    subsample = node.op.subsample
    filter_dilation = node.op.filter_dilation
-    if ((border_mode == 'full') and (subsample == (1, 1))):
+    if ((border_mode == 'full') and (subsample == (1, 1)) and node.op.num_groups == 1):
        if not node.op.filter_flip:
            kern = kern[:, :, ::-1, ::-1]
        # need to dimshuffle the kernel for full convolution
@@ -1526,8 +1527,9 @@ def local_abstractconv_gemm(node):
        # By default use GpuCorrMM
        rval = GpuCorrMM(border_mode,
                         subsample,
-                         filter_dilation)(gpu_contiguous(img),
+                         filter_dilation,
-                                          gpu_contiguous(kern))
+                         node.op.num_groups)(gpu_contiguous(img),
+                                             gpu_contiguous(kern))
        # call GpuCorrMM_gradWeights if good
        # (the latter is faster if batchsize * kernelHeight * kernelWidth
@@ -1645,7 +1647,8 @@ def local_abstractconv_gradweights_gemm(node):
    rval = GpuCorrMM_gradWeights(border_mode=node.op.border_mode,
                                 subsample=node.op.subsample,
-                                 filter_dilation=node.op.filter_dilation)(
+                                 filter_dilation=node.op.filter_dilation,
+                                 num_groups=node.op.num_groups)(
        gpu_contiguous(img), gpu_contiguous(topgrad), shape)
    if node.op.filter_flip:
        rval = rval[:, :, ::-1, ::-1]
@@ -1689,7 +1692,8 @@ def local_abstractconv_gradinputs_gemm(node):
    rval = GpuCorrMM_gradInputs(border_mode=node.op.border_mode,
                                subsample=node.op.subsample,
-                                filter_dilation=node.op.filter_dilation)(
+                                filter_dilation=node.op.filter_dilation,
+                                num_groups=node.op.num_groups)(
        gpu_contiguous(kern), gpu_contiguous(topgrad), shape)
    return [rval]