Merge pull request #2023 from stencilman/conv_gemm

Conv gemm non-square kernel support

Merge pull request #2023 from stencilman/conv_gemm
0037c724 · abergeron · 74e83a72 · b5e340ba · 0037c724 · 0037c724
--- a/doc/library/tensor/nnet/conv.txt
+++ b/doc/library/tensor/nnet/conv.txt
@@ -53,19 +53,18 @@ TODO: Give examples for how to use these things! They are pretty complicated.
      Also, there is restrictions on which shape are supported.
    - :func:`GpuCorrMM <theano.sandbox.cuda.blas.GpuCorrMM>`
      This is a GPU-only version of a correlation that computes correlations
-      as `caffe <https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu>`. 
+      as `caffe <https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu>`_.
      For each element in a batch, it first creates a 
-      Toeplitz<http://en.wikipedia.org/wiki/Toeplitz_matrix> matrix in a cuda kernel. 
+      `Toeplitz <http://en.wikipedia.org/wiki/Toeplitz_matrix>`_ matrix in a cuda kernel.
-      Then, it performs a `gemm` call to multiply this Toeplitz matrix and to the kernel. 
+      Then, it performs a ``gemm`` call to multiply this Toeplitz matrix and the kernel.
-      It need extra memory for this, which is the size of the Toeplitz matrix. Precisely, 
+      It need extra memory equal to the size of the Toeplitz matrix. Precisely, 
-      the dimensions of this Toeplitz matrix is equal to 
+      the dimensions of this 2D Toeplitz matrix is equal to
-      (no of channels * filter width * filter height, output width * output height).
+      ``(no of channels * filter width * filter height, output width * output height)``.
-      You can enable it for call to conv2d 2d by setting 'THEANO_FLAGS=optimizer_including=conv_gemm'
+      You can enable it for call to conv2d 2d by setting ``THEANO_FLAGS=optimizer_including=conv_gemm``
      in your environment. This is not enabled by default because it
-      uses some extra memory. It don't support strides for now and requires square kernels.
+      uses some extra memory. MM mean matrix multiply.
 .. autofunction:: theano.tensor.nnet.conv.conv2d
 .. autofunction:: theano.tensor.nnet.Conv3D.conv3D
 .. autofunction:: theano.tensor.nnet.conv3d2d.conv3d
 .. autofunction:: theano.sandbox.cuda.fftconv.conv2d_fft
-.. autofunction:: theano.sandbox.cuda.blas.GpuCorrMM
--- a/theano/sandbox/cuda/blas.py
+++ b/theano/sandbox/cuda/blas.py
@@ -501,16 +501,21 @@ gpu_ger_inplace = GpuGer(inplace=True)
 class GpuCorrMM(GpuOp):
-    """
+    """GPU correlation implementation using Matrix Multiply.
-    Author: Arjun Jain
-    Implement the caffe convolution
+    :note: It don't implement the grad. So you should use it by
+        enabling the Theano flag ``optimizer_including=conv_gemm`` and
+        use :func:`conv2d <theano.tensor.nnet.conv.conv2d>`.
    """
    def __init__(self, border_mode,
            subsample=(1, 1),
            pad=0):
        """
        :param border_mode: "valid" or "full"
-        :param subsample: not yet supported
+        :param subsample: the subsample operation applied on each output image.
+            Should be a tuple with 2 elements.
+            (sv, sh) is equivalent to GpuCorrMM(...)(...)[:,:,::sv, ::sh]
        :param pad: not yet supported
        """
        self.border_mode = border_mode
@@ -519,9 +524,6 @@ class GpuCorrMM(GpuOp):
        if pad != 0:
            raise NotImplementedError(
                "GpuCorrMM don't implement the pad parameter")
-        if subsample != (1, 1):
-            raise NotImplementedError(
-                "GpuCorrMM we don't implement the subsample parameter")
    def __eq__(self, other):
        return type(self) == type(other) \
@@ -557,7 +559,6 @@ class GpuCorrMM(GpuOp):
        return Apply(self, [img, kern], [CudaNdarrayType(broadcastable)()])
    def flops(self, inputs, outputs):
-        """ Useful with the hack in profilemode to print the MFlops"""
        images, kerns = inputs
        out, = outputs
        assert images[1] == kerns[1]
@@ -611,7 +612,9 @@ class GpuCorrMM(GpuOp):
    //Optional args
    int dx = %(dx)s;
    int dy = %(dy)s;
-    int pad = 0;
+    int padH = 0;
+    int padW = 0;
    CudaNdarray * img = %(img)s;
    CudaNdarray * kern = %(kern)s;
    CudaNdarray * out2 = NULL;
@@ -630,7 +633,9 @@ class GpuCorrMM(GpuOp):
    {
        logical_rows = CudaNdarray_HOST_DIMS(img)[2] + CudaNdarray_HOST_DIMS(kern)[2] - 1;
        logical_cols = CudaNdarray_HOST_DIMS(img)[3] + CudaNdarray_HOST_DIMS(kern)[3] - 1;
-        pad = CudaNdarray_HOST_DIMS(kern)[2] - 1;
+        padH = CudaNdarray_HOST_DIMS(kern)[2] - 1;
+        padW = CudaNdarray_HOST_DIMS(kern)[3] - 1;
    }
    out_dim[2] = ceil_intdiv(logical_rows, dx);
    out_dim[3] = ceil_intdiv(logical_cols, dy);
@@ -648,7 +653,7 @@ class GpuCorrMM(GpuOp):
    }
-    out2 = corrMM(%(img)s, %(kern)s, %(out)s, pad);
+    out2 = corrMM(%(img)s, %(kern)s, %(out)s, dx, dy, padH, padW);
    if (out2==NULL){
       %(fail)s
    }

--- a/theano/sandbox/cuda/caffe_common.hpp
+++ b/theano/sandbox/cuda/caffe_common.hpp
@@ -30,12 +30,6 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <cuda.h>
 #include <driver_types.h>  // cuda driver types
-// CUDA: grid stride looping
-#define CUDA_KERNEL_LOOP(i, n) \
-for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
-i < (n); \
-i += blockDim.x * gridDim.x)
 // CUDA: thread number configuration.
 // Use 1024 threads per block, which requires cuda sm_2x or above,
 // or fall back to attempt compatibility (best of luck to you).

--- a/theano/sandbox/cuda/conv_gemm.cu
+++ b/theano/sandbox/cuda/conv_gemm.cu
@@ -22,30 +22,44 @@ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
+// Reference code: https://github.com/torch/cunn/blob/master/SpatialConvolutionMM.cu
 #undef _GLIBCXX_ATOMIC_BUILTINS
 #include <Python.h>
 #include "cuda_ndarray.cuh"
 #include "caffe_common.hpp"
+// CUDA: grid stride looping
+#define CUDA_KERNEL_LOOP(i, n)                        \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
+       i < (n);                                       \
+       i += blockDim.x * gridDim.x)
+// Use 1024 threads per block, which requires cuda sm_2x or above
+const int CUDA_NUM_THREADS = 1024;
+// CUDA: number of blocks for threads.
+inline int GET_BLOCKS(const int N) {
+  return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
+}
 // Kernel for fast unfold+copy
 // (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu)
-// Reference code: https://github.com/torch/cunn/blob/master/SpatialConvolutionMM.cu
 __global__ void im2col_kernel(const int n, const float* data_im,
-                              const int height, const int width, const int ksize, const int pad,
+                              const int height, const int width, const int ksize_h, const int ksize_w, const int pad_h,
-                              const int stride, const int height_col, const int width_col,
+			      const int pad_w, const int stride_h, const int stride_w, const int height_col, const int width_col,
                              float* data_col) {
  CUDA_KERNEL_LOOP(index, n) {
    int w_out = index % width_col;
    index /= width_col;
    int h_out = index % height_col;
    int channel_in = index / height_col;
-    int channel_out = channel_in * ksize * ksize;
+    int channel_out = channel_in * ksize_h * ksize_w;
-    int h_in = h_out * stride - pad;
+    int h_in = h_out * stride_h - pad_h;
-    int w_in = w_out * stride - pad;
+    int w_in = w_out * stride_w - pad_w;
    data_col += (channel_out * height_col + h_out) * width_col + w_out;
    data_im += (channel_in * height + h_in) * width + w_in;
-    for (int i = 0; i < ksize; ++i) {
+    for (int i = 0; i < ksize_h; ++i) {
-      for (int j = 0; j < ksize; ++j) {
+      for (int j = 0; j < ksize_w; ++j) {
        int h = h_in + i;
        int w = w_in + j;
        *data_col = (h >= 0 && w >= 0 && h < height && w < width) ?
@@ -57,18 +71,17 @@ __global__ void im2col_kernel(const int n, const float* data_im,
 }
 void im2col(const float* data_im, const int channels,
-            const int height, const int width, const int ksize, const int pad,
+            const int height, const int width, const int ksize_h, const int ksize_w, const int pad_h,
-            const int stride, float* data_col) {
+	    const int pad_w, const int stride_h, const int stride_w, float* data_col) {
  // We are going to launch channels * height_col * width_col kernels, each
  // kernel responsible for copying a single-channel grid.
-  int height_col = (height + 2 * pad - ksize) / stride + 1;
+  int height_col = (height + 2 * pad_h - ksize_h) / stride_h + 1;
-  int width_col = (width + 2 * pad - ksize) / stride + 1;
+  int width_col = (width + 2 * pad_w - ksize_w) / stride_w + 1;
  int num_kernels = channels * height_col * width_col;
  // Launch
-  im2col_kernel <<<CAFFE_GET_BLOCKS(num_kernels), CAFFE_CUDA_NUM_THREADS>>> (
+  im2col_kernel <<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>> (
-     num_kernels, data_im, height, width, ksize, 
+                                                                 num_kernels, data_im, height, width, ksize_h, ksize_w, 
-     pad, stride, 
+                                                                 pad_h, pad_w, stride_h, stride_w, 
                                                                 height_col, width_col, data_col
                                                                 );
 }
@@ -79,7 +92,10 @@ void im2col(const float* data_im, const int channels,
 CudaNdarray* corrMM(const CudaNdarray *input, 
 				      CudaNdarray *weight,
 				      CudaNdarray *output,
-				      int padding = 0) 
+				      int dH = 1, 
+     				  int dW = 1,
+				      int padH = 0, 
+				      int padW = 0) 
 {
  	cublasStatus_t status;
@@ -94,30 +110,12 @@ CudaNdarray* corrMM(const CudaNdarray *input,
        PyErr_SetString(PyExc_ValueError, "required weight of 4D");
    }
-     // TODO: stride(dW, dH) and padding as function parameter
-     int dH = 1; 
-     int dW = 1;
     int kH = CudaNdarray_HOST_DIMS(weight)[2];
     int kW = CudaNdarray_HOST_DIMS(weight)[3];
     int nInputPlane = CudaNdarray_HOST_DIMS(input)[1]; 
     // filters: (number of filters, nInputPlane, rows, columns)
     int nOutputPlane = CudaNdarray_HOST_DIMS(weight)[0];
     long batchSize = CudaNdarray_HOST_DIMS(input)[0];
-     if (CudaNdarray_HOST_DIMS(input)[2] != CudaNdarray_HOST_DIMS(input)[3]){
-       PyErr_Format(PyExc_ValueError,
-                    "GpuCorrMM support only square images. Got %dx%d images\n",
-		    CudaNdarray_HOST_DIMS(input)[2],
-		    CudaNdarray_HOST_DIMS(input)[3]
-		    );
-       return NULL;
-     }
-     if (kW != kH){
-       PyErr_Format(PyExc_ValueError,
-                    "GpuCorrMM support only square kernel. Got %dx%d kernel\n",
-		    kW, kH
-		    );
-       return NULL;
-     }
     if (CudaNdarray_HOST_DIMS(input)[1]  != CudaNdarray_HOST_DIMS(weight)[1]){
       PyErr_SetString(PyExc_ValueError,
                    "GpuCorrMM images and kernel must have the same stack size\n"
@@ -126,18 +124,20 @@ CudaNdarray* corrMM(const CudaNdarray *input,
     }
     long inputHeight  = CudaNdarray_HOST_DIMS(input)[2];
     long inputWidth   = CudaNdarray_HOST_DIMS(input)[3];
-     long outputWidth  = (inputWidth + 2*padding - kW) / dW + 1;
+     long outputWidth  = (inputWidth + 2*padW - kW) / dW + 1;
-     long outputHeight = (inputHeight + 2*padding - kH) / dH + 1;
+     long outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
     // check output, size (batchSize, nOutputPlane,
     //		outputHeight, outputWidth);
     if (batchSize != CudaNdarray_HOST_DIMS(output)[0] ||
 	 nOutputPlane != CudaNdarray_HOST_DIMS(output)[1] ||
 	 outputHeight != CudaNdarray_HOST_DIMS(output)[2] ||
 	 outputWidth != CudaNdarray_HOST_DIMS(output)[3]){
-       PyErr_SetString(PyExc_ValueError,
+	   PyErr_Format(
-                    "GpuCorrMM outputs parameter don't have the good shape\n"
+	       PyExc_ValueError,
-		    );
+	       "GpuCorrMM outputs parameter don't have the good shape %d %d %d %d, %d %d %d %d\n",
+	       batchSize, nOutputPlane, outputHeight, outputWidth,
+	       CudaNdarray_HOST_DIMS(output)[0], CudaNdarray_HOST_DIMS(output)[1],
+	       CudaNdarray_HOST_DIMS(output)[2], CudaNdarray_HOST_DIMS(output)[3]);
       return NULL;
     }
     // Create temporary columns
@@ -158,7 +158,7 @@ CudaNdarray* corrMM(const CudaNdarray *input,
        // 1. Extract columns:
        im2col(
           input->devdata + elt*ip_stride,
-           nInputPlane, inputWidth, inputHeight, kW, padding, dW, 
+           nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, 
           columns->devdata
         );

--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -1286,13 +1286,12 @@ def local_gpu_downsample_factor_max_grad(node):
 @local_optimizer([GpuConv])
 def local_conv_gemm(node):
    if (isinstance(node.op, GpuConv) and
-        node.op.border_mode in ['full', 'valid'] and
+        node.op.border_mode in ['full', 'valid']):
-        node.op.subsample == (1, 1)):
        img, kern = node.inputs
        img = gpu_contiguous(img)
        kern = kern[:, :, ::-1, ::-1]
        kern = gpu_contiguous(kern)
-        return [GpuCorrMM(node.op.border_mode)(img, kern)]
+        return [GpuCorrMM(node.op.border_mode, node.op.subsample)(img, kern)]
 gpu_optimizer.register("conv_gemm", local_conv_gemm)

--- a/theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
+++ b/theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
@@ -114,6 +114,7 @@ def py_conv_scipy(img, kern, mode, subsample):
    for b in xrange(out.shape[0]):
        for k in xrange(out.shape[1]):
            for s in xrange(img.shape[1]):
+                #convolve2d or correlate
                out[b, k, :, :] += convolve2d(img[b, s, :, :],
                                  kern[k, s, :, :],
                                  mode)
@@ -261,7 +262,6 @@ def exec_conv(version, shapes, verbose, random, mode,
    failed_version = set()
    failed_id = []
-    # I put -1 in case we forget to add version in the test to.
    for ver in version:
        for id, (ishape, kshape, subshape,
                 istride, kstride) in enumerate(shapes):
@@ -615,7 +615,7 @@ def test_valid_9_10():
              print_=print_, ones=ones, rtol=1.1e-5)
-def test_valid():
+def test_valid(conv_gemm=False):
    seed_rng()
    shapes = get_valid_shapes()
@@ -624,7 +624,6 @@ def test_valid():
    # I put -2 to test the reference version.
    version = [-2, -1, 6]
    verbose = 0
-#    version=[1]
    random = True
    print_ = False
@@ -632,26 +631,25 @@ def test_valid():
    if ones:
        random = False
-#    exec_conv(version, shapes, verbose, random, 'valid',
+    if conv_gemm:
-#              print_=print_, ones=ones, rtol=1.1e-5)
+        # Test the GpuCorrMM version
        mode = theano_mode.including("conv_gemm")
+        cls = cuda.blas.GpuCorrMM
-    version = [-1]
+        version = [-1]  # dummy version; not used by GpuCorrMM so one version is enough
-    # Remove case not supported
        # Add tests with strided inputs by still square images and filters.
        shapes += get_shapes2(scales_img=(2, 2), img_stride=(2, 2))
        shapes += get_shapes2(scales_kern=(2, 2), kern_stride=(2, 2))
-    # Keep only tests with square images and filters even with inputs strides
+    else:
-    shapes = [shp for shp in shapes if (
+        mode = cls = None
-        shp[0][2]/shp[3][0] == shp[0][3]/shp[3][1] and
-        shp[1][2]/shp[4][0] == shp[1][3]/shp[4][1])]
    exec_conv(version, shapes, verbose, random, 'valid',
              print_=print_, ones=ones, rtol=1.1e-5,
-              theano_mode=mode, cls=cuda.blas.GpuCorrMM)
+              theano_mode=mode, cls=cls)
+def test_gemm_valid():
+    test_valid(conv_gemm=True)
-def test_full():
+def test_full(conv_gemm=False):
    seed_rng()
    shapes = get_basic_shapes()
    shapes += get_shapes2()
@@ -708,24 +706,24 @@ def test_full():
 #    shapes=shapes[:277]
    version = [-2, -1, 0, 1, 2, 3, 4, 5]
    verbose = 0
-#    version=[4]
    random = True
-#    exec_conv(version, shapes, verbose, random, 'full')
+    if conv_gemm:
        # Test the GpuCorrMM version
        mode = theano_mode.including("conv_gemm")
+        cls = cuda.blas.GpuCorrMM
-    shapes = [shp for shp in shapes if shp[1][2] == shp[1][3]]
+        version = [-1]  # dummy version; not used by GpuCorrMM so one version is enough
-    shapes = [shp for shp in shapes if shp[0][2] == shp[0][3]]
+    else:
-    shapes = shapes[0:10]
+        mode = cls = None
    exec_conv(version, shapes, verbose, random, 'full',
-              theano_mode=mode, cls=cuda.blas.GpuCorrMM)
+              theano_mode=mode, cls=cls)
+def test_gemm_full():
+    test_full(conv_gemm=True)
-def test_subsample():
+def test_subsample(conv_gemm=False):
    seed_rng()
-    # implement when
    shapes = [((1, 1, 1, 1), (1, 1, 1, 1), (1, 1), (1, 1), (1, 1)),
              ((1, 1, 1, 1), (1, 1, 1, 1), (2, 2), (1, 1), (1, 1)),
              ((4, 2, 10, 10), (3, 2, 2, 2), (1, 3), (1, 1), (1, 1)),
@@ -747,10 +745,23 @@ def test_subsample():
    if ones:
        random = False
+    if conv_gemm:
+        # Test the GpuCorrMM version
+        mode = theano_mode.including("conv_gemm")
+        cls = cuda.blas.GpuCorrMM
+        version_valid = version_full = [-1]  # dummy version; not used by GpuCorrMM so one version is enough
+    else:
+        mode = cls = None
    exec_conv(version_valid, shapes, verbose, random, 'valid',
-              print_=print_, ones=ones)
+              print_=print_, ones=ones,
+              theano_mode=mode, cls=cls)
    exec_conv(version_full, shapes, verbose, random, 'full',
-              print_=print_, ones=ones)
+              print_=print_, ones=ones,
+              theano_mode=mode, cls=cls)
+def test_gemm_subsample():
+    test_subsample(conv_gemm=True)
 class TestConv2DGPU(unittest.TestCase):
@@ -825,23 +836,28 @@ class TestConv2DGPU(unittest.TestCase):
-def test_gemm():
+def test_gemm_directly():
    """
    input: (batch size, channels, rows, columns)
    filters: (number of filters, channels, rows, columns)
    """
-    for mode in ['valid', 'full']:
+    for mode in ['full', 'valid']:
        print 'Testing mode: ' + mode
        for bs in range(1, 5):
            for ch in range(1,4):
                for nf in range(1,4):
-                    for rImg in range(5, 9):
+                    for rImg1 in range(5, 9):
-                        for rFlt in range(2, 4):
+                        for rImg2 in range(5, 9):
-                            ishape = (bs, ch, rImg, rImg)
+                            for rFlt1 in range(2, 4):
-                            kshape = (nf, ch, rFlt, rFlt)
+                                for rFlt2 in range(2, 4):
+                                    for subsx in range(1, 3):
+                                        for subsy in range(1, 3):
+                                            ishape = (bs, ch, rImg1, rImg2)
+                                            kshape = (nf, ch, rFlt1, rFlt2)
                                            print "ishape: ", ishape
                                            print "kshape: ", kshape 
-                            subsample = (1, 1)
+                                            subsample = (subsx, subsy)
+                                            print "subsample: ", subsample 
                                            npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32')
                                            npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32')
@@ -849,24 +865,16 @@ def test_gemm():
                                            i = cuda_tensor4()
                                            k = cuda_tensor4()
-                            t2 = None
-                            t0 = time.time()
                                            cpuval = py_conv(npy_img, npy_kern, mode, subsample)
-                            t1 = time.time()
+                                            op = theano.sandbox.cuda.blas.GpuCorrMM(border_mode=mode, \
+                                                    subsample=subsample)(i, k)
-                            op = theano.sandbox.cuda.blas.GpuCorrMM(border_mode=mode)(i, k)
                                            f = theano.function([i, k], op, mode=theano_mode)
-                            for k in range(npy_kern.shape[0]):
+                                            npy_kern = npy_kern[:,:,::-1,::-1]
-                                for s in range(npy_kern.shape[1]):
-                                    npy_kern[k,s,:,:] = numpy.rot90(npy_kern[k,s,:,:], 2)
                                            gpuval = f(npy_img, npy_kern)
-                            t2 = time.time()
                                            gpuval = numpy.asarray(gpuval)
                                            rval = numpy.allclose(cpuval, gpuval, rtol=1e-4)
                                            assert (rval == True)

--- a/theano/tensor/nnet/Conv3D.py
+++ b/theano/tensor/nnet/Conv3D.py
@@ -40,7 +40,9 @@ from theano.gradient import grad_undefined
 #the output function is only defined when dr, dc, dt are natural numbers.
 class Conv3D(theano.Op):
-    """ 3D "convolution" of multiple filters on a minibatch (does not flip the kernel, moves kernel with a user specified stride) """
+    """ 3D `convolution` of multiple filters on a minibatch
+        :note: does not flip the kernel, moves kernel with a user specified stride
+    """
    def __eq__(self,other):
        return type(self) == type(other)