testgroup / pytensor

Commit 85b8a90f
authored Aug 07, 2014 by Arjun Jain
Support for non-square images and kernels
parent 4c55bc4b

Showing 4 changed files with 89 additions and 89 deletions
theano/sandbox/cuda/blas.py  (+7, -3)
theano/sandbox/cuda/caffe_common.hpp  (+0, -6)
theano/sandbox/cuda/conv_gemm.cu  (+42, -42)
theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py  (+40, -38)
theano/sandbox/cuda/blas.py
@@ -606,7 +606,9 @@ class GpuCorrMM(GpuOp):
         //Optional args
         int dx = %(dx)s;
         int dy = %(dy)s;
-        int pad = 0;
+        int padH = 0;
+        int padW = 0;
         CudaNdarray * img = %(img)s;
         CudaNdarray * kern = %(kern)s;
         CudaNdarray * out2 = NULL;
@@ -640,7 +642,9 @@ class GpuCorrMM(GpuOp):
         {
             logical_rows = CudaNdarray_HOST_DIMS(img)[2] + CudaNdarray_HOST_DIMS(kern)[2] - 1;
             logical_cols = CudaNdarray_HOST_DIMS(img)[3] + CudaNdarray_HOST_DIMS(kern)[3] - 1;
-            pad = CudaNdarray_HOST_DIMS(kern)[2] - 1;
+            padH = CudaNdarray_HOST_DIMS(kern)[2] - 1;
+            padW = CudaNdarray_HOST_DIMS(kern)[3] - 1;
         }
         out_dim[2] = ceil_intdiv(logical_rows, dx);
         out_dim[3] = ceil_intdiv(logical_cols, dy);
@@ -658,7 +662,7 @@ class GpuCorrMM(GpuOp):
         }
-        out2 = corrMM(%(img)s, %(kern)s, %(out)s, pad);
+        out2 = corrMM(%(img)s, %(kern)s, %(out)s, padH, padW);
         if (out2==NULL){
             %(fail)s
         }
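The upshot of the blas.py change: GpuCorrMM tracks separate row and column padding (padH, padW) instead of a single pad, so border_mode='full' now works for rectangular kernels. Below is a minimal Python sketch of the resulting output-shape arithmetic; the function name is mine, and the 'valid' branch is an assumption based on standard convolution semantics (only the 'full' branch appears in the hunk above).

    def gpu_corr_mm_out_shape(ishape, kshape, mode='valid', dx=1, dy=1):
        # ishape: (batch, channels, img_rows, img_cols)
        # kshape: (nfilters, channels, kern_rows, kern_cols)
        # dx, dy: row/column subsampling, as in the C code above.
        def ceil_intdiv(a, b):
            return -(-a // b)  # ceiling division, mirroring ceil_intdiv
        bs, ch, ir, ic = ishape
        nf, _, kr, kc = kshape
        if mode == 'full':
            # 'full' pads by kernel size - 1: padH = kr - 1, padW = kc - 1
            logical_rows = ir + kr - 1
            logical_cols = ic + kc - 1
        else:
            # 'valid' (assumed; this branch is outside the hunk shown above)
            logical_rows = ir - kr + 1
            logical_cols = ic - kc + 1
        return (bs, nf, ceil_intdiv(logical_rows, dx), ceil_intdiv(logical_cols, dy))

    # e.g. a 5x7 image with a 2x3 kernel in 'full' mode gives a 6x9 output:
    assert gpu_corr_mm_out_shape((1, 1, 5, 7), (1, 1, 2, 3), 'full') == (1, 1, 6, 9)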
theano/sandbox/cuda/caffe_common.hpp
@@ -30,12 +30,6 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <cuda.h>
 #include <driver_types.h>  // cuda driver types
 
-// CUDA: grid stride looping
-#define CUDA_KERNEL_LOOP(i, n) \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
-       i < (n); \
-       i += blockDim.x * gridDim.x)
-
 // CUDA: thread number configuration.
 // Use 1024 threads per block, which requires cuda sm_2x or above,
 // or fall back to attempt compatibility (best of luck to you).
theano/sandbox/cuda/conv_gemm.cu
@@ -22,30 +22,44 @@ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
+// Reference code: https://github.com/torch/cunn/blob/master/SpatialConvolutionMM.cu
 #undef _GLIBCXX_ATOMIC_BUILTINS
 #include <Python.h>
 #include "cuda_ndarray.cuh"
 #include "caffe_common.hpp"
 
+// CUDA: grid stride looping
+#define CUDA_KERNEL_LOOP(i, n) \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
+       i < (n); \
+       i += blockDim.x * gridDim.x)
+
+// Use 1024 threads per block, which requires cuda sm_2x or above
+const int CUDA_NUM_THREADS = 1024;
+
+// CUDA: number of blocks for threads.
+inline int GET_BLOCKS(const int N) {
+  return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
+}
+
 // Kernel for fast unfold+copy
 // (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu)
-// Reference code: https://github.com/torch/cunn/blob/master/SpatialConvolutionMM.cu
 __global__ void im2col_kernel(const int n, const float* data_im,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, const int height_col, const int width_col,
+    const int height, const int width, const int ksize_h, const int ksize_w, const int pad_h,
+    const int pad_w, const int stride_h, const int stride_w, const int height_col, const int width_col,
     float* data_col) {
   CUDA_KERNEL_LOOP(index, n) {
     int w_out = index % width_col;
     index /= width_col;
     int h_out = index % height_col;
     int channel_in = index / height_col;
-    int channel_out = channel_in * ksize * ksize;
-    int h_in = h_out * stride - pad;
-    int w_in = w_out * stride - pad;
+    int channel_out = channel_in * ksize_h * ksize_w;
+    int h_in = h_out * stride_h - pad_h;
+    int w_in = w_out * stride_w - pad_w;
     data_col += (channel_out * height_col + h_out) * width_col + w_out;
     data_im += (channel_in * height + h_in) * width + w_in;
-    for (int i = 0; i < ksize; ++i) {
-      for (int j = 0; j < ksize; ++j) {
+    for (int i = 0; i < ksize_h; ++i) {
+      for (int j = 0; j < ksize_w; ++j) {
         int h = h_in + i;
         int w = w_in + j;
         *data_col = (h >= 0 && w >= 0 && h < height && w < width) ?
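The CUDA_KERNEL_LOOP macro and the GET_BLOCKS helper moved here from caffe_common.hpp. They implement the usual grid-stride loop: each thread starts at its global index and strides by the total number of launched threads, so the kernel covers any n even when the grid is smaller than the work. A Python sketch of those semantics (names are mine, for illustration only):

    def cuda_kernel_loop(block_idx, block_dim, thread_idx, grid_dim, n):
        # Indices visited by one thread under CUDA_KERNEL_LOOP(index, n):
        # start at the global thread id, stride by the total thread count.
        start = block_idx * block_dim + thread_idx
        stride = block_dim * grid_dim
        return list(range(start, n, stride))

    def get_blocks(n, cuda_num_threads=1024):
        # Mirrors GET_BLOCKS: ceiling division of n by the threads per block.
        return (n + cuda_num_threads - 1) // cuda_num_threads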
@@ -57,20 +71,19 @@ __global__ void im2col_kernel(const int n, const float* data_im,
 }
 
 void im2col(const float* data_im, const int channels,
-    const int height, const int width, const int ksize, const int pad,
-    const int stride, float* data_col) {
+    const int height, const int width, const int ksize_h, const int ksize_w, const int pad_h,
+    const int pad_w, const int stride_h, const int stride_w, float* data_col) {
   // We are going to launch channels * height_col * width_col kernels, each
   // kernel responsible for copying a single-channel grid.
-  int height_col = (height + 2 * pad - ksize) / stride + 1;
-  int width_col = (width + 2 * pad - ksize) / stride + 1;
+  int height_col = (height + 2 * pad_h - ksize_h) / stride_h + 1;
+  int width_col = (width + 2 * pad_w - ksize_w) / stride_w + 1;
   int num_kernels = channels * height_col * width_col;
   // Launch
-  im2col_kernel <<<CAFFE_GET_BLOCKS(num_kernels), CAFFE_CUDA_NUM_THREADS>>> (
-      num_kernels, data_im, height, width, ksize,
-      pad, stride,
+  im2col_kernel <<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>> (
+      num_kernels, data_im, height, width, ksize_h, ksize_w,
+      pad_h, pad_w, stride_h, stride_w,
       height_col, width_col, data_col);
 }
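For cross-checking the rectangular indexing, here is a small NumPy reference im2col with the same ksize_h/ksize_w, pad_h/pad_w, stride_h/stride_w parameters. This is my own sketch of what the kernel computes, not code from the patch; out-of-bounds (padded) positions are filled with zero, matching the bounds check in im2col_kernel.

    import numpy

    def im2col_ref(data_im, ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w):
        # data_im: one image, shape (channels, height, width).
        # Returns (channels * ksize_h * ksize_w, height_col * width_col).
        channels, height, width = data_im.shape
        height_col = (height + 2 * pad_h - ksize_h) // stride_h + 1
        width_col = (width + 2 * pad_w - ksize_w) // stride_w + 1
        cols = numpy.zeros((channels * ksize_h * ksize_w, height_col * width_col),
                           dtype=data_im.dtype)
        for c in range(channels):
            for i in range(ksize_h):
                for j in range(ksize_w):
                    # Same row layout as channel_out = channel_in * ksize_h * ksize_w:
                    row = (c * ksize_h + i) * ksize_w + j
                    for h_out in range(height_col):
                        for w_out in range(width_col):
                            h = h_out * stride_h - pad_h + i
                            w = w_out * stride_w - pad_w + j
                            if 0 <= h < height and 0 <= w < width:
                                cols[row, h_out * width_col + w_out] = data_im[c, h, w]
        return cols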
@@ -79,7 +92,7 @@ void im2col(const float* data_im, const int channels,
 CudaNdarray* corrMM(const CudaNdarray *input,
                     CudaNdarray *weight,
                     CudaNdarray *output,
-                    int padding = 0)
+                    int padH, int padW = 0)
 {
     cublasStatus_t status;
@@ -103,21 +116,6 @@ CudaNdarray* corrMM(const CudaNdarray *input,
     // filters: (number of filters, nInputPlane, rows, columns)
     int nOutputPlane = CudaNdarray_HOST_DIMS(weight)[0];
     long batchSize = CudaNdarray_HOST_DIMS(input)[0];
-    if (CudaNdarray_HOST_DIMS(input)[2] != CudaNdarray_HOST_DIMS(input)[3]){
-        PyErr_Format(PyExc_ValueError,
-                     "GpuCorrMM support only square images. Got %dx%d images\n",
-                     CudaNdarray_HOST_DIMS(input)[2],
-                     CudaNdarray_HOST_DIMS(input)[3]);
-        return NULL;
-    }
-    if (kW != kH){
-        PyErr_Format(PyExc_ValueError,
-                     "GpuCorrMM support only square kernel. Got %dx%d kernel\n",
-                     kW, kH);
-        return NULL;
-    }
     if (CudaNdarray_HOST_DIMS(input)[1] != CudaNdarray_HOST_DIMS(weight)[1]){
         PyErr_SetString(PyExc_ValueError,
                         "GpuCorrMM images and kernel must have the same stack size\n"
@@ -126,18 +124,20 @@ CudaNdarray* corrMM(const CudaNdarray *input,
     }
     long inputHeight = CudaNdarray_HOST_DIMS(input)[2];
     long inputWidth = CudaNdarray_HOST_DIMS(input)[3];
-    long outputWidth = (inputWidth + 2*padding - kW) / dW + 1;
-    long outputHeight = (inputHeight + 2*padding - kH) / dH + 1;
+    long outputWidth = (inputWidth + 2*padW - kW) / dW + 1;
+    long outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
     // check output, size (batchSize, nOutputPlane,
     // outputHeight, outputWidth);
     if (batchSize != CudaNdarray_HOST_DIMS(output)[0] ||
         nOutputPlane != CudaNdarray_HOST_DIMS(output)[1] ||
         outputHeight != CudaNdarray_HOST_DIMS(output)[2] ||
         outputWidth != CudaNdarray_HOST_DIMS(output)[3]){
         PyErr_SetString(PyExc_ValueError,
-                        "GpuCorrMM outputs parameter don't have the good shape\n"
-        );
+                        "GpuCorrMM outputs parameter don't have the good shape");
+        printf("GpuCorrMM outputs parameter don't have the good shape %d %d %d %d, %d %d %d %d\n",
+               batchSize, nOutputPlane, outputHeight, outputWidth, CudaNdarray_HOST_DIMS(output)[0],
+               CudaNdarray_HOST_DIMS(output)[1], CudaNdarray_HOST_DIMS(output)[2],
+               CudaNdarray_HOST_DIMS(output)[3]);
         return NULL;
     }
     // Create temporary columns
@@ -158,7 +158,7 @@ CudaNdarray* corrMM(const CudaNdarray *input,
             // 1. Extract columns:
             im2col(
                 input->devdata + elt*ip_stride,
-                nInputPlane, inputWidth, inputHeight, kW, padding, dW,
+                nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
                 columns->devdata
             );
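corrMM itself follows the standard im2col + GEMM recipe: each image in the batch is unfolded into a columns matrix (the per-element im2col call above), and one matrix multiply with the flattened filters then produces all output channels at once. A hedged NumPy sketch under that reading, reusing im2col_ref from the previous sketch; the function name and the row-major filter flattening are my assumptions, not code from the patch.

    import numpy

    def corr_mm_ref(input, weight, pad_h=0, pad_w=0, stride_h=1, stride_w=1):
        # input:  (batch, channels, height, width)
        # weight: (nfilters, channels, kern_h, kern_w)
        b, c, h, w = input.shape
        nf, _, kh, kw = weight.shape
        out_h = (h + 2 * pad_h - kh) // stride_h + 1
        out_w = (w + 2 * pad_w - kw) // stride_w + 1
        w_mat = weight.reshape(nf, c * kh * kw)   # flatten filters for the GEMM
        out = numpy.empty((b, nf, out_h, out_w), dtype=input.dtype)
        for elt in range(b):                      # one im2col + GEMM per image
            cols = im2col_ref(input[elt], kh, kw, pad_h, pad_w, stride_h, stride_w)
            out[elt] = numpy.dot(w_mat, cols).reshape(nf, out_h, out_w)
        return out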
theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
@@ -7,6 +7,7 @@ import unittest
 import numpy
+import scipy
 from nose.plugins.skip import SkipTest
 
 imported_scipy_convolve2d = False
@@ -114,7 +115,8 @@ def py_conv_scipy(img, kern, mode, subsample):
     for b in xrange(out.shape[0]):
         for k in xrange(out.shape[1]):
             for s in xrange(img.shape[1]):
-                out[b, k, :, :] += convolve2d(img[b, s, :, :],
+                #convolve2d or correlate
+                out[b, k, :, :] += scipy.signal.convolve2d(img[b, s, :, :],
                                               kern[k, s, :, :],
                                               mode)
     return out[:, :, ::subsample[0], ::subsample[1]]
@@ -830,47 +832,47 @@ def test_gemm():
     input: (batch size, channels, rows, columns)
     filters: (number of filters, channels, rows, columns)
     """
-    for mode in ['valid', 'full']:
+    for mode in ['full', 'valid']:
         print 'Testing mode: ' + mode
         for bs in range(1, 5):
             for ch in range(1, 4):
                 for nf in range(1, 4):
-                    for rImg in range(5, 9):
-                        for rFlt in range(2, 4):
-                            ishape = (bs, ch, rImg, rImg)
-                            kshape = (nf, ch, rFlt, rFlt)
-                            print "ishape: ", ishape
-                            print "kshape: ", kshape
-                            subsample = (1, 1)
-
-                            npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32')
-                            npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32')
-                            i = cuda_tensor4()
-                            k = cuda_tensor4()
-                            t2 = None
-                            t0 = time.time()
-                            cpuval = py_conv(npy_img, npy_kern, mode, subsample)
-                            t1 = time.time()
-                            op = theano.sandbox.cuda.blas.GpuCorrMM(border_mode=mode)(i, k)
-                            f = theano.function([i, k], op, mode=theano_mode)
-                            for k in range(npy_kern.shape[0]):
-                                for s in range(npy_kern.shape[1]):
-                                    npy_kern[k, s, :, :] = numpy.rot90(npy_kern[k, s, :, :], 2)
-                            gpuval = f(npy_img, npy_kern)
-                            t2 = time.time()
-                            gpuval = numpy.asarray(gpuval)
-                            rval = numpy.allclose(cpuval, gpuval, rtol=1e-4)
-                            assert (rval == True)
-                            print 'Test Passed'
+                    for rImg1 in range(5, 9):
+                        for rImg2 in range(5, 9):
+                            for rFlt1 in range(2, 4):
+                                for rFlt2 in range(2, 4):
+                                    ishape = (bs, ch, rImg1, rImg2)
+                                    kshape = (nf, ch, rFlt1, rFlt2)
+                                    print "ishape: ", ishape
+                                    print "kshape: ", kshape
+                                    subsample = (1, 1)
+
+                                    npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32')
+                                    npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32')
+                                    i = cuda_tensor4()
+                                    k = cuda_tensor4()
+                                    t2 = None
+                                    t0 = time.time()
+                                    cpuval = py_conv(npy_img, npy_kern, mode, subsample)
+                                    t1 = time.time()
+                                    op = theano.sandbox.cuda.blas.GpuCorrMM(border_mode=mode)(i, k)
+                                    f = theano.function([i, k], op, mode=theano_mode)
+                                    npy_kern = npy_kern[:,:,::-1,::-1]
+                                    gpuval = f(npy_img, npy_kern)
+                                    t2 = time.time()
+                                    gpuval = numpy.asarray(gpuval)
+                                    rval = numpy.allclose(cpuval, gpuval, rtol=1e-4)
+                                    assert (rval == True)
+                                    print 'Test Passed'
 
 
 def benchmark():
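One detail worth noting in the test change: the per-slice numpy.rot90(..., 2) loops were replaced by a single slice reversal, npy_kern[:,:,::-1,::-1]. The two are equivalent, since reversing the last two axes rotates every 2-D kernel slice by 180 degrees, which is what turns the correlation computed by GpuCorrMM into the convolution computed by py_conv. A quick self-contained check:

    import numpy

    kern = numpy.random.rand(3, 2, 4, 5).astype('float32')  # non-square (kh=4, kw=5)
    flipped = kern[:, :, ::-1, ::-1]
    for k in range(kern.shape[0]):
        for s in range(kern.shape[1]):
            assert numpy.allclose(flipped[k, s], numpy.rot90(kern[k, s], 2))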