caffe conv kernel for theano. tests work, but needs integration and some cleanup

fb660352 · Arjun Jain · 87d3e7c1 · fb660352 · fb660352 · fb660352
--- a/theano/sandbox/cuda/blas.py
+++ b/theano/sandbox/cuda/blas.py
@@ -497,6 +497,246 @@ gpu_ger_no_inplace = GpuGer(inplace=False)
 gpu_ger_inplace = GpuGer(inplace=True)


+class GpuConvMM(GpuOp):
+    """
+    Author: Arjun Jain
+    Implement the caffe convolution
+    """
+    @staticmethod
+    def logical_output_shape_2d(imshp, kshp, mode):
+        if mode == 'valid':
+            return imshp[0] - kshp[0] + 1, imshp[1] - kshp[1] + 1
+        if mode == 'full':
+            return imshp[0] + kshp[0] - 1, imshp[1] + kshp[1] - 1
+        raise ValueError(mode)
+
+    def __init__(self, border_mode,
+            subsample=(1, 1),
+            logical_img_hw=None,
+            logical_kern_hw=None,
+            logical_kern_align_top=True,
+            version=-1,
+            verbose=0,
+            kshp=None,
+            imshp=None,
+            max_threads_dim0=None, pad=0):
+        """
+        :param version: each version of c_code implements many kernel for the
+                        convolution. By default we try to guess the best one.
+                        You can force one version with this parameter. This
+                        parameter is used by the tests.
+        :param verbose: for value of 1,2 and 3. Print more information during
+                        the execution of the convolution. Mostly used for
+                        optimization or debugging.
+        :param kshp:    The size of the kernel. If provided, can generate
+                        faster code. If the GpuConv op is automatically
+                        inserted,
+                        we take its value automatically from the Conv op.
+        :param imshp:   The size of the image. Not used for code generation but
+                        allows to select an experimental new version in another
+                        repo.
+        :param max_threads_dim0: The maximum number of threads for the
+                        block size dimensions 0 (blockDim.x) used by the
+                        GPU function.
+
+        """
+        self.border_mode = border_mode
+        self.subsample = subsample
+        if logical_img_hw is not None:
+            h, w = logical_img_hw
+            #TODO: reconsider this... since shapes are not given in
+            # constructor, maybe a multiplier + offset is a more
+            # appropriate way of passing this logical grid
+            logical_img_hw = tuple(logical_img_hw)
+        self.logical_img_hw = logical_img_hw
+        if logical_kern_hw is not None:
+            h, w = logical_kern_hw
+            #TODO: reconsider this... since shapes are not given in
+            # constructor, maybe a multiplier + offset is a more
+            # appropriate way of passing this logical grid
+            logical_kern_hw = tuple(logical_kern_hw)
+        self.logical_kern_hw = logical_kern_hw
+        self.logical_kern_align_top = logical_kern_align_top
+        self.version = version
+        self.verbose = verbose
+        self.kshp = kshp
+        self.imshp = imshp
+        self.max_threads_dim0 = max_threads_dim0
+        self.pad = pad
+
+    def __eq__(self, other):
+        return type(self) == type(other) \
+            and self.border_mode == other.border_mode \
+            and self.subsample == other.subsample \
+            and self.logical_img_hw == other.logical_img_hw \
+            and self.logical_kern_hw == other.logical_kern_hw \
+            and self.logical_kern_align_top == other.logical_kern_align_top \
+            and self.version == other.version \
+            and self.verbose == other.verbose \
+            and self.kshp == other.kshp\
+            and self.imshp == other.imshp\
+            and self.max_threads_dim0 == other.max_threads_dim0
+
+    def __setstate__(self, d):
+        self.__dict__.update(d)
+        if not hasattr(self, "imshp"):
+            self.imshp = None
+        if not hasattr(self, "max_threads_dim0"):
+            self.max_threads_dim0 = None
+
+    def __hash__(self):
+        # don't use hash(self.version) as hash(-1)==-2 and
+        # hash(-2)==-2 in python!
+        return hash(type(self)) \
+            ^ hash(self.border_mode) \
+            ^ hash(self.subsample) \
+            ^ hash(self.logical_img_hw) \
+            ^ hash(self.logical_kern_hw) \
+            ^ hash(self.logical_kern_align_top) \
+            ^ self.version \
+            ^ hash(self.verbose) \
+            ^ hash(self.kshp)\
+            ^ hash(self.imshp)\
+            ^ hash(self.max_threads_dim0)
+
+    def __str__(self):
+        return '%s{%s, %s, %s, %s, %s, %s, %s}' % (
+            self.__class__.__name__,
+            self.border_mode,
+            str(self.subsample),
+            str(self.logical_img_hw),
+            str(self.logical_kern_hw),
+            str(self.logical_kern_align_top),
+            str(self.imshp),
+            str(self.kshp))
+
+    def make_node(self, img, kern):
+        if img.type.ndim != 4:
+            raise TypeError('img must be 4D tensor')
+        if kern.type.ndim != 4:
+            raise TypeError('kern must be 4D tensor')
+
+        broadcastable = [img.type.broadcastable[0], kern.type.broadcastable[0],
+                         False, False]
+        return Apply(self, [img, kern], [CudaNdarrayType(broadcastable)()])
+
+    def flops(self, inputs, outputs):
+        """ Useful with the hack in profilemode to print the MFlops"""
+        images, kerns = inputs
+        out, = outputs
+        assert images[1] == kerns[1]
+        flops = 0
+        if self.border_mode == "valid":
+            # nb mul and add by output pixel
+            flops = kerns[2] * kerns[3] * 2
+            # nb flops by output image
+            flops *= out[2] * out[3]
+            # nb patch multiplied
+            flops *= images[1] * kerns[0] * images[0]
+        else:
+            flops = (images[0] * kerns[0] * images[1] *
+                     kerns[2] * kerns[3] *
+                     images[2] * images[3] * 2)
+        return flops
+
+    def make_thunk(self, node, storage_map, compute_map, no_recycling):
+        node_ = copy.copy(node)
+        assert node.op is node_.op
+        if node_.op.max_threads_dim0 is None:
+            cuda = theano.sandbox.cuda
+            device_id = cuda.use.device_number
+            if device_id is None:
+                cuda.use("gpu",
+                         force=False,
+                         default_to_move_computation_to_gpu=False,
+                         move_shared_float32_to_gpu=False,
+                         enable_cuda=False,
+                         test_driver=True)
+                device_id = cuda.use.device_number
+            cuda_ndarray = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray
+            prop = cuda_ndarray.device_properties(device_id)
+            node_.op.max_threads_dim0 = prop['maxThreadsDim0']
+        return super(GpuConv, node_.op).make_thunk(node_, storage_map,
+                                                   compute_map, no_recycling)
+
+    def c_compile_args(self):
+        nb = 0
+        if self.kshp is not None:
+            nb = self.kshp[1]
+        return ['-DTHEANO_KERN_WID=' + str(nb), '-g'] # ,'-g','-G']
+
+    def c_headers(self):
+        return ['cuda_ndarray.cuh', '<stdio.h>']
+
+    def c_code_cache_version(self):
+        # raise this whenever modifying any of the support_code_files
+        return (0, 21)
+
+    def c_support_code_apply(self, node, nodename):
+        # REMEMBER TO RAISE c_code_cache_version when changing any of
+        # these files
+        files = [ 'conv_gemm.cu']
+        codes = [open(os.path.join(os.path.split(__file__)[0], f)).read()
+                for f in files]
+        return reduce(str.__add__, codes)
+
+    def c_code(self, node, nodename, inp, out_, sub):
+        img, kern = inp
+        out, = out_
+        dx = self.subsample
+        dy = self.subsample
+        border_mode = self.border_mode
+        version = self.version
+        verbose = self.verbose
+        sub = sub.copy()
+        max_threads_dim0 = self.max_threads_dim0
+        pad = self.pad
+
+        if max_threads_dim0 is None:
+            raise NotImplementedError("GpuConv.c_code should not be called "
+                                      "directly. It should be called by "
+                                      "make_thunk() that add some information "
+                                      "related to the selected GPU.")
+        sub.update(locals())
+        
+        return """
+    //Mandatory args
+    const char *mode_str = "%(border_mode)s";
+
+    //Optional args
+    int version = %(version)s;
+    int verbose = %(verbose)s;
+    int dx = %(dx)s;
+    int dy = %(dy)s;
+
+    int mode;
+    if (strcmp(mode_str, "full") == 0)
+    {
+        mode = 0;
+    }
+    else if (strcmp(mode_str, "valid") == 0)
+    {
+        mode = 1;
+    }
+    else
+    {
+        PyErr_SetString(PyExc_ValueError,
+                        "mode must be one of 'full' or 'valid'");
+        return NULL;
+    }
+    //TODO: Send self.pad, stride, etc
+    CudaNdarray * out2 = validMM(%(img)s, %(kern)s, %(out)s);
+    // TODO, make out be decref before we alloc out2!
+    Py_XDECREF(%(out)s);
+    %(out)s = out2;
+
+    if (%(out)s==NULL){
+     %(fail)s
+    }
+
+""" % sub
+
+
 ##
 # Not really a BLAS operation, but whatever.
 #

--- a/theano/sandbox/cuda/caffe_common.hpp
+++ b/theano/sandbox/cuda/caffe_common.hpp
+// Copyright 2014 BVLC and contributors.
+
+#ifndef CAFFE_COMMON_HPP_
+#define CAFFE_COMMON_HPP_
+
+//#include <boost/shared_ptr.hpp>
+#include <cublas_v2.h>
+#include <cuda.h>
+#include <curand.h>
+#include <driver_types.h>  // cuda driver types
+//#include <glog/logging.h>
+
+// CUDA: grid stride looping
+#define CUDA_KERNEL_LOOP(i, n) \
+for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
+i < (n); \
+i += blockDim.x * gridDim.x)
+
+// CUDA: thread number configuration.
+// Use 1024 threads per block, which requires cuda sm_2x or above,
+// or fall back to attempt compatibility (best of luck to you).
+#if __CUDA_ARCH__ >= 200
+    const int CAFFE_CUDA_NUM_THREADS = 1024;
+#else
+    const int CAFFE_CUDA_NUM_THREADS = 512;
+#endif
+
+// CUDA: number of blocks for threads.
+inline int CAFFE_GET_BLOCKS(const int N) {
+  return (N + CAFFE_CUDA_NUM_THREADS - 1) / CAFFE_CUDA_NUM_THREADS;
+}
+
+#endif  // CAFFE_COMMON_HPP_
--- a/theano/sandbox/cuda/conv_gemm.cu
+++ b/theano/sandbox/cuda/conv_gemm.cu
+// Copyright 2014 BVLC and contributors.
+#undef _GLIBCXX_ATOMIC_BUILTINS
+#include <Python.h>
+#include "cuda_ndarray.cuh"
+#include "caffe_common.hpp"
+// Author: Arjun Jain
+// Kernel for fast unfold+copy
+// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu)
+__global__ void im2col_kernel(const int n, const float* data_im,
+                              const int height, const int width, const int ksize, const int pad,
+                              const int stride, const int height_col, const int width_col,
+                              float* data_col) {
+  CUDA_KERNEL_LOOP(index, n) {
+    int w_out = index % width_col;
+    index /= width_col;
+    int h_out = index % height_col;
+    int channel_in = index / height_col;
+    int channel_out = channel_in * ksize * ksize;
+    int h_in = h_out * stride - pad;
+    int w_in = w_out * stride - pad;
+    data_col += (channel_out * height_col + h_out) * width_col + w_out;
+    data_im += (channel_in * height + h_in) * width + w_in;
+    for (int i = 0; i < ksize; ++i) {
+      for (int j = 0; j < ksize; ++j) {
+        int h = h_in + i;
+        int w = w_in + j;
+        *data_col = (h >= 0 && w >= 0 && h < height && w < width) ?
+                                             data_im[i * width + j] : 0;
+        data_col += height_col * width_col;
+      }
+    }
+  }
+}
+
+void im2col(const float* data_im, const int channels,
+            const int height, const int width, const int ksize, const int pad,
+            const int stride, float* data_col) {
+  // We are going to launch channels * height_col * width_col kernels, each
+  // kernel responsible for copying a single-channel grid.
+  int height_col = (height + 2 * pad - ksize) / stride + 1;
+  int width_col = (width + 2 * pad - ksize) / stride + 1;
+  int num_kernels = channels * height_col * width_col;
+    
+  // Launch
+  im2col_kernel <<<CAFFE_GET_BLOCKS(num_kernels), CAFFE_CUDA_NUM_THREADS>>> (
+     num_kernels, data_im, height, width, ksize, 
+     pad, stride, 
+     height_col, width_col, data_col
+     );
+}
+
+
+
+CudaNdarray* validMM(const CudaNdarray *input, 
+				      CudaNdarray *weight,
+				      CudaNdarray *output) 
+{
+
+    // TODO: This needs to be done in the singleton!
+    // Initialize CUBLAS
+    cublasHandle_t handle;
+  	cublasStatus_t status = cublasCreate(&handle);
+  	if (status != CUBLAS_STATUS_SUCCESS) {
+      		std::cerr << "!!!! CUBLAS initialization error\n";
+	}
+
+    if (input->nd != 4)
+    {
+        PyErr_SetString(PyExc_ValueError, "required input of 4D");
+    }
+    
+    if (weight->nd != 4)
+    {
+        PyErr_SetString(PyExc_ValueError, "required weight of 4D");
+    }
+        
+     // Reference code: https://github.com/torch/cunn/blob/master/SpatialConvolutionMM.cu
+     // TODO: stride(dW, dH) and padding as function parameter
+     int dH = 1; 
+     int dW = 1;
+     int padding = 0; 
+     int kH = CudaNdarray_HOST_DIMS(weight)[2];
+     int kW = CudaNdarray_HOST_DIMS(weight)[3];
+     int nInputPlane = CudaNdarray_HOST_DIMS(input)[1]; 
+     // filters: (number of filters, nInputPlane, rows, columns)
+     int nOutputPlane = CudaNdarray_HOST_DIMS(weight)[0];
+     long batchSize = CudaNdarray_HOST_DIMS(input)[0];
+     assert(kW == kH); //filters must be square (kW == kH)
+     assert(dW == dH); //stride must be square (dW == dH)
+     long inputHeight  = CudaNdarray_HOST_DIMS(input)[2];
+     long inputWidth   = CudaNdarray_HOST_DIMS(input)[3];
+     long outputWidth  = (inputWidth + 2*padding - kW) / dW + 1;
+     long outputHeight = (inputHeight + 2*padding - kH) / dH + 1;
+     // Allocate output, size (batchSize, nOutputPlane, 
+     //		outputHeight, outputWidth);
+     int out_dim[4];
+     out_dim[0] = batchSize;
+     out_dim[1] = nOutputPlane;
+     out_dim[2] = outputHeight;
+     out_dim[3] = outputWidth;
+     
+     output = (CudaNdarray*)CudaNdarray_NewDims(4,out_dim);
+     // Create temporary columns
+     int col_dim[2];
+     col_dim[0] = nInputPlane*kW*kH;
+     col_dim[1]= outputHeight*outputWidth;
+     CudaNdarray* columns = (CudaNdarray*)CudaNdarray_NewDims(2,col_dim);
+
+
+      int ip_stride = CudaNdarray_HOST_DIMS(input)[1] *
+        CudaNdarray_HOST_DIMS(input)[2] * 
+        CudaNdarray_HOST_DIMS(input)[3];
+        
+     int op_stride = CudaNdarray_HOST_DIMS(output)[1] *
+        CudaNdarray_HOST_DIMS(output)[2] * 
+        CudaNdarray_HOST_DIMS(output)[3];
+
+     // For each elt in batch, do:
+     for (int elt = 0; elt < batchSize; elt ++) {
+     // Matrix mulitply per output:
+       
+        // 1. Extract columns:
+        im2col(
+           input->devdata + elt*ip_stride,
+           nInputPlane, inputWidth, inputHeight, kW, padding, dW, 
+           columns->devdata
+         );
+      
+         
+	  // M,N,K are dims of matrix A and B
+	    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+
+		float alpha = 1.0f; float beta = 0.0f;
+         int m = CudaNdarray_HOST_DIMS(columns)[1];
+         int n = CudaNdarray_HOST_DIMS(weight)[1];
+         int k = CudaNdarray_HOST_DIMS(columns)[0];
+         
+         //Caffe::getRef().getCublasHandle().get()
+         status = cublasSgemm(handle,
+                CUBLAS_OP_N, CUBLAS_OP_N,
+                m, n, k,
+                &alpha,
+                columns->devdata, m,
+                weight->devdata, k,
+                &beta,
+                output->devdata + elt * op_stride, m
+                );
+
+  
+		  cudaError_t err = cudaGetLastError();
+		  if (err != cudaSuccess) {
+		    printf("error in validMM: %s\n", cudaGetErrorString(err));
+		  }
+
+      }
+    
+    // TODO: How is columns and output deallocated? 
+    // device_free(columns->devdata);
+    // TODO: I did not kill the cublas context. If it comes from 
+    // the singleton, we dont need to kill it.
+
+ 
+  return output;
+}
+               
+               
+				      
+    
\ No newline at end of file
--- a/theano/sandbox/cuda/tests/test_conv_gemm.py
+++ b/theano/sandbox/cuda/tests/test_conv_gemm.py
+"""
+Tests for GPU convolution
+"""
+import sys
+import time
+import unittest
+import matplotlib.pyplot as plt
+
+
+import numpy
+
+from nose.plugins.skip import SkipTest
+imported_scipy_convolve2d = False
+try:
+    from scipy.signal import correlate
+    imported_scipy_convolve2d = True
+except ImportError:
+    pass
+
+
+import theano
+from theano import tensor
+from theano.gof.python25 import any
+from theano.tests.unittest_tools import seed_rng
+
+# Skip test if cuda_ndarray is not available.
+import theano.sandbox.cuda as cuda_ndarray
+if cuda_ndarray.cuda_available == False:
+    raise SkipTest('Optional package cuda disabled')
+
+#needed as the gpu conv don't have a perform implementation.
+if theano.config.mode == 'FAST_COMPILE':
+    theano_mode = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
+else:
+    theano_mode = theano.compile.mode.get_default_mode().including('gpu')
+
+cuda_tensor4 = cuda_ndarray.CudaNdarrayType([False] * 4)
+cuda_tensor2 = cuda_ndarray.CudaNdarrayType([False] * 2)
+device_id = theano.sandbox.cuda.use.device_number
+if device_id is None:
+    cuda_ndarray.shared_constructor(numpy.zeros(2, dtype='float32'))
+device_id = theano.sandbox.cuda.use.device_number
+if device_id is None:
+    cuda.use("gpu",
+             force=False,
+             default_to_move_computation_to_gpu=False,
+             move_shared_float32_to_gpu=False,
+             enable_cuda=False,
+             test_driver=True)
+    device_id = theano.sandbox.cuda.use.device_number
+cuda_ndarray = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray
+device_prop = cuda_ndarray.device_properties(device_id)
+
+  
+def py_corr_scipy(img, kern, mode, subsample):
+    assert img.shape[1] == kern.shape[1]
+    if mode == 'valid':
+        outshp = (img.shape[0], kern.shape[0],
+                img.shape[2] - kern.shape[2] + 1,
+                img.shape[3] - kern.shape[3] + 1)
+    else:
+        outshp = (img.shape[0], kern.shape[0],
+                img.shape[2] + kern.shape[2] - 1,
+                img.shape[3] + kern.shape[3] - 1)
+    out = numpy.zeros(outshp, dtype='float32')
+    for b in xrange(out.shape[0]):
+        for k in xrange(out.shape[1]):
+            for s in xrange(img.shape[1]):
+                out[b, k, :, :] += correlate(img[b, s, :, :],
+                                              kern[k, s, :, :],
+                                              mode)
+    return out
+
+
+def _params_allgood_header():
+    print "ishape kshape #Mflops CPU Mflops GPU Mflops Speedup"
+     
+kH = 3
+kW = 3
+nInputPlane = 3 #channels
+nOutputPlane = 2
+padding = 0
+batchSize = 4 
+
+
+
+inputWidth   = 7 #im.shape[1] 
+inputHeight  = 7 #im.shape[0] 
+ishape = (batchSize, nInputPlane, inputHeight, inputWidth)
+kshape = (nOutputPlane, nInputPlane, kH, kW)
+print 'Image shape', ishape
+print 'Kernel shape', kshape
+
+im = numpy.random.rand(*ishape) + 1
+#plt.imread('lena.bmp')
+
+img_stride = (1, 1)
+kern_stride = (1, 1)
+outputWidth  = (inputWidth + 2*padding - kW) / img_stride[1] + 1
+outputHeight = (inputHeight + 2*padding - kH) / img_stride[0] + 1
+oshape=(batchSize, nInputPlane, outputHeight, outputWidth)
+
+npy_img = theano._asarray(numpy.random.rand(*ishape) + 1,
+                          dtype='float32')
+
+npy_kern = theano._asarray(numpy.random.rand(*kshape) - 2,
+                           dtype='float32')
+                
+img = cuda_ndarray.CudaNdarray(npy_img)
+kern = cuda_ndarray.CudaNdarray(npy_kern)
+
+ #temporary columns
+cshape = (nInputPlane*kW*kH, outputHeight*outputWidth)
+print 'Columns shape: ', cshape
+oshape=(batchSize, nInputPlane, outputHeight, outputWidth)
+print 'Output shape: ', oshape
+
+subsample = 1
+mode = 'valid'
+t0 = time.time()
+cpuval = py_corr_scipy(npy_img, npy_kern, mode, subsample)
+t1 = time.time()
+i = cuda_tensor4()
+k = cuda_tensor4()
+
+
+op = theano.sandbox.cuda.blas.GpuConvMM(border_mode=mode,
+                                      subsample=(subsample, subsample),
+                                      version=100,
+                                      verbose=2, pad=1)(i, k)
+                                      
+f = theano.function([i, k], op, mode=theano_mode)
+gpuval = f(img, kern)
+t2 = time.time()
+gpuval = numpy.asarray(gpuval)
+
+
+if gpuval.shape != cpuval.shape:
+    print >> sys.stdout, "ERROR: shape mismatch",
+    print >> sys.stdout, gpuval.shape, cpuval.shape
+     
+print '---------------- INPUT VAL -----------------------'
+print npy_img
+print '---------------- kernel -----------------------'
+print npy_kern
+print '---------------- GPU VAL -----------------------'
+print gpuval
+print '---------------- CPU VAL -----------------------'
+print cpuval
+rval = numpy.allclose(cpuval, gpuval, rtol=1e-4)
+print rval
+assert numpy.all(numpy.isfinite(gpuval))
\ No newline at end of file