Convolution layer using cudnn.

66e2fcef · Arnaud Bergeron · ee3e49e4 · 66e2fcef · 66e2fcef · 66e2fcef
--- a/theano/sandbox/cuda/__init__.py
+++ b/theano/sandbox/cuda/__init__.py
@@ -33,6 +33,10 @@ AddConfigVar('cublas.lib',
        """Name of the cuda blas library for the linker.""",
        StrParam('cublas'))

+AddConfigVar('cudnn.lib',
+             """Name of the cuda dnn library for the linker.""",
+             StrParam('cudnn'))
+
 #is_nvcc_available called here to initialize global vars in
 #nvcc_compiler module
 nvcc_compiler.is_nvcc_available()
@@ -155,7 +159,8 @@ if compile_cuda_ndarray and cuda_available:
                            'cuda_ndarray',
                            code,
                            location=cuda_ndarray_loc,
-                            include_dirs=[cuda_path], libs=[config.cublas.lib],
+                            include_dirs=[cuda_path],
+                            libs=[config.cublas.lib, config.cudnn.lib],
                            preargs=['-O3'] + compiler.compile_args())
                    from cuda_ndarray.cuda_ndarray import *
            except Exception, e:

--- a/theano/sandbox/cuda/cuda_ndarray.cu
+++ b/theano/sandbox/cuda/cuda_ndarray.cu
@@ -42,6 +42,7 @@
 #endif

 cublasHandle_t handle = NULL;
+cudnnHandle_t dnn_handle = NULL;

 /////////////////////////
 // Alloc and Free
@@ -3051,6 +3052,8 @@ CudaNdarray_ptr_int_size(PyObject* _unused, PyObject* args)

 static int cublas_init();
 static void cublas_shutdown();
+static int cudnn_init();
+static void cudnn_shutdown();
 // Initialize the gpu.
 // Takes one optional parameter, the device number.
 // If provided, it sets that device to be the active device.
@@ -3117,6 +3120,8 @@ CudaNdarray_gpu_init(PyObject* _unused, PyObject* args)
        }
        if (cublas_init() == -1)
            return NULL;
+        if (cudnn_init() == -1)
+            return NULL;
    }

    Py_INCREF(Py_None);
@@ -3147,6 +3152,7 @@ CudaNdarray_active_device_name(PyObject* _unused, PyObject* _unused_args) {
 PyObject *
 CudaNdarray_gpu_shutdown(PyObject* _unused, PyObject* _unused_args) {
    // Don't handle errors here
+    cudnn_shutdown();
    cublas_shutdown();
    cudaThreadExit();
    g_gpu_context_active = 0; // context has now been closed down
@@ -3607,6 +3613,28 @@ cublas_shutdown()
    handle = NULL;
 }

+static int
+cudnn_init()
+{
+    cudnnStatus_t err;
+    err = cudnnCreate(&dnn_handle);
+    if (err != CUDNN_STATUS_SUCCESS) {
+        PyErr_Format(PyExc_RuntimeError,
+                     "Error initializing cudnn %d", err);
+        return -1;
+    }
+    cudnnSetStream(dnn_handle, NULL);
+    return 0;
+}
+
+static void
+cudnn_shutdown()
+{
+    if (dnn_handle != NULL)
+        cudnnDestroy(dnn_handle);
+    handle = NULL;
+}
+
 int
 CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj)
 {

--- a/theano/sandbox/cuda/cuda_ndarray.cuh
+++ b/theano/sandbox/cuda/cuda_ndarray.cuh
@@ -43,6 +43,7 @@


 #include <cublas_v2.h>
+#include <cudnn.h>

 #ifdef _WIN32
 #ifdef _CUDA_NDARRAY_C
@@ -85,6 +86,8 @@ typedef float real;

 /* Use this handle to make cublas calls */
 extern DllExport cublasHandle_t handle;
+/* and this for cudnn calls */
+extern DllExport cudnnHandle_t dnn_handle;

 /**
 * Allocation and freeing of device memory should go through these functions so that the lib can track memory usage.

--- a/theano/sandbox/cuda/dnn.py
+++ b/theano/sandbox/cuda/dnn.py
+import copy
+import os
+
+import theano
+from theano import Apply
+from theano import tensor
+from theano.compat.six import StringIO
+from theano.sandbox.cuda.type import CudaNdarrayType
+from theano.sandbox.cuda import GpuOp
+from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
+                                           gpu_contiguous)
+from theano.sandbox.cuda.blas import GpuConv
+
+class GpuDnnConv(GpuOp):
+    __props__ = ('border_mode',)
+
+    def __init__(self, border_mode):
+        self.border_mode = border_mode
+
+    def make_node(self, img, kern):
+        if img.type.ndim != 4:
+            raise TypeError('img must be 4D tensor')
+        if kern.type.ndim != 4:
+            raise TypeError('kern must be 4D tensor')
+
+        broadcastable = (img.type.broadcastable[0],
+                         kern.type.broadcastable[0],
+                         False, False)
+
+        return Apply(self, [img, kern], [CudaNdarrayType(broadcastable)()])
+
+    def c_support_code_struct(self, node, struct_id):
+        return """
+cudnnTensor4dDescriptior_t input%(id)d = NULL;
+cudnnTensor4dDescriptior_t output%(id)d = NULL;
+cudnnFilterDescriptor_t kerns%(id)d = NULL;
+cudnnConvolutionDescriptor_t op%(id)d = NULL;
+""" % dict(id=struct_id)
+
+    def c_init_code_struct(self, node, struct_id, sub):
+        return """
+if (cudnnCreateTensor4dDescriptor(&input%(id)d) != CUDNN_STATUS_SUCCESS) {
+  PyErr_SetString(PyExc_MemoryError, "could not allocate tensor4d descriptor (inp)");
+  %(fail)s
+}
+if (cudnnCreateTensor4dDescriptor(&output%(id)d) != CUDNN_STATUS_SUCCESS) {
+  PyErr_SetString(PyExc_MemoryError, "could not allocate tensor4d descriptor (out)");
+  %(fail)s
+}
+if (cudnnCreateFilterDescriptor(&kerns%(id)d) != CUDNN_STATUS_SUCCESS) {
+  PyErr_SetString(PyExc_MemoryError, "could not allocate filter descriptor");
+  %(fail)s
+}
+if (cudnnCreateConvolutionDescriptor(&op%(id)d) != CUDNN_STATUS_SUCCESS) {
+  PyErr_SetString(PyExc_MemoryError, "could not allocate convolution descriptor");
+  %(fail)s
+}
+""" % dict(id=struct_id, fail=sub['fail'])
+
+    def c_cleanup_code_struct(self, node, struct_id):
+        return """
+cudnnDestroyTensor4dDescriptor(input%(id)d);
+input%(id)d = NULL;
+cudnnDestroyTensor4dDescriptor(output%(id)d);
+output%(id)d = NULL;
+cudnnDestroyFilterDescriptor(kerns%(id)d);
+kerns%(id)d = NULL;
+cudnnDestroyConvolutionDescriptor(op%(id)d);
+op%(id)d = NULL;
+""" % dict(id=struct_id)
+
+    def c_code(self, node, name, inputs, outputs, sub):
+        img, kern = inputs
+        out, = outputs
+
+        if self.border_mode == "valid":
+            bmode = 1
+        else:
+            assert self.border_mode == "full"
+            bmode = 0
+
+        return """
+cudnnStatus_t err%(name)s;
+int pad_w%(name)s;
+int pad_h%(name)s;
+err%(name)s = cudnnSetTensor4dDescriptorEx(
+input%(id)d, CUDNN_DATA_FLOAT,
+CudaNdarray_HOST_DIMS(%(img)s)[0],
+CudaNdarray_HOST_DIMS(%(img)s)[1],
+CudaNdarray_HOST_DIMS(%(img)s)[2],
+CudaNdarray_HOST_DIMS(%(img)s)[3],
+CudaNdarray_HOST_STRIDES(%(img)s)[0],
+CudaNdarray_HOST_STRIDES(%(img)s)[1],
+CudaNdarray_HOST_STRIDES(%(img)s)[2],
+CudaNdarray_HOST_STRIDES(%(img)s)[3]
+);
+if (err%(name)s != CUDNN_STATUS_SUCCESS) {
+  PyErr_SetString(PyExc_RuntimeError, "could not set tensor4d descriptor");
+  %(fail)s
+}
+// TODO: make sure the kernels are contiguous or ... BOOM!
+err%(name)s = cudnnSetFilterDescriptor(
+kerns%(id)d, CUDNN_DATA_FLOAT,
+CudaNdarray_HOST_DIMS(%(kerns)s)[0],
+CudaNdarray_HOST_DIMS(%(kerns)s)[1],
+CudaNdarray_HOST_DIMS(%(kerns)s)[2],
+CudaNdarray_HOST_DIMS(%(kerns)s)[3]
+);
+if (err%(name)s != CUDNN_STATUS_SUCCESS) {
+  PyErr_SetString(PyExc_RuntimeError, "could not set filter descriptor");
+  %(fail)s
+}
+if (%(bmode)d == 1) {
+  pad_h%(name)s = 0;
+  pad_w%(name)s = 0;
+} else if (%(bmode)d == 0) {
+  pad_h%(name)s = CudaNdarray_HOST_DIMS(%(kerns)s)[2] - 1;
+  pad_w%(name)s = CudaNdarray_HOST_DIMS(%(kerns)s)[3] - 1;
+} else {
+  PyErr_SetString(PyExc_ValueError, "bad border mode");
+  %(fail)s
+}
+err%(name)s = cudnnSetConvolutionDescriptor(
+op%(id)d, input%(id)d, kerns%(id)d, 
+pad_h%(name)s,
+pad_w%(name)s,
+1, 1, 1, 1,
+CUDNN_CONVOLUTION
+);
+if (err%(name)s != CUDNN_STATUS_SUCCESS) {
+  PyErr_SetString(PyExc_RuntimeError, "could not set op descriptor");
+  %(fail)s
+}
+{
+int out_dims[4];
+err%(name)s = cudnnGetOutputTensor4dDim(
+op%(id)d, CUDNN_CONVOLUTION_FWD,
+&out_dims[0], &out_dims[1],
+&out_dims[2], &out_dims[3]
+);
+if (err%(name)s != CUDNN_STATUS_SUCCESS) {
+  PyErr_SetString(PyExc_RuntimeError, "could not set op descriptor");
+  %(fail)s
+}
+if (CudaNdarray_prep_output(&%(out)s, 4, out_dims) != 0) {
+  %(fail)s
+}
+}
+err%(name)s = cudnnSetTensor4DescriptorEx(
+output%(id)d, CUDNN_DATA_FLOAT,
+CudaNdarray_HOST_DIMS(%(out)s)[0],
+CudaNdarray_HOST_DIMS(%(out)s)[1],
+CudaNdarray_HOST_DIMS(%(out)s)[2],
+CudaNdarray_HOST_DIMS(%(out)s)[3],
+CudaNdarray_HOST_STRIDES(%(out)s)[0],
+CudaNdarray_HOST_STRIDES(%(out)s)[1],
+CudaNdarray_HOST_STRIDES(%(out)s)[2],
+CudaNdarray_HOST_STRIDES(%(out)s)[3]
+);
+if (err%(name)s != CUDNN_STATUS_SUCCESS) {
+  PyErr_SetString(PyExc_RuntimeError, "could not set out descriptor");
+  %(fail)s
+}
+err%(name)s = cudnnConvolutionForward(
+dnn_handle,
+input%(id)d, CudaNdarray_DEV_DATA(%(img)s),
+kerns%(id)d, CudaNdarray_DEV_DATA(%(kerns)s),
+op%(id)d,
+out%(id)d, CudaNdarray_DEV_DATA(%(out)s),
+CUDNN_RESULT_NO_ACCUMULATE
+);
+if (err%(name)s != CUDNN_STATUS_SUCCESS) {
+  PyErr_SetString(PyExc_RuntimeError, "error doing operation");
+  %(fail)s
+}
+""" % dict(img=img, kerns=kerns, out=out, bmode=bmode,
+           fail=sub['fail'], id=sub['struct_id'], name=name)
+
+from theano.sandbox.cuda.opt import local_optimizer, gpu_contiguous, register_opt
+
+@register_opt()
+@local_optimizer([GpuConv])
+def local_conv_dnn(node):
+    if (isinstance(node.op, GpuConv) and
+        node.op.border_mode in ['full', 'valid']):
+        if node.op.subsample != (1, 1):
+            return
+        img, kern = node.inputs
+        border_mode = node.op.border_mode
+        return [GpuDnnConv(border_mode)(gpu_contiguous(img),
+                                        gpu_contiguous(kern))]