Commit 4a8da2e8, authored by Arnaud Bergeron

Don't make the gpu backend depend on cudnn.

Add some tests for the new op.
Parent 6ad2afec
......@@ -33,10 +33,6 @@ AddConfigVar('cublas.lib',
"""Name of the cuda blas library for the linker.""",
StrParam('cublas'))
AddConfigVar('cudnn.lib',
"""Name of the cuda dnn library for the linker.""",
StrParam('cudnn'))
#is_nvcc_available called here to initialize global vars in
#nvcc_compiler module
nvcc_compiler.is_nvcc_available()
......@@ -160,7 +156,7 @@ if compile_cuda_ndarray and cuda_available:
code,
location=cuda_ndarray_loc,
include_dirs=[cuda_path],
libs=[config.cublas.lib, config.cudnn.lib],
libs=[config.cublas.lib],
preargs=['-O3'] + compiler.compile_args())
from cuda_ndarray.cuda_ndarray import *
except Exception, e:
......
......@@ -42,7 +42,6 @@
#endif
cublasHandle_t handle = NULL;
cudnnHandle_t dnn_handle = NULL;
/////////////////////////
// Alloc and Free
......@@ -3052,8 +3051,6 @@ CudaNdarray_ptr_int_size(PyObject* _unused, PyObject* args)
static int cublas_init();
static void cublas_shutdown();
static int cudnn_init();
static void cudnn_shutdown();
// Initialize the gpu.
// Takes one optional parameter, the device number.
// If provided, it sets that device to be the active device.
......@@ -3120,8 +3117,6 @@ CudaNdarray_gpu_init(PyObject* _unused, PyObject* args)
}
if (cublas_init() == -1)
return NULL;
if (cudnn_init() == -1)
return NULL;
}
Py_INCREF(Py_None);
......@@ -3152,7 +3147,6 @@ CudaNdarray_active_device_name(PyObject* _unused, PyObject* _unused_args) {
PyObject *
CudaNdarray_gpu_shutdown(PyObject* _unused, PyObject* _unused_args) {
// Don't handle errors here
cudnn_shutdown();
cublas_shutdown();
cudaThreadExit();
g_gpu_context_active = 0; // context has now been closed down
......@@ -3613,28 +3607,6 @@ cublas_shutdown()
handle = NULL;
}
static int
cudnn_init()
{
    /* Create the global cudnn handle used for all cudnn calls. */
    cudnnStatus_t err = cudnnCreate(&dnn_handle);
    if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError,
                     "Error initializing cudnn %d", err);
        return -1;
    }
    /* Bind the handle to the default (NULL) stream. */
    cudnnSetStream(dnn_handle, NULL);
    return 0;
}
static void
cudnn_shutdown()
{
    /* Destroy the global cudnn handle created by cudnn_init(). */
    if (dnn_handle != NULL)
        cudnnDestroy(dnn_handle);
    /* Bug fix: this previously nulled the cublas `handle` (copy-paste from
     * cublas_shutdown), leaving `dnn_handle` dangling — a later call would
     * double-destroy it — while clobbering the still-live cublas handle. */
    dnn_handle = NULL;
}
int
CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj)
{
......
......@@ -43,7 +43,6 @@
#include <cublas_v2.h>
#include <cudnn.h>
#ifdef _WIN32
#ifdef _CUDA_NDARRAY_C
......@@ -86,8 +85,6 @@ typedef float real;
/* Use this handle to make cublas calls */
extern DllExport cublasHandle_t handle;
/* and this for cudnn calls */
extern DllExport cudnnHandle_t dnn_handle;
/**
* Allocation and freeing of device memory should go through these functions so that the lib can track memory usage.
......
......@@ -29,8 +29,15 @@ class GpuDnnConv(GpuOp):
return Apply(self, [img, kern], [CudaNdarrayType(broadcastable)()])
def c_headers(self):
    """Header files the generated C code must include."""
    return ["cudnn.h"]
def c_libraries(self):
    """Libraries the generated C code must be linked against."""
    return ["cudnn"]
def c_support_code_struct(self, node, struct_id):
return """
cudnnHandle_t handle%(id)d;
cudnnTensor4dDescriptor_t input%(id)d;
cudnnTensor4dDescriptor_t output%(id)d;
cudnnFilterDescriptor_t kerns%(id)d;
......@@ -39,6 +46,10 @@ cudnnConvolutionDescriptor_t op%(id)d;
def c_init_code_struct(self, node, struct_id, sub):
return """
if (cudnnCreate(&handle%(id)d) != CUDNN_STATUS_SUCCESS) {
PyErr_SetString(PyExc_RuntimeError, "could not create cudnn handle");
%(fail)s
}
if (cudnnCreateTensor4dDescriptor(&input%(id)d) != CUDNN_STATUS_SUCCESS) {
PyErr_SetString(PyExc_MemoryError, "could not allocate tensor4d descriptor (inp)");
%(fail)s
......@@ -60,13 +71,10 @@ if (cudnnCreateConvolutionDescriptor(&op%(id)d) != CUDNN_STATUS_SUCCESS) {
def c_cleanup_code_struct(self, node, struct_id):
    """Return C code that releases the per-struct cudnn resources.

    Destroys the tensor/filter/convolution descriptors and the cudnn
    handle owned by this struct. ``struct_id`` is interpolated into the
    names so they are unique per apply node.
    """
    # NOTE(review): the handle is destroyed but not reset to NULL like the
    # descriptors above — presumably harmless at struct teardown; confirm.
    return """
cudnnDestroyTensor4dDescriptor(input%(id)d);
input%(id)d = NULL;
cudnnDestroyTensor4dDescriptor(output%(id)d);
output%(id)d = NULL;
cudnnDestroyFilterDescriptor(kerns%(id)d);
kerns%(id)d = NULL;
cudnnDestroyConvolutionDescriptor(op%(id)d);
op%(id)d = NULL;
cudnnDestroy(handle%(id)d);
""" % dict(id=struct_id)
def c_code(self, node, name, inputs, outputs, sub):
......@@ -162,7 +170,7 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
%(fail)s
}
err%(name)s = cudnnConvolutionForward(
dnn_handle,
handle%(id)d,
input%(id)d, CudaNdarray_DEV_DATA(%(img)s),
kerns%(id)d, CudaNdarray_DEV_DATA(%(kerns)s),
op%(id)d,
......@@ -176,9 +184,13 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
""" % dict(img=img, kerns=kern, out=out, bmode=bmode,
fail=sub['fail'], id=sub['struct_id'], name=name)
from theano.sandbox.cuda.opt import local_optimizer, gpu_contiguous, register_opt
def c_code_cache_version(self):
    """Version tuple for the C-code cache; bump it to force recompilation."""
    version = (0,)
    return version
from theano.sandbox.cuda.opt import (local_optimizer, gpu_contiguous,
gpu_optimizer)
@register_opt()
@local_optimizer([GpuConv])
def local_conv_dnn(node):
if isinstance(node.op, GpuConv):
......@@ -189,3 +201,5 @@ def local_conv_dnn(node):
border_mode = node.op.border_mode
return [GpuDnnConv(border_mode)(gpu_contiguous(img),
gpu_contiguous(kern))]
gpu_optimizer.register("conv_cudnn", local_conv_dnn, 'cudnn')
......@@ -26,6 +26,8 @@ from theano.sandbox import cuda
if cuda.cuda_available == False:
raise SkipTest('Optional package cuda disabled')
from theano.sandbox.cuda.dnn import GpuDnnConv
#needed as the gpu conv don't have a perform implementation.
if theano.config.mode == 'FAST_COMPILE':
theano_mode = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
......@@ -615,14 +617,13 @@ def test_valid_9_10():
print_=print_, ones=ones, rtol=1.1e-5)
def test_valid(conv_gemm=False):
def _test_valid(cls, mode=None, extra_shapes=[], version=[-1]):
seed_rng()
shapes = get_valid_shapes()
#shapes=shapes[400:426]
# I put -1 in case we forget to add version in the test to.
# I put -2 to test the reference version.
version = [-2, -1, 6]
verbose = 0
random = True
......@@ -631,28 +632,31 @@ def test_valid(conv_gemm=False):
if ones:
random = False
if conv_gemm:
# Test the GpuCorrMM version
mode = theano_mode.including("conv_gemm")
cls = cuda.blas.BaseGpuCorrMM
# dummy version; not used by GpuCorrMM so one version is enough
version = [-1]
# Add tests with strided inputs by still square images and filters.
shapes += get_shapes2(scales_img=(2, 2), img_stride=(2, 2))
shapes += get_shapes2(scales_kern=(2, 2), kern_stride=(2, 2))
else:
mode = theano_mode
cls = None
shapes += extra_shapes
exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5,
theano_mode=mode, cls=cls)
def test_valid():
    """Run valid-mode conv tests: reference (-2), default (-1) and version 6."""
    versions = [-2, -1, 6]
    _test_valid(None, version=versions)
def test_gemm_valid():
    """Run the valid-mode convolution suite against the GpuCorrMM (GEMM) op.

    Adds strided-input shapes since GpuCorrMM supports them.
    """
    # Removed the stale `test_valid(conv_gemm=True)` call: test_valid() no
    # longer accepts arguments (it would raise TypeError), and the explicit
    # _test_valid call below already exercises the GEMM path.
    extra_shapes = get_shapes2(scales_img=(2, 2), img_stride=(2, 2))
    extra_shapes += get_shapes2(scales_kern=(2, 2), kern_stride=(2, 2))
    _test_valid(cuda.blas.BaseGpuCorrMM,
                mode=theano_mode.including("conv_gemm"),
                extra_shapes=extra_shapes)
def test_dnn_valid():
    """Run the valid-mode convolution suite with the cudnn-based op enabled."""
    dnn_mode = theano_mode.including("cudnn")
    _test_valid(GpuDnnConv, mode=dnn_mode)
def test_full(conv_gemm=False):
def _test_full(cls, mode=None, version=[-1], extra_shapes=[]):
seed_rng()
shapes = get_basic_shapes()
shapes += get_shapes2()
......@@ -707,25 +711,26 @@ def test_full(conv_gemm=False):
]
# shapes=shapes[:277]
version = [-2, -1, 0, 1, 2, 3, 4, 5]
verbose = 0
random = True
if conv_gemm:
# Test the GpuCorrMM version
mode = theano_mode.including("conv_gemm")
cls = cuda.blas.BaseGpuCorrMM
# dummy version; not used by GpuCorrMM so one version is enough
version = [-1]
else:
mode = theano_mode
cls = None
shapes += extra_shapes
exec_conv(version, shapes, verbose, random, 'full',
theano_mode=mode, cls=cls)
def test_full():
    """Run full-mode conv tests across the reference, default and kernel versions 0-5."""
    versions = [-2, -1, 0, 1, 2, 3, 4, 5]
    _test_full(None, version=versions)
def test_gemm_full():
    """Run the full-mode convolution suite against the GpuCorrMM (GEMM) op."""
    # Removed the stale `test_full(conv_gemm=True)` call: test_full() no
    # longer accepts arguments (it would raise TypeError), and the explicit
    # _test_full call below already exercises the GEMM path.
    _test_full(cuda.blas.BaseGpuCorrMM,
               mode=theano_mode.including("conv_gemm"))
def test_dnn_full():
    """Run the full-mode convolution suite with the cudnn-based op enabled."""
    dnn_mode = theano_mode.including("cudnn")
    _test_full(GpuDnnConv, mode=dnn_mode)
def test_subsample(conv_gemm=False):
......
Markdown formatting is supported
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to post a comment