提交 e9425be8 authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #4000 from nouiz/cudnn_version

Cudnn version, print it, warn when too new, make mandatory in the new back-end.
...@@ -278,8 +278,8 @@ class GpuOp(theano.gof.Op): ...@@ -278,8 +278,8 @@ class GpuOp(theano.gof.Op):
""" """
def make_thunk(self, node, storage_map, compute_map, no_recycling): def make_thunk(self, node, storage_map, compute_map, no_recycling):
if theano.sandbox.cuda.use.device_number is None: if use.device_number is None:
theano.sandbox.cuda.use("gpu", use("gpu",
force=True, force=True,
default_to_move_computation_to_gpu=False, default_to_move_computation_to_gpu=False,
move_shared_float32_to_gpu=False, move_shared_float32_to_gpu=False,
...@@ -299,6 +299,146 @@ from theano.sandbox.cuda.var import (CudaNdarrayVariable, ...@@ -299,6 +299,146 @@ from theano.sandbox.cuda.var import (CudaNdarrayVariable,
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
def dnn_available():
    """Return True if cuDNN can be compiled and used on the active device.

    The verdict is cached on the function object: ``dnn_available.avail``
    holds the boolean result (``None`` until the first call) and
    ``dnn_available.msg`` a human-readable reason when cuDNN is unusable.

    Raises
    ------
    RuntimeError
        If the cuDNN header and runtime library versions differ, if the
        installed cuDNN is older than v3 final (3007), or if
        ``config.dnn.enabled == "True"`` but cuDNN cannot be used.
    """
    if config.dnn.enabled == "False":
        # User explicitly disabled cuDNN; record the negative verdict so
        # the checks below are skipped and False is returned.
        dnn_available.avail = False
        dnn_available.msg = "disabled by dnn.enabled flag"
    if dnn_available.avail is None and not cuda_available:
        dnn_available.msg = "CUDA not available"
        dnn_available.avail = False
    elif dnn_available.avail is None:
        # First call: probe the active device, then try to compile a
        # minimal cuDNN program with nvcc.
        dev = active_device_number()
        if device_properties(dev)['major'] < 3:
            # cuDNN requires compute capability >= 3.0.
            dnn_available.msg = "Device not supported by cuDNN"
            dnn_available.avail = False
        else:
            preambule = """
#include <stdio.h>
#include <cuda.h>
#include <cudnn.h>
#include <cudnn_helper.h>
"""
            body = """
cudnnHandle_t _handle = NULL;
cudnnStatus_t err;
if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
  fprintf(stderr, "could not create cuDNN handle: %s",
          cudnnGetErrorString(err));
  return 1;
}
"""
            params = ["-l", "cudnn", "-I" + os.path.dirname(__file__)]
            if config.dnn.include_path:
                params.append("-I" + config.dnn.include_path)
            if config.dnn.library_path:
                params.append("-L" + config.dnn.library_path)
            if config.nvcc.compiler_bindir:
                params.extend(['--compiler-bindir',
                               config.nvcc.compiler_bindir])
            # Do not run here the test program. It would run on the
            # default gpu, not the one selected by the user. If mixed
            # GPU are installed or if the GPUs are configured in
            # exclusive mode, this cause bad detection.
            comp, out, err = nvcc_compiler.NVCC_compiler.try_flags(
                flag_list=params, preambule=preambule, body=body,
                try_run=False, output=True)

            dnn_available.avail = comp
            if not dnn_available.avail:
                dnn_available.msg = (
                    "Theano can not compile with cuDNN. We got this error:\n" +
                    str(err))
            else:
                # If we can compile, check that we can import and run.
                v = dnn_version()
                if isinstance(v, tuple) and v[0] != v[1]:
                    # Header and runtime library come from different
                    # installations — refuse to use such a setup.
                    dnn_available.avail = False
                    dnn_available.msg = ("Mixed dnn version. The header is"
                                         " from one version, but we link with"
                                         " a different version %s" % str(v))
                    raise RuntimeError(dnn_available.msg)
                # NOTE: `v == -1` is checked first, so the `v[0]` access
                # only happens when v is a tuple.
                if v == -1 or v[0] < 3007:
                    # 3007 is the final release of cudnn v3
                    dnn_available.avail = False
                    dnn_available.msg = (
                        "You have an old release of CuDNN (or a release "
                        "candidate) that isn't supported. Please update to "
                        "at least v3 final version.")
                    raise RuntimeError(dnn_available.msg)
    if config.dnn.enabled == "True":
        # Mandatory mode: the user demanded cuDNN, so failure is fatal.
        if not dnn_available.avail:
            raise RuntimeError(
                "You enabled CuDNN, but we aren't able to use it: %s" %
                dnn_available.msg)
    return dnn_available.avail


# Cache attributes: None means "not determined yet".
dnn_available.avail = None
dnn_available.msg = None
class DnnVersion(GpuOp):
    """Op returning the cuDNN version Theano is compiled and linked with.

    The single output is a generic Python object: a tuple
    ``(header_version, runtime_version)`` when the ``CUDNN_VERSION`` macro
    is defined, or ``-1`` for old cuDNN releases that carry no version
    information.
    """
    def c_compiler(self):
        # Must be compiled with nvcc since it links against cuDNN.
        return nvcc_compiler.NVCC_compiler

    def c_headers(self):
        return ['cudnn.h']

    def c_libraries(self):
        return ['cudnn']

    def c_support_code(self):
        # Python 3 removed the PyInt_* API; alias it to PyLong_* so the
        # same generated C code works on both major versions.
        return """
#if PY_MAJOR_VERSION >= 3
#define PyInt_FromLong PyLong_FromLong
#endif
"""

    def make_node(self):
        # No inputs; the output is an opaque Python object (tuple or int).
        return theano.gof.Apply(self, [], [theano.gof.Generic()()])

    def c_code(self, node, name, inputs, outputs, sub):
        o = outputs[0]
        return """
#if defined(CUDNN_VERSION)
        %(o)s = PyTuple_Pack(2, PyInt_FromLong(CUDNN_VERSION), PyInt_FromLong(cudnnGetVersion()));
#else
        %(o)s = PyInt_FromLong(-1);
#endif
        """ % locals()

    def do_constant_folding(self, node):
        # Needed as we do not want to cache this information.
        return False

    def c_code_cache_version(self):
        # Not needed, but make it clear that we do not want to cache this.
        return None
def dnn_version():
    """Return the current cuDNN version we compile with.

    The result is a tuple ``(header_version, runtime_version)``, or -1 for
    old cuDNN releases that expose no version information.  The value is
    computed once and memoized on ``dnn_version.v``.

    Raises an Exception when cuDNN is not available at all.
    """
    if not dnn_available():
        raise Exception(
            "We can't determine the cudnn version as it is not available",
            dnn_available.msg)
    if dnn_version.v is not None:
        # Already probed in a previous call.
        return dnn_version.v
    # Compile and run a tiny function wrapping the DnnVersion op; disable
    # the optimizer and profiling since there is nothing to optimize.
    probe = theano.function([], DnnVersion()(),
                            theano.Mode(optimizer=None),
                            profile=False)
    dnn_version.v = probe()
    return dnn_version.v


# Memoized version value; None means "not probed yet".
dnn_version.v = None
if cuda_available: if cuda_available:
# check if their is an old cuda_ndarray that was loading instead of the one # check if their is an old cuda_ndarray that was loading instead of the one
# we compiled! # we compiled!
...@@ -451,9 +591,36 @@ def use(device, ...@@ -451,9 +591,36 @@ def use(device,
" this property") " this property")
if config.print_active_device: if config.print_active_device:
cnmem_enabled = "enabled" if config.lib.cnmem else "disabled" if config.lib.cnmem:
print("Using gpu device %d: %s (CNMeM is %s)" % ( if config.lib.cnmem > 1:
active_device_number(), active_device_name(), cnmem_enabled), file=sys.stderr) cnmem_enabled = "enabled with initial size: %d MB" % config.lib.cnmem
else:
cnmem = min(config.lib.cnmem, 0.98)
cnmem_enabled = "enabled with initial size: %.2f%% of memory" % cnmem
else:
cnmem_enabled = "disabled"
cudnn_version = "not available"
warn = None
try:
(hdr_v, runtime_v) = dnn_version()
cudnn_version = runtime_v
# 4100 should not print warning with cudnn 4 final.
if cudnn_version > 4100:
warn = ("Your CuDNN version is more recent then Theano."
" If you see problems, try updating Theano or"
" downgrading CuDNN to version 4.")
except Exception:
pass
print("Using gpu device %d: %s (CNMeM is %s, CuDNN %s)" % (
active_device_number(),
active_device_name(),
cnmem_enabled,
cudnn_version,),
file=sys.stderr)
if warn:
import warnings
warnings.warn(warn)
if device_properties(use.device_number)['regsPerBlock'] < 16384: if device_properties(use.device_number)['regsPerBlock'] < 16384:
# We will try to use too much register per bloc at many places # We will try to use too much register per bloc at many places
# when there is only 8k register per multi-processor. # when there is only 8k register per multi-processor.
......
...@@ -7,7 +7,7 @@ from theano import Apply, tensor, config, Variable ...@@ -7,7 +7,7 @@ from theano import Apply, tensor, config, Variable
from theano.scalar import as_scalar, constant, Log from theano.scalar import as_scalar, constant, Log
from theano.gradient import DisconnectedType, grad_not_implemented from theano.gradient import DisconnectedType, grad_not_implemented
from theano.gof import Optimizer, local_optimizer, COp from theano.gof import Optimizer, local_optimizer, COp
from theano.gof.type import CDataType, Generic from theano.gof.type import CDataType
from theano.compile import optdb from theano.compile import optdb
from theano.compile.ops import shape_i from theano.compile.ops import shape_i
from theano.tensor.nnet import LogSoftmax, SoftmaxGrad from theano.tensor.nnet import LogSoftmax, SoftmaxGrad
...@@ -16,7 +16,8 @@ from theano.tensor.signal.pool import ( ...@@ -16,7 +16,8 @@ from theano.tensor.signal.pool import (
Pool, MaxPoolGrad, AveragePoolGrad) Pool, MaxPoolGrad, AveragePoolGrad)
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import GpuOp from theano.sandbox.cuda import GpuOp, dnn_available
from theano.sandbox.cuda import dnn_version as version
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable, from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
host_from_gpu, host_from_gpu,
gpu_contiguous, HostFromGpu, gpu_contiguous, HostFromGpu,
...@@ -35,85 +36,6 @@ from theano.tensor.nnet.abstract_conv import (AbstractConv2d, ...@@ -35,85 +36,6 @@ from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
AbstractConv2d_gradInputs) AbstractConv2d_gradInputs)
def dnn_available():
    """Return True if cuDNN can be compiled and used on the active device.

    The verdict is cached on the function object: ``dnn_available.avail``
    holds the boolean result (``None`` until the first call) and
    ``dnn_available.msg`` a human-readable reason when cuDNN is unusable.

    Raises
    ------
    RuntimeError
        If the cuDNN header and runtime library versions differ, if the
        installed cuDNN is older than v3 final (3007), or if
        ``config.dnn.enabled == "True"`` but cuDNN cannot be used.
    """
    if config.dnn.enabled == "False":
        # User explicitly disabled cuDNN; record the negative verdict so
        # the checks below are skipped and False is returned.
        dnn_available.avail = False
        dnn_available.msg = "disabled by dnn.enabled flag"
    if dnn_available.avail is None and not theano.sandbox.cuda.cuda_available:
        dnn_available.msg = "CUDA not available"
        dnn_available.avail = False
    elif dnn_available.avail is None:
        # First call: probe the active device, then try to compile a
        # minimal cuDNN program with nvcc.
        dev = theano.sandbox.cuda.active_device_number()
        if theano.sandbox.cuda.device_properties(dev)['major'] < 3:
            # cuDNN requires compute capability >= 3.0.
            dnn_available.msg = "Device not supported by cuDNN"
            dnn_available.avail = False
        else:
            preambule = """
#include <stdio.h>
#include <cuda.h>
#include <cudnn.h>
#include <cudnn_helper.h>
"""
            body = """
cudnnHandle_t _handle = NULL;
cudnnStatus_t err;
if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
  fprintf(stderr, "could not create cuDNN handle: %s",
          cudnnGetErrorString(err));
  return 1;
}
"""
            params = ["-l", "cudnn", "-I" + os.path.dirname(__file__)]
            if config.dnn.include_path:
                params.append("-I" + config.dnn.include_path)
            if config.dnn.library_path:
                params.append("-L" + config.dnn.library_path)
            if config.nvcc.compiler_bindir:
                params.extend(['--compiler-bindir',
                               config.nvcc.compiler_bindir])
            # Do not run here the test program. It would run on the
            # default gpu, not the one selected by the user. If mixed
            # GPU are installed or if the GPUs are configured in
            # exclusive mode, this cause bad detection.
            comp, out, err = NVCC_compiler.try_flags(
                flag_list=params, preambule=preambule, body=body,
                try_run=False, output=True)

            dnn_available.avail = comp
            if not dnn_available.avail:
                dnn_available.msg = (
                    "Theano can not compile with cuDNN. We got this error:\n" +
                    str(err))
            else:
                # If we can compile, check that we can import and run.
                v = version()
                if isinstance(v, tuple) and v[0] != v[1]:
                    # Header and runtime library come from different
                    # installations — refuse to use such a setup.
                    dnn_available.avail = False
                    dnn_available.msg = ("Mixed dnn version. The header is"
                                         " from one version, but we link with"
                                         " a different version %s" % str(v))
                    raise RuntimeError(dnn_available.msg)
                # NOTE: `v == -1` is checked first, so the `v[0]` access
                # only happens when v is a tuple.
                if v == -1 or v[0] < 3007:
                    # 3007 is the final release of cudnn v3
                    dnn_available.avail = False
                    dnn_available.msg = (
                        "You have an old release of CuDNN (or a release "
                        "candidate) that isn't supported. Please update to "
                        "at least v3 final version.")
                    raise RuntimeError(dnn_available.msg)
    if config.dnn.enabled == "True":
        # Mandatory mode: the user demanded cuDNN, so failure is fatal.
        if not dnn_available.avail:
            raise RuntimeError(
                "You enabled CuDNN, but we aren't able to use it: %s" %
                dnn_available.msg)
    return dnn_available.avail


# Cache attributes: None means "not determined yet".
dnn_available.avail = None
dnn_available.msg = None
def c_set_tensor4d(var, desc, err, fail): def c_set_tensor4d(var, desc, err, fail):
return """ return """
{ {
...@@ -170,67 +92,6 @@ class DnnBase(GpuOp, COp): ...@@ -170,67 +92,6 @@ class DnnBase(GpuOp, COp):
return ['cudnn'] return ['cudnn']
class DnnVersion(GpuOp):
    """Op returning the cuDNN version Theano is compiled and linked with.

    The single output is a generic Python object: a tuple
    ``(header_version, runtime_version)`` when the ``CUDNN_VERSION`` macro
    is defined, or ``-1`` for old cuDNN releases that carry no version
    information.
    """
    def c_compiler(self):
        # Must be compiled with nvcc since it links against cuDNN.
        return NVCC_compiler

    def c_headers(self):
        return ['cudnn.h']

    def c_libraries(self):
        return ['cudnn']

    def c_support_code(self):
        # Python 3 removed the PyInt_* API; alias it to PyLong_* so the
        # same generated C code works on both major versions.
        return """
#if PY_MAJOR_VERSION >= 3
#define PyInt_FromLong PyLong_FromLong
#endif
"""

    def make_node(self):
        # No inputs; the output is an opaque Python object (tuple or int).
        return Apply(self, [], [Generic()()])

    def c_code(self, node, name, inputs, outputs, sub):
        o = outputs[0]
        return """
#if defined(CUDNN_VERSION)
        %(o)s = PyTuple_Pack(2, PyInt_FromLong(CUDNN_VERSION), PyInt_FromLong(cudnnGetVersion()));
#else
        %(o)s = PyInt_FromLong(-1);
#endif
        """ % locals()

    def do_constant_folding(self, node):
        # Needed as we do not want to cache this information.
        return False

    def c_code_cache_version(self):
        # Not needed, but make it clear that we do not want to cache this.
        return None
def version():
    """Return the current cuDNN version we compile with.

    The result is a tuple ``(header_version, runtime_version)``, or -1 for
    old cuDNN releases that expose no version information.  The value is
    computed once and memoized on ``version.v``.

    Raises an Exception when cuDNN is not available at all.
    """
    if not dnn_available():
        raise Exception(
            "We can't determine the cudnn version as it is not available",
            dnn_available.msg)
    if version.v is not None:
        # Already probed in a previous call.
        return version.v
    # Compile and run a tiny function wrapping the DnnVersion op; disable
    # the optimizer and profiling since there is nothing to optimize.
    probe = theano.function([], DnnVersion()(),
                            theano.Mode(optimizer=None),
                            profile=False)
    version.v = probe()
    return version.v


# Memoized version value; None means "not probed yet".
version.v = None
class GpuDnnConvDesc(GpuOp): class GpuDnnConvDesc(GpuOp):
""" """
This Op builds a convolution descriptor for use in the other This Op builds a convolution descriptor for use in the other
......
from __future__ import print_function from __future__ import print_function
import sys
import logging import logging
import sys
import warnings
import theano import theano
from theano.configparser import config, AddConfigVar, BoolParam from theano.configparser import config, AddConfigVar, BoolParam
...@@ -64,8 +65,25 @@ def init_dev(dev, name=None): ...@@ -64,8 +65,25 @@ def init_dev(dev, name=None):
reg_context(name, context) reg_context(name, context)
pygpu_activated = True pygpu_activated = True
if config.print_active_device: if config.print_active_device:
print("Mapped name %s to device %s: %s" % (name, dev, context.devname), warn = None
cudnn_version = ""
if dev.startswith('cuda'):
cudnn_version = " (CuDNN not available)"
try:
cudnn_version = dnn.version()
# 4100 should not print warning with cudnn 4 final.
if cudnn_version > 4100:
warn = ("Your CuDNN version is more recent than Theano."
" If you see problems, try updating Theano or"
" downgrading CuDNN to version 4.")
cudnn_version = " (CuDNN version %s)" % cudnn_version
except Exception:
pass
print("Mapped name %s to device %s: %s%s" % (
name, dev, context.devname, cudnn_version),
file=sys.stderr) file=sys.stderr)
if warn:
warnings.warn(warn)
# This maps things like 'cuda0' to the context object on that device. # This maps things like 'cuda0' to the context object on that device.
init_dev.devmap = {} init_dev.devmap = {}
......
...@@ -32,6 +32,10 @@ from .opt import gpu_seqopt, register_opt, conv_groupopt, op_lifter ...@@ -32,6 +32,10 @@ from .opt import gpu_seqopt, register_opt, conv_groupopt, op_lifter
from .opt_util import alpha_merge, output_merge, inplace_allocempty from .opt_util import alpha_merge, output_merge, inplace_allocempty
def raise_no_cudnn(msg="CuDNN is required for convolution and pooling"):
    """Abort with a RuntimeError stating that cuDNN is needed.

    Used by graph optimizers to make a missing cuDNN installation a hard
    error instead of a silent fallback.

    :param msg: the error message to raise; defaults to a generic
        convolution/pooling requirement notice.
    :raises RuntimeError: always.
    """
    raise RuntimeError(msg)
def _dnn_check_compile(): def _dnn_check_compile():
preambule = """ preambule = """
#include <stdio.h> #include <stdio.h>
...@@ -211,16 +215,22 @@ class DnnVersion(Op): ...@@ -211,16 +215,22 @@ class DnnVersion(Op):
return None return None
def version(): def version(raises=True):
""" """
Return the current cuDNN version we link with. Return the current cuDNN version we link with.
This also does a check that the header version matches the runtime version. This also does a check that the header version matches the runtime version.
:raises: If True, raise an exception if CuDNN is not present or badly installed.
Otherwise, return -1.
""" """
if not dnn_present(): if not dnn_present():
if raises:
raise Exception( raise Exception(
"We can't determine the cudnn version as it is not available", "We can't determine the cudnn version as it is not available",
dnn_available.msg) dnn_available.msg)
else:
return -1
if version.v is None: if version.v is None:
f = theano.function([], DnnVersion()(), f = theano.function([], DnnVersion()(),
...@@ -1200,7 +1210,7 @@ class GpuDnnSoftmaxBase(DnnBase): ...@@ -1200,7 +1210,7 @@ class GpuDnnSoftmaxBase(DnnBase):
DnnBase.__init__(self, [self.file], self.c_func) DnnBase.__init__(self, [self.file], self.c_func)
assert(algo in ('fast', 'accurate', 'log')) assert(algo in ('fast', 'accurate', 'log'))
if algo == 'log' and version() < 3000: if algo == 'log' and version(raises=False) < 3000:
raise RuntimeError("Need CuDNN v3 for log-softmax") raise RuntimeError("Need CuDNN v3 for log-softmax")
self.algo = algo self.algo = algo
...@@ -1302,10 +1312,12 @@ def local_abstractconv_cudnn(node): ...@@ -1302,10 +1312,12 @@ def local_abstractconv_cudnn(node):
inp1 = node.inputs[0] inp1 = node.inputs[0]
inp2 = node.inputs[1] inp2 = node.inputs[1]
if (not isinstance(inp1.type, GpuArrayType) or if not isinstance(inp1.type, GpuArrayType):
not dnn_available(inp1.type.context_name)):
return None return None
if not dnn_available(inp1.type.context_name):
raise_no_cudnn()
if node.op.filter_flip: if node.op.filter_flip:
conv_mode = 'conv' conv_mode = 'conv'
else: else:
...@@ -1404,7 +1416,7 @@ def local_dnn_convi_output_merge(node, *inputs): ...@@ -1404,7 +1416,7 @@ def local_dnn_convi_output_merge(node, *inputs):
@op_lifter([Pool]) @op_lifter([Pool])
def local_pool_dnn_alternative(node, ctx_name): def local_pool_dnn_alternative(node, ctx_name):
if not dnn_available(ctx_name): if not dnn_available(ctx_name):
return raise_no_cudnn()
if not node.op.ignore_border: if not node.op.ignore_border:
return return
img, = node.inputs img, = node.inputs
...@@ -1420,7 +1432,7 @@ def local_pool_dnn_alternative(node, ctx_name): ...@@ -1420,7 +1432,7 @@ def local_pool_dnn_alternative(node, ctx_name):
@op_lifter([MaxPoolGrad]) @op_lifter([MaxPoolGrad])
def local_pool_dnn_grad_stride(node, ctx_name): def local_pool_dnn_grad_stride(node, ctx_name):
if not dnn_available(ctx_name): if not dnn_available(ctx_name):
return raise_no_cudnn()
if not node.op.ignore_border: if not node.op.ignore_border:
return return
inp, out, out_grad = node.inputs inp, out, out_grad = node.inputs
...@@ -1443,7 +1455,7 @@ def local_pool_dnn_grad_stride(node, ctx_name): ...@@ -1443,7 +1455,7 @@ def local_pool_dnn_grad_stride(node, ctx_name):
@op_lifter([AveragePoolGrad]) @op_lifter([AveragePoolGrad])
def local_avg_pool_dnn_grad_stride(node, ctx_name): def local_avg_pool_dnn_grad_stride(node, ctx_name):
if not dnn_available(ctx_name): if not dnn_available(ctx_name):
return raise_no_cudnn()
if not node.op.ignore_border: if not node.op.ignore_border:
return return
inp, out_grad = node.inputs inp, out_grad = node.inputs
...@@ -1468,7 +1480,7 @@ def local_avg_pool_dnn_grad_stride(node, ctx_name): ...@@ -1468,7 +1480,7 @@ def local_avg_pool_dnn_grad_stride(node, ctx_name):
def local_softmax_dnn(node): def local_softmax_dnn(node):
if isinstance(node.op, GpuSoftmax): if isinstance(node.op, GpuSoftmax):
if not dnn_available(node.outputs[0].type.context_name): if not dnn_available(node.outputs[0].type.context_name):
return raise_no_cudnn()
ins = node.inputs[0].dimshuffle(0, 1, 'x', 'x') ins = node.inputs[0].dimshuffle(0, 1, 'x', 'x')
ins = gpu_contiguous(ins) ins = gpu_contiguous(ins)
out = GpuDnnSoftmax('accurate', 'channel')(ins) out = GpuDnnSoftmax('accurate', 'channel')(ins)
...@@ -1479,15 +1491,15 @@ def local_softmax_dnn(node): ...@@ -1479,15 +1491,15 @@ def local_softmax_dnn(node):
@register_opt('cudnn') @register_opt('cudnn')
@local_optimizer([GpuElemwise]) @local_optimizer([GpuElemwise])
def local_log_softmax_dnn(node): def local_log_softmax_dnn(node):
if version() < 3000:
# No log-softmax before cudnn v3
return
# This looks for GpuDnnSoftmax so we know that we have cudnn. # This looks for GpuDnnSoftmax so we know that we have cudnn.
if (isinstance(node.op, GpuElemwise) and if (isinstance(node.op, GpuElemwise) and
isinstance(node.op.scalar_op, Log) and isinstance(node.op.scalar_op, Log) and
node.inputs[0].owner and node.inputs[0].owner and
isinstance(node.inputs[0].owner.op, GpuDnnSoftmax) and isinstance(node.inputs[0].owner.op, GpuDnnSoftmax) and
len(node.inputs[0].clients) == 1): len(node.inputs[0].clients) == 1):
if version(raises=False) < 3000:
# No log-softmax before cudnn v3
raise_no_cudnn("Need CuDNN v3 for LogSoftmax")
softmax_node = node.inputs[0].owner softmax_node = node.inputs[0].owner
new_softmax = GpuDnnSoftmax('log', softmax_node.op.mode) new_softmax = GpuDnnSoftmax('log', softmax_node.op.mode)
return [new_softmax(softmax_node.inputs[0])] return [new_softmax(softmax_node.inputs[0])]
...@@ -1496,14 +1508,14 @@ def local_log_softmax_dnn(node): ...@@ -1496,14 +1508,14 @@ def local_log_softmax_dnn(node):
@register_opt('cudnn') @register_opt('cudnn')
@op_lifter([LogSoftmax]) @op_lifter([LogSoftmax])
def local_logsoftmax_to_dnn(node, ctx_name): def local_logsoftmax_to_dnn(node, ctx_name):
if not dnn_available(ctx_name) or version() < 3000:
# No log-softmax before cudnn v3
return
# Transform the input in the format expected by GpuDnnSoftmax # Transform the input in the format expected by GpuDnnSoftmax
inp = node.inputs[0] inp = node.inputs[0]
if inp.ndim != 2: if inp.ndim != 2:
return return
if not dnn_available(ctx_name) or version(raises=False) < 3000:
# No log-softmax before cudnn v3
raise_no_cudnn("Need CuDNN v3 for LogSoftmax")
inp = inp.dimshuffle(0, 1, 'x', 'x') inp = inp.dimshuffle(0, 1, 'x', 'x')
inp.tag.context_name = ctx_name inp.tag.context_name = ctx_name
...@@ -1534,7 +1546,7 @@ gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, 'cudnn') ...@@ -1534,7 +1546,7 @@ gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, 'cudnn')
@op_lifter([SoftmaxGrad]) @op_lifter([SoftmaxGrad])
def local_softmax_dnn_grad(node, ctx_name): def local_softmax_dnn_grad(node, ctx_name):
if not dnn_available(ctx_name): if not dnn_available(ctx_name):
return raise_no_cudnn("CuDNN needed for SoftmaxGrad")
ins = [] ins = []
for n in node.inputs: for n in node.inputs:
n = as_gpuarray_variable(n, ctx_name) n = as_gpuarray_variable(n, ctx_name)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论