提交 e9425be8 authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #4000 from nouiz/cudnn_version

Cudnn version, print it, warn when too new, make mandatory in the new back-end.
...@@ -278,8 +278,8 @@ class GpuOp(theano.gof.Op): ...@@ -278,8 +278,8 @@ class GpuOp(theano.gof.Op):
""" """
def make_thunk(self, node, storage_map, compute_map, no_recycling): def make_thunk(self, node, storage_map, compute_map, no_recycling):
if theano.sandbox.cuda.use.device_number is None: if use.device_number is None:
theano.sandbox.cuda.use("gpu", use("gpu",
force=True, force=True,
default_to_move_computation_to_gpu=False, default_to_move_computation_to_gpu=False,
move_shared_float32_to_gpu=False, move_shared_float32_to_gpu=False,
...@@ -299,6 +299,146 @@ from theano.sandbox.cuda.var import (CudaNdarrayVariable, ...@@ -299,6 +299,146 @@ from theano.sandbox.cuda.var import (CudaNdarrayVariable,
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
def dnn_available():
    """Return True if cuDNN can be compiled and used on the active device.

    The verdict is cached on the function object: ``dnn_available.avail``
    holds the boolean result (``None`` until the first call) and
    ``dnn_available.msg`` a human-readable reason when cuDNN is unusable.

    Raises
    ------
    RuntimeError
        If the cuDNN header and runtime library versions differ, if the
        installed cuDNN is older than v3 final (3007), or if
        ``config.dnn.enabled == "True"`` but cuDNN cannot be used.
    """
    if config.dnn.enabled == "False":
        # User explicitly disabled cuDNN; record the negative verdict so
        # the checks below are skipped and False is returned.
        dnn_available.avail = False
        dnn_available.msg = "disabled by dnn.enabled flag"
    if dnn_available.avail is None and not cuda_available:
        dnn_available.msg = "CUDA not available"
        dnn_available.avail = False
    elif dnn_available.avail is None:
        # First call: probe the active device, then try to compile a
        # minimal cuDNN program with nvcc.
        dev = active_device_number()
        if device_properties(dev)['major'] < 3:
            # cuDNN requires compute capability >= 3.0.
            dnn_available.msg = "Device not supported by cuDNN"
            dnn_available.avail = False
        else:
            preambule = """
#include <stdio.h>
#include <cuda.h>
#include <cudnn.h>
#include <cudnn_helper.h>
"""
            body = """
cudnnHandle_t _handle = NULL;
cudnnStatus_t err;
if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
  fprintf(stderr, "could not create cuDNN handle: %s",
          cudnnGetErrorString(err));
  return 1;
}
"""
            params = ["-l", "cudnn", "-I" + os.path.dirname(__file__)]
            if config.dnn.include_path:
                params.append("-I" + config.dnn.include_path)
            if config.dnn.library_path:
                params.append("-L" + config.dnn.library_path)
            if config.nvcc.compiler_bindir:
                params.extend(['--compiler-bindir',
                               config.nvcc.compiler_bindir])
            # Do not run here the test program. It would run on the
            # default gpu, not the one selected by the user. If mixed
            # GPU are installed or if the GPUs are configured in
            # exclusive mode, this cause bad detection.
            comp, out, err = nvcc_compiler.NVCC_compiler.try_flags(
                flag_list=params, preambule=preambule, body=body,
                try_run=False, output=True)

            dnn_available.avail = comp
            if not dnn_available.avail:
                dnn_available.msg = (
                    "Theano can not compile with cuDNN. We got this error:\n" +
                    str(err))
            else:
                # If we can compile, check that we can import and run.
                v = dnn_version()
                if isinstance(v, tuple) and v[0] != v[1]:
                    # Header and runtime library come from different
                    # installations — refuse to use such a setup.
                    dnn_available.avail = False
                    dnn_available.msg = ("Mixed dnn version. The header is"
                                         " from one version, but we link with"
                                         " a different version %s" % str(v))
                    raise RuntimeError(dnn_available.msg)
                # NOTE: `v == -1` is checked first, so the `v[0]` access
                # only happens when v is a tuple.
                if v == -1 or v[0] < 3007:
                    # 3007 is the final release of cudnn v3
                    dnn_available.avail = False
                    dnn_available.msg = (
                        "You have an old release of CuDNN (or a release "
                        "candidate) that isn't supported. Please update to "
                        "at least v3 final version.")
                    raise RuntimeError(dnn_available.msg)
    if config.dnn.enabled == "True":
        # Mandatory mode: the user demanded cuDNN, so failure is fatal.
        if not dnn_available.avail:
            raise RuntimeError(
                "You enabled CuDNN, but we aren't able to use it: %s" %
                dnn_available.msg)
    return dnn_available.avail


# Cache attributes: None means "not determined yet".
dnn_available.avail = None
dnn_available.msg = None
class DnnVersion(GpuOp):
    """Op returning the cuDNN version Theano is compiled and linked with.

    The single output is a generic Python object: a tuple
    ``(header_version, runtime_version)`` when the ``CUDNN_VERSION`` macro
    is defined, or ``-1`` for old cuDNN releases that carry no version
    information.
    """
    def c_compiler(self):
        # Must be compiled with nvcc since it links against cuDNN.
        return nvcc_compiler.NVCC_compiler

    def c_headers(self):
        return ['cudnn.h']

    def c_libraries(self):
        return ['cudnn']

    def c_support_code(self):
        # Python 3 removed the PyInt_* API; alias it to PyLong_* so the
        # same generated C code works on both major versions.
        return """
#if PY_MAJOR_VERSION >= 3
#define PyInt_FromLong PyLong_FromLong
#endif
"""

    def make_node(self):
        # No inputs; the output is an opaque Python object (tuple or int).
        return theano.gof.Apply(self, [], [theano.gof.Generic()()])

    def c_code(self, node, name, inputs, outputs, sub):
        o = outputs[0]
        return """
#if defined(CUDNN_VERSION)
        %(o)s = PyTuple_Pack(2, PyInt_FromLong(CUDNN_VERSION), PyInt_FromLong(cudnnGetVersion()));
#else
        %(o)s = PyInt_FromLong(-1);
#endif
        """ % locals()

    def do_constant_folding(self, node):
        # Needed as we do not want to cache this information.
        return False

    def c_code_cache_version(self):
        # Not needed, but make it clear that we do not want to cache this.
        return None
def dnn_version():
    """Return the current cuDNN version we compile with.

    The result is a tuple ``(header_version, runtime_version)``, or -1 for
    old cuDNN releases that expose no version information.  The value is
    computed once and memoized on ``dnn_version.v``.

    Raises an Exception when cuDNN is not available at all.
    """
    if not dnn_available():
        raise Exception(
            "We can't determine the cudnn version as it is not available",
            dnn_available.msg)
    if dnn_version.v is not None:
        # Already probed in a previous call.
        return dnn_version.v
    # Compile and run a tiny function wrapping the DnnVersion op; disable
    # the optimizer and profiling since there is nothing to optimize.
    probe = theano.function([], DnnVersion()(),
                            theano.Mode(optimizer=None),
                            profile=False)
    dnn_version.v = probe()
    return dnn_version.v


# Memoized version value; None means "not probed yet".
dnn_version.v = None
if cuda_available: if cuda_available:
# check if their is an old cuda_ndarray that was loading instead of the one # check if their is an old cuda_ndarray that was loading instead of the one
# we compiled! # we compiled!
...@@ -451,9 +591,36 @@ def use(device, ...@@ -451,9 +591,36 @@ def use(device,
" this property") " this property")
if config.print_active_device: if config.print_active_device:
cnmem_enabled = "enabled" if config.lib.cnmem else "disabled" if config.lib.cnmem:
print("Using gpu device %d: %s (CNMeM is %s)" % ( if config.lib.cnmem > 1:
active_device_number(), active_device_name(), cnmem_enabled), file=sys.stderr) cnmem_enabled = "enabled with initial size: %d MB" % config.lib.cnmem
else:
cnmem = min(config.lib.cnmem, 0.98)
cnmem_enabled = "enabled with initial size: %.2f%% of memory" % cnmem
else:
cnmem_enabled = "disabled"
cudnn_version = "not available"
warn = None
try:
(hdr_v, runtime_v) = dnn_version()
cudnn_version = runtime_v
# 4100 should not print warning with cudnn 4 final.
if cudnn_version > 4100:
warn = ("Your CuDNN version is more recent then Theano."
" If you see problems, try updating Theano or"
" downgrading CuDNN to version 4.")
except Exception:
pass
print("Using gpu device %d: %s (CNMeM is %s, CuDNN %s)" % (
active_device_number(),
active_device_name(),
cnmem_enabled,
cudnn_version,),
file=sys.stderr)
if warn:
import warnings
warnings.warn(warn)
if device_properties(use.device_number)['regsPerBlock'] < 16384: if device_properties(use.device_number)['regsPerBlock'] < 16384:
# We will try to use too much register per bloc at many places # We will try to use too much register per bloc at many places
# when there is only 8k register per multi-processor. # when there is only 8k register per multi-processor.
......
...@@ -7,7 +7,7 @@ from theano import Apply, tensor, config, Variable ...@@ -7,7 +7,7 @@ from theano import Apply, tensor, config, Variable
from theano.scalar import as_scalar, constant, Log from theano.scalar import as_scalar, constant, Log
from theano.gradient import DisconnectedType, grad_not_implemented from theano.gradient import DisconnectedType, grad_not_implemented
from theano.gof import Optimizer, local_optimizer, COp from theano.gof import Optimizer, local_optimizer, COp
from theano.gof.type import CDataType, Generic from theano.gof.type import CDataType
from theano.compile import optdb from theano.compile import optdb
from theano.compile.ops import shape_i from theano.compile.ops import shape_i
from theano.tensor.nnet import LogSoftmax, SoftmaxGrad from theano.tensor.nnet import LogSoftmax, SoftmaxGrad
...@@ -16,7 +16,8 @@ from theano.tensor.signal.pool import ( ...@@ -16,7 +16,8 @@ from theano.tensor.signal.pool import (
Pool, MaxPoolGrad, AveragePoolGrad) Pool, MaxPoolGrad, AveragePoolGrad)
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import GpuOp from theano.sandbox.cuda import GpuOp, dnn_available
from theano.sandbox.cuda import dnn_version as version
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable, from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
host_from_gpu, host_from_gpu,
gpu_contiguous, HostFromGpu, gpu_contiguous, HostFromGpu,
...@@ -35,85 +36,6 @@ from theano.tensor.nnet.abstract_conv import (AbstractConv2d, ...@@ -35,85 +36,6 @@ from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
AbstractConv2d_gradInputs) AbstractConv2d_gradInputs)
def dnn_available():
    """Return True if cuDNN can be compiled and used on the active device.

    The verdict is cached on the function object: ``dnn_available.avail``
    holds the boolean result (``None`` until the first call) and
    ``dnn_available.msg`` a human-readable reason when cuDNN is unusable.

    Raises
    ------
    RuntimeError
        If the cuDNN header and runtime library versions differ, if the
        installed cuDNN is older than v3 final (3007), or if
        ``config.dnn.enabled == "True"`` but cuDNN cannot be used.
    """
    if config.dnn.enabled == "False":
        # User explicitly disabled cuDNN; record the negative verdict so
        # the checks below are skipped and False is returned.
        dnn_available.avail = False
        dnn_available.msg = "disabled by dnn.enabled flag"
    if dnn_available.avail is None and not theano.sandbox.cuda.cuda_available:
        dnn_available.msg = "CUDA not available"
        dnn_available.avail = False
    elif dnn_available.avail is None:
        # First call: probe the active device, then try to compile a
        # minimal cuDNN program with nvcc.
        dev = theano.sandbox.cuda.active_device_number()
        if theano.sandbox.cuda.device_properties(dev)['major'] < 3:
            # cuDNN requires compute capability >= 3.0.
            dnn_available.msg = "Device not supported by cuDNN"
            dnn_available.avail = False
        else:
            preambule = """
#include <stdio.h>
#include <cuda.h>
#include <cudnn.h>
#include <cudnn_helper.h>
"""
            body = """
cudnnHandle_t _handle = NULL;
cudnnStatus_t err;
if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
  fprintf(stderr, "could not create cuDNN handle: %s",
          cudnnGetErrorString(err));
  return 1;
}
"""
            params = ["-l", "cudnn", "-I" + os.path.dirname(__file__)]
            if config.dnn.include_path:
                params.append("-I" + config.dnn.include_path)
            if config.dnn.library_path:
                params.append("-L" + config.dnn.library_path)
            if config.nvcc.compiler_bindir:
                params.extend(['--compiler-bindir',
                               config.nvcc.compiler_bindir])
            # Do not run here the test program. It would run on the
            # default gpu, not the one selected by the user. If mixed
            # GPU are installed or if the GPUs are configured in
            # exclusive mode, this cause bad detection.
            comp, out, err = NVCC_compiler.try_flags(
                flag_list=params, preambule=preambule, body=body,
                try_run=False, output=True)

            dnn_available.avail = comp
            if not dnn_available.avail:
                dnn_available.msg = (
                    "Theano can not compile with cuDNN. We got this error:\n" +
                    str(err))
            else:
                # If we can compile, check that we can import and run.
                v = version()
                if isinstance(v, tuple) and v[0] != v[1]:
                    # Header and runtime library come from different
                    # installations — refuse to use such a setup.
                    dnn_available.avail = False
                    dnn_available.msg = ("Mixed dnn version. The header is"
                                         " from one version, but we link with"
                                         " a different version %s" % str(v))
                    raise RuntimeError(dnn_available.msg)
                # NOTE: `v == -1` is checked first, so the `v[0]` access
                # only happens when v is a tuple.
                if v == -1 or v[0] < 3007:
                    # 3007 is the final release of cudnn v3
                    dnn_available.avail = False
                    dnn_available.msg = (
                        "You have an old release of CuDNN (or a release "
                        "candidate) that isn't supported. Please update to "
                        "at least v3 final version.")
                    raise RuntimeError(dnn_available.msg)
    if config.dnn.enabled == "True":
        # Mandatory mode: the user demanded cuDNN, so failure is fatal.
        if not dnn_available.avail:
            raise RuntimeError(
                "You enabled CuDNN, but we aren't able to use it: %s" %
                dnn_available.msg)
    return dnn_available.avail


# Cache attributes: None means "not determined yet".
dnn_available.avail = None
dnn_available.msg = None
def c_set_tensor4d(var, desc, err, fail): def c_set_tensor4d(var, desc, err, fail):
return """ return """
{ {
...@@ -170,67 +92,6 @@ class DnnBase(GpuOp, COp): ...@@ -170,67 +92,6 @@ class DnnBase(GpuOp, COp):
return ['cudnn'] return ['cudnn']
class DnnVersion(GpuOp):
    """Op returning the cuDNN version Theano is compiled and linked with.

    The single output is a generic Python object: a tuple
    ``(header_version, runtime_version)`` when the ``CUDNN_VERSION`` macro
    is defined, or ``-1`` for old cuDNN releases that carry no version
    information.
    """
    def c_compiler(self):
        # Must be compiled with nvcc since it links against cuDNN.
        return NVCC_compiler

    def c_headers(self):
        return ['cudnn.h']

    def c_libraries(self):
        return ['cudnn']

    def c_support_code(self):
        # Python 3 removed the PyInt_* API; alias it to PyLong_* so the
        # same generated C code works on both major versions.
        return """
#if PY_MAJOR_VERSION >= 3
#define PyInt_FromLong PyLong_FromLong
#endif
"""

    def make_node(self):
        # No inputs; the output is an opaque Python object (tuple or int).
        return Apply(self, [], [Generic()()])

    def c_code(self, node, name, inputs, outputs, sub):
        o = outputs[0]
        return """
#if defined(CUDNN_VERSION)
        %(o)s = PyTuple_Pack(2, PyInt_FromLong(CUDNN_VERSION), PyInt_FromLong(cudnnGetVersion()));
#else
        %(o)s = PyInt_FromLong(-1);
#endif
        """ % locals()

    def do_constant_folding(self, node):
        # Needed as we do not want to cache this information.
        return False

    def c_code_cache_version(self):
        # Not needed, but make it clear that we do not want to cache this.
        return None
def version():
    """Return the current cuDNN version we compile with.

    The result is a tuple ``(header_version, runtime_version)``, or -1 for
    old cuDNN releases that expose no version information.  The value is
    computed once and memoized on ``version.v``.

    Raises an Exception when cuDNN is not available at all.
    """
    if not dnn_available():
        raise Exception(
            "We can't determine the cudnn version as it is not available",
            dnn_available.msg)
    if version.v is not None:
        # Already probed in a previous call.
        return version.v
    # Compile and run a tiny function wrapping the DnnVersion op; disable
    # the optimizer and profiling since there is nothing to optimize.
    probe = theano.function([], DnnVersion()(),
                            theano.Mode(optimizer=None),
                            profile=False)
    version.v = probe()
    return version.v


# Memoized version value; None means "not probed yet".
version.v = None
class GpuDnnConvDesc(GpuOp): class GpuDnnConvDesc(GpuOp):
""" """
This Op builds a convolution descriptor for use in the other This Op builds a convolution descriptor for use in the other
......
from __future__ import print_function from __future__ import print_function
import sys
import logging import logging
import sys
import warnings
import theano import theano
from theano.configparser import config, AddConfigVar, BoolParam from theano.configparser import config, AddConfigVar, BoolParam
...@@ -64,8 +65,25 @@ def init_dev(dev, name=None): ...@@ -64,8 +65,25 @@ def init_dev(dev, name=None):
reg_context(name, context) reg_context(name, context)
pygpu_activated = True pygpu_activated = True
if config.print_active_device: if config.print_active_device:
print("Mapped name %s to device %s: %s" % (name, dev, context.devname), warn = None
cudnn_version = ""
if dev.startswith('cuda'):
cudnn_version = " (CuDNN not available)"
try:
cudnn_version = dnn.version()
# 4100 should not print warning with cudnn 4 final.
if cudnn_version > 4100:
warn = ("Your CuDNN version is more recent than Theano."
" If you see problems, try updating Theano or"
" downgrading CuDNN to version 4.")
cudnn_version = " (CuDNN version %s)" % cudnn_version
except Exception:
pass
print("Mapped name %s to device %s: %s%s" % (
name, dev, context.devname, cudnn_version),
file=sys.stderr) file=sys.stderr)
if warn:
warnings.warn(warn)
# This maps things like 'cuda0' to the context object on that device. # This maps things like 'cuda0' to the context object on that device.
init_dev.devmap = {} init_dev.devmap = {}
......
...@@ -32,6 +32,10 @@ from .opt import gpu_seqopt, register_opt, conv_groupopt, op_lifter ...@@ -32,6 +32,10 @@ from .opt import gpu_seqopt, register_opt, conv_groupopt, op_lifter
from .opt_util import alpha_merge, output_merge, inplace_allocempty from .opt_util import alpha_merge, output_merge, inplace_allocempty
def raise_no_cudnn(msg="CuDNN is required for convolution and pooling"):
    """Abort with a RuntimeError stating that cuDNN is needed.

    Used by graph optimizers to make a missing cuDNN installation a hard
    error instead of a silent fallback.

    :param msg: the error message to raise; defaults to a generic
        convolution/pooling requirement notice.
    :raises RuntimeError: always.
    """
    raise RuntimeError(msg)
def _dnn_check_compile(): def _dnn_check_compile():
preambule = """ preambule = """
#include <stdio.h> #include <stdio.h>
...@@ -211,16 +215,22 @@ class DnnVersion(Op): ...@@ -211,16 +215,22 @@ class DnnVersion(Op):
return None return None
def version(): def version(raises=True):
""" """
Return the current cuDNN version we link with. Return the current cuDNN version we link with.
This also does a check that the header version matches the runtime version. This also does a check that the header version matches the runtime version.
:raises: If True, raise an exception if CuDNN is not present or badly installed.
Otherwise, return -1.
""" """
if not dnn_present(): if not dnn_present():
if raises:
raise Exception( raise Exception(
"We can't determine the cudnn version as it is not available", "We can't determine the cudnn version as it is not available",
dnn_available.msg) dnn_available.msg)
else:
return -1
if version.v is None: if version.v is None:
f = theano.function([], DnnVersion()(), f = theano.function([], DnnVersion()(),
...@@ -1200,7 +1210,7 @@ class GpuDnnSoftmaxBase(DnnBase): ...@@ -1200,7 +1210,7 @@ class GpuDnnSoftmaxBase(DnnBase):
DnnBase.__init__(self, [self.file], self.c_func) DnnBase.__init__(self, [self.file], self.c_func)
assert(algo in ('fast', 'accurate', 'log')) assert(algo in ('fast', 'accurate', 'log'))
if algo == 'log' and version() < 3000: if algo == 'log' and version(raises=False) < 3000:
raise RuntimeError("Need CuDNN v3 for log-softmax") raise RuntimeError("Need CuDNN v3 for log-softmax")
self.algo = algo self.algo = algo
...@@ -1302,10 +1312,12 @@ def local_abstractconv_cudnn(node): ...@@ -1302,10 +1312,12 @@ def local_abstractconv_cudnn(node):
inp1 = node.inputs[0] inp1 = node.inputs[0]
inp2 = node.inputs[1] inp2 = node.inputs[1]
if (not isinstance(inp1.type, GpuArrayType) or if not isinstance(inp1.type, GpuArrayType):
not dnn_available(inp1.type.context_name)):
return None return None
if not dnn_available(inp1.type.context_name):
raise_no_cudnn()
if node.op.filter_flip: if node.op.filter_flip:
conv_mode = 'conv' conv_mode = 'conv'
else: else:
...@@ -1404,7 +1416,7 @@ def local_dnn_convi_output_merge(node, *inputs): ...@@ -1404,7 +1416,7 @@ def local_dnn_convi_output_merge(node, *inputs):
@op_lifter([Pool]) @op_lifter([Pool])
def local_pool_dnn_alternative(node, ctx_name): def local_pool_dnn_alternative(node, ctx_name):
if not dnn_available(ctx_name): if not dnn_available(ctx_name):
return raise_no_cudnn()
if not node.op.ignore_border: if not node.op.ignore_border:
return return
img, = node.inputs img, = node.inputs
...@@ -1420,7 +1432,7 @@ def local_pool_dnn_alternative(node, ctx_name): ...@@ -1420,7 +1432,7 @@ def local_pool_dnn_alternative(node, ctx_name):
@op_lifter([MaxPoolGrad]) @op_lifter([MaxPoolGrad])
def local_pool_dnn_grad_stride(node, ctx_name): def local_pool_dnn_grad_stride(node, ctx_name):
if not dnn_available(ctx_name): if not dnn_available(ctx_name):
return raise_no_cudnn()
if not node.op.ignore_border: if not node.op.ignore_border:
return return
inp, out, out_grad = node.inputs inp, out, out_grad = node.inputs
...@@ -1443,7 +1455,7 @@ def local_pool_dnn_grad_stride(node, ctx_name): ...@@ -1443,7 +1455,7 @@ def local_pool_dnn_grad_stride(node, ctx_name):
@op_lifter([AveragePoolGrad]) @op_lifter([AveragePoolGrad])
def local_avg_pool_dnn_grad_stride(node, ctx_name): def local_avg_pool_dnn_grad_stride(node, ctx_name):
if not dnn_available(ctx_name): if not dnn_available(ctx_name):
return raise_no_cudnn()
if not node.op.ignore_border: if not node.op.ignore_border:
return return
inp, out_grad = node.inputs inp, out_grad = node.inputs
...@@ -1468,7 +1480,7 @@ def local_avg_pool_dnn_grad_stride(node, ctx_name): ...@@ -1468,7 +1480,7 @@ def local_avg_pool_dnn_grad_stride(node, ctx_name):
def local_softmax_dnn(node): def local_softmax_dnn(node):
if isinstance(node.op, GpuSoftmax): if isinstance(node.op, GpuSoftmax):
if not dnn_available(node.outputs[0].type.context_name): if not dnn_available(node.outputs[0].type.context_name):
return raise_no_cudnn()
ins = node.inputs[0].dimshuffle(0, 1, 'x', 'x') ins = node.inputs[0].dimshuffle(0, 1, 'x', 'x')
ins = gpu_contiguous(ins) ins = gpu_contiguous(ins)
out = GpuDnnSoftmax('accurate', 'channel')(ins) out = GpuDnnSoftmax('accurate', 'channel')(ins)
...@@ -1479,15 +1491,15 @@ def local_softmax_dnn(node): ...@@ -1479,15 +1491,15 @@ def local_softmax_dnn(node):
@register_opt('cudnn') @register_opt('cudnn')
@local_optimizer([GpuElemwise]) @local_optimizer([GpuElemwise])
def local_log_softmax_dnn(node): def local_log_softmax_dnn(node):
if version() < 3000:
# No log-softmax before cudnn v3
return
# This looks for GpuDnnSoftmax so we know that we have cudnn. # This looks for GpuDnnSoftmax so we know that we have cudnn.
if (isinstance(node.op, GpuElemwise) and if (isinstance(node.op, GpuElemwise) and
isinstance(node.op.scalar_op, Log) and isinstance(node.op.scalar_op, Log) and
node.inputs[0].owner and node.inputs[0].owner and
isinstance(node.inputs[0].owner.op, GpuDnnSoftmax) and isinstance(node.inputs[0].owner.op, GpuDnnSoftmax) and
len(node.inputs[0].clients) == 1): len(node.inputs[0].clients) == 1):
if version(raises=False) < 3000:
# No log-softmax before cudnn v3
raise_no_cudnn("Need CuDNN v3 for LogSoftmax")
softmax_node = node.inputs[0].owner softmax_node = node.inputs[0].owner
new_softmax = GpuDnnSoftmax('log', softmax_node.op.mode) new_softmax = GpuDnnSoftmax('log', softmax_node.op.mode)
return [new_softmax(softmax_node.inputs[0])] return [new_softmax(softmax_node.inputs[0])]
...@@ -1496,14 +1508,14 @@ def local_log_softmax_dnn(node): ...@@ -1496,14 +1508,14 @@ def local_log_softmax_dnn(node):
@register_opt('cudnn') @register_opt('cudnn')
@op_lifter([LogSoftmax]) @op_lifter([LogSoftmax])
def local_logsoftmax_to_dnn(node, ctx_name): def local_logsoftmax_to_dnn(node, ctx_name):
if not dnn_available(ctx_name) or version() < 3000:
# No log-softmax before cudnn v3
return
# Transform the input in the format expected by GpuDnnSoftmax # Transform the input in the format expected by GpuDnnSoftmax
inp = node.inputs[0] inp = node.inputs[0]
if inp.ndim != 2: if inp.ndim != 2:
return return
if not dnn_available(ctx_name) or version(raises=False) < 3000:
# No log-softmax before cudnn v3
raise_no_cudnn("Need CuDNN v3 for LogSoftmax")
inp = inp.dimshuffle(0, 1, 'x', 'x') inp = inp.dimshuffle(0, 1, 'x', 'x')
inp.tag.context_name = ctx_name inp.tag.context_name = ctx_name
...@@ -1534,7 +1546,7 @@ gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, 'cudnn') ...@@ -1534,7 +1546,7 @@ gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, 'cudnn')
@op_lifter([SoftmaxGrad]) @op_lifter([SoftmaxGrad])
def local_softmax_dnn_grad(node, ctx_name): def local_softmax_dnn_grad(node, ctx_name):
if not dnn_available(ctx_name): if not dnn_available(ctx_name):
return raise_no_cudnn("CuDNN needed for SoftmaxGrad")
ins = [] ins = []
for n in node.inputs: for n in node.inputs:
n = as_gpuarray_variable(n, ctx_name) n = as_gpuarray_variable(n, ctx_name)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论