提交 c6ffa460 authored 作者: Pascal Lamblin's avatar Pascal Lamblin 提交者: GitHub

Merge pull request #5357 from nouiz/abergeron-dnn_mem

Select the dnn convolution algorithm using actually available memory
...@@ -8,7 +8,7 @@ set -x ...@@ -8,7 +8,7 @@ set -x
# Anaconda python # Anaconda python
export PATH=/usr/local/miniconda2/bin:$PATH export PATH=/usr/local/miniconda2/bin:$PATH
# CUDA # CUDA
export PATH=/usr/local/cuda/bin:$PATH export PATH=/usr/local/cuda/bin:$PATH
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
export LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH export LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH
...@@ -38,13 +38,13 @@ echo "===== Testing gpuarray backend" ...@@ -38,13 +38,13 @@ echo "===== Testing gpuarray backend"
GPUARRAY_CONFIG="Release" GPUARRAY_CONFIG="Release"
DEVICE=cuda0 DEVICE=cuda0
LIBDIR=~/tmp/local LIBDIR=${WORKSPACE}/local
# Make fresh clones of libgpuarray (with no history since we don't need it) # Make fresh clones of libgpuarray (with no history since we don't need it)
rm -rf libgpuarray rm -rf libgpuarray
git clone --depth 1 "https://github.com/Theano/libgpuarray.git" git clone --depth 1 "https://github.com/Theano/libgpuarray.git"
# Clean up previous installs (to make sure no old files are left) # Clean up previous installs (to make sure no old files are left)
rm -rf $LIBDIR rm -rf $LIBDIR
mkdir $LIBDIR mkdir $LIBDIR
...@@ -52,25 +52,25 @@ mkdir $LIBDIR ...@@ -52,25 +52,25 @@ mkdir $LIBDIR
mkdir libgpuarray/build mkdir libgpuarray/build
(cd libgpuarray/build && cmake .. -DCMAKE_BUILD_TYPE=${GPUARRAY_CONFIG} -DCMAKE_INSTALL_PREFIX=$LIBDIR && make) (cd libgpuarray/build && cmake .. -DCMAKE_BUILD_TYPE=${GPUARRAY_CONFIG} -DCMAKE_INSTALL_PREFIX=$LIBDIR && make)
# Finally install # Finally install
(cd libgpuarray/build && make install) (cd libgpuarray/build && make install)
# Export paths # Export paths
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$LIBDIR/lib64/
export LIBRARY_PATH=$LIBRARY_PATH:$LIBDIR/lib64/
export CPATH=$CPATH:$LIBDIR/include export CPATH=$CPATH:$LIBDIR/include
export LIBRARY_PATH=$LIBRARY_PATH:$LIBDIR/lib export LIBRARY_PATH=$LIBRARY_PATH:$LIBDIR/lib
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$LIBDIR/lib export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$LIBDIR/lib
# Build the pygpu modules # Build the pygpu modules
(cd libgpuarray && python setup.py build_ext --inplace -I$LIBDIR/include -L$LIBDIR/lib) (cd libgpuarray && python setup.py build_ext --inplace -I$LIBDIR/include -L$LIBDIR/lib)
ls $LIBDIR ls $LIBDIR
mkdir $LIBDIR/lib/python mkdir $LIBDIR/lib/python
export PYTHONPATH=${PYTHONPATH}:$LIBDIR/lib/python export PYTHONPATH=${PYTHONPATH}:$LIBDIR/lib/python
# Then install # Then install
(cd libgpuarray && python setup.py install --home=$LIBDIR) (cd libgpuarray && python setup.py install --home=$LIBDIR)
# Testing theano (the gpuarray parts) python -c 'import pygpu; print(pygpu.__file__)'
# Testing theano (the gpuarray parts)
THEANO_GPUARRAY_TESTS="theano/gpuarray/tests \ THEANO_GPUARRAY_TESTS="theano/gpuarray/tests \
theano/sandbox/tests/test_rng_mrg.py:test_consistency_GPUA_serial \ theano/sandbox/tests/test_rng_mrg.py:test_consistency_GPUA_serial \
theano/sandbox/tests/test_rng_mrg.py:test_consistency_GPUA_parallel \ theano/sandbox/tests/test_rng_mrg.py:test_consistency_GPUA_parallel \
......
...@@ -27,7 +27,7 @@ except ImportError: ...@@ -27,7 +27,7 @@ except ImportError:
# This is for documentation not to depend on the availability of pygpu # This is for documentation not to depend on the availability of pygpu
from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant, from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
GpuArraySharedVariable, gpuarray_shared_constructor, GpuArraySharedVariable, gpuarray_shared_constructor,
reg_context, get_context, ContextNotDefined, _get_props) reg_context, get_context, ContextNotDefined)
from .basic_ops import as_gpuarray_variable from .basic_ops import as_gpuarray_variable
from . import fft, dnn, opt, nerv, extra_ops, multinomial, reduction from . import fft, dnn, opt, nerv, extra_ops, multinomial, reduction
...@@ -46,63 +46,66 @@ def init_dev(dev, name=None): ...@@ -46,63 +46,66 @@ def init_dev(dev, name=None):
if not config.cxx: if not config.cxx:
raise RuntimeError("The new gpu-backend need a c++ compiler.") raise RuntimeError("The new gpu-backend need a c++ compiler.")
if (pygpu.version.major, pygpu.version.minor) < (0, 6): if (pygpu.version.major, pygpu.version.minor) < (0, 6):
raise ValueError("Your installed version of pygpu is too old, please upgrade to 0.6 or later") raise ValueError(
"Your installed version of pygpu is too old, please upgrade to 0.6 or later")
# This is for the C headers API
if pygpu.gpuarray.api_version()[0] < 0:
raise ValueError(
"Your installed libgpuarray is too old, please update")
if dev not in init_dev.devmap: if dev not in init_dev.devmap:
ctx = pygpu.init(dev, context = pygpu.init(
disable_alloc_cache=config.gpuarray.preallocate < 0, dev,
single_stream=config.gpuarray.single_stream, disable_alloc_cache=config.gpuarray.preallocate < 0,
sched=config.gpuarray.sched) single_stream=config.gpuarray.single_stream,
init_dev.devmap[dev] = ctx sched=config.gpuarray.sched)
context.dev = dev
init_dev.devmap[dev] = context
reg_context(name, context)
if dev.startswith('cuda'):
avail = dnn.dnn_available(name)
if avail:
context.cudnn_handle = dnn._make_handle(context)
if config.print_active_device:
if avail:
print("Using cuDNN version %d on context %s" % (dnn.version(), name),
file=sys.stderr)
else:
print("Can not use cuDNN on context %s: %s" % (name, dnn.dnn_available.msg),
file=sys.stderr)
if config.gpuarray.preallocate < 0: if config.gpuarray.preallocate < 0:
print("Disabling allocation cache on %s" % (dev,)) print("Disabling allocation cache on %s" % (dev,))
elif config.gpuarray.preallocate > 0: elif config.gpuarray.preallocate > 0:
MB = (1024 * 1024) MB = (1024 * 1024)
if config.gpuarray.preallocate <= 1: if config.gpuarray.preallocate <= 1:
gmem = min(config.gpuarray.preallocate, 0.95) * ctx.total_gmem gmem = min(config.gpuarray.preallocate, 0.95) * context.total_gmem
else: else:
gmem = config.gpuarray.preallocate * MB gmem = config.gpuarray.preallocate * MB
if gmem > context.free_gmem - 50 * MB:
print(
"WARNING: Preallocating too much memory can prevent cudnn and cublas from working properly")
# This will allocate and immediatly free an object of size gmem # This will allocate and immediatly free an object of size gmem
# which will reserve that amount of memory on the GPU. # which will reserve that amount of memory on the GPU.
pygpu.empty((gmem,), dtype='int8', context=ctx) pygpu.empty((gmem,), dtype='int8', context=context)
if config.print_active_device: if config.print_active_device:
print("Preallocating %d/%d Mb (%f) on %s" % print("Preallocating %d/%d Mb (%f) on %s" %
(gmem//MB, ctx.total_gmem//MB, gmem/ctx.total_gmem, dev), (gmem//MB, context.total_gmem//MB,
gmem/context.total_gmem, dev),
file=sys.stderr) file=sys.stderr)
context = init_dev.devmap[dev] else:
context = init_dev.devmap[dev]
# This will map the context name to the real context object. # This will map the context name to the real context object.
reg_context(name, context)
if config.print_active_device: if config.print_active_device:
try: try:
pcibusid = context.pcibusid pcibusid = '(' + context.pcibusid + ')'
except pygpu.gpuarray.UnsupportedException: except pygpu.gpuarray.UnsupportedException:
pcibusid = '(unsupported for device %s)' % dev pcibusid = ''
except Exception:
warnings.warn('Unable to get PCI Bus ID. Please consider updating libgpuarray and pygpu.')
pcibusid = 'unknown'
print("Mapped name %s to device %s: %s" % print("Mapped name %s to device %s: %s %s" %
(name, dev, context.devname), (name, dev, context.devname, pcibusid),
file=sys.stderr) file=sys.stderr)
print("PCI Bus ID:", pcibusid, file=sys.stderr)
pygpu_activated = True pygpu_activated = True
ctx_props = _get_props(name)
ctx_props['dev'] = dev
if dev.startswith('cuda'):
if 'cudnn_version' not in ctx_props:
try:
ctx_props['cudnn_version'] = dnn.version()
# 5200 should not print warning with cudnn 5.1 final.
if ctx_props['cudnn_version'] >= 5200:
warnings.warn("Your cuDNN version is more recent than "
"Theano. If you encounter problems, try "
"updating Theano or downgrading cuDNN to "
"version 5.1.")
if config.print_active_device:
print("Using cuDNN version %d on context %s" %
(ctx_props['cudnn_version'], name), file=sys.stderr)
ctx_props['cudnn_handle'] = dnn._make_handle(context)
except Exception:
pass
# This maps things like 'cuda0' to the context object on that device. # This maps things like 'cuda0' to the context object on that device.
init_dev.devmap = {} init_dev.devmap = {}
...@@ -119,7 +122,8 @@ if pygpu: ...@@ -119,7 +122,8 @@ if pygpu:
elif (config.init_gpu_device.startswith('cuda') or elif (config.init_gpu_device.startswith('cuda') or
config.init_gpu_device.startswith('opencl')): config.init_gpu_device.startswith('opencl')):
if config.device != 'cpu': if config.device != 'cpu':
raise ValueError('you must set device=cpu to use init_gpu_device.') raise ValueError(
'you must set device=cpu to use init_gpu_device.')
if config.contexts != '': if config.contexts != '':
print("Using contexts will make init_gpu_device act like device and move all computations by default, which might not be what you want.") print("Using contexts will make init_gpu_device act like device and move all computations by default, which might not be what you want.")
init_dev(config.init_gpu_device) init_dev(config.init_gpu_device)
...@@ -147,4 +151,5 @@ else: ...@@ -147,4 +151,5 @@ else:
config.device.startswith('opencl') or config.device.startswith('opencl') or
config.device.startswith('cuda') or config.device.startswith('cuda') or
config.contexts != ''): config.contexts != ''):
error("pygpu was configured but could not be imported or is too old (version 0.6 or higher required)", exc_info=True) error("pygpu was configured but could not be imported or is too old (version 0.6 or higher required)",
exc_info=True)
...@@ -30,7 +30,7 @@ from theano.tensor.signal.pool import ( ...@@ -30,7 +30,7 @@ from theano.tensor.signal.pool import (
Pool, MaxPoolGrad, AveragePoolGrad) Pool, MaxPoolGrad, AveragePoolGrad)
from . import pygpu from . import pygpu
from .type import (get_context, gpu_context_type, list_contexts, from .type import (get_context, gpu_context_type, list_contexts,
get_prop, set_prop, GpuArraySharedVariable) GpuArraySharedVariable)
from .basic_ops import (as_gpuarray_variable, infer_context_name, from .basic_ops import (as_gpuarray_variable, infer_context_name,
gpu_contiguous, gpu_alloc_empty, gpu_contiguous, gpu_alloc_empty,
empty_like, GpuArrayType) empty_like, GpuArrayType)
...@@ -59,12 +59,12 @@ def _dnn_lib(): ...@@ -59,12 +59,12 @@ def _dnn_lib():
lib_name = ctypes.util.find_library('cudnn') lib_name = ctypes.util.find_library('cudnn')
if lib_name is None and sys.platform == 'win32': if lib_name is None and sys.platform == 'win32':
# Update these names when new versions of cudnn are supported. # Update these names when new versions of cudnn are supported.
for name in ['cudnn64_5.dll', 'cudnn64_4.dll']: for name in ['cudnn64_5.dll']:
lib_name = ctypes.util.find_library(name) lib_name = ctypes.util.find_library(name)
if lib_name: if lib_name:
break break
if lib_name is None: if lib_name is None:
raise RuntimeError('Could not find cudnn library (looked for v4 and v5[.1])') raise RuntimeError('Could not find cudnn library (looked for v5[.1])')
_dnn_lib.handle = ctypes.cdll.LoadLibrary(lib_name) _dnn_lib.handle = ctypes.cdll.LoadLibrary(lib_name)
cudnn = _dnn_lib.handle cudnn = _dnn_lib.handle
cudnn.cudnnCreate.argtypes = [ctypes.POINTER(ctypes.c_void_p)] cudnn.cudnnCreate.argtypes = [ctypes.POINTER(ctypes.c_void_p)]
...@@ -109,10 +109,16 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) { ...@@ -109,10 +109,16 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
""" """
params = ["-l", "cudnn", "-I" + os.path.dirname(__file__)] params = ["-l", "cudnn", "-I" + os.path.dirname(__file__)]
path_wrapper = "\"" if os.name == 'nt' else ""
params = ["-l", "cudnn"]
params.extend(['-I%s%s%s' % (path_wrapper, os.path.dirname(__file__), path_wrapper)])
if config.dnn.include_path: if config.dnn.include_path:
params.append("-I" + config.dnn.include_path) params.extend(['-I%s%s%s' % (path_wrapper, config.dnn.include_path, path_wrapper)])
if config.dnn.library_path: if config.dnn.library_path:
params.append("-L" + config.dnn.library_path) params.extend(['-L%s%s%s' % (path_wrapper, config.dnn.library_path, path_wrapper)])
if config.nvcc.compiler_bindir:
params.extend(['--compiler-bindir',
'%s%s%s' % (path_wrapper, config.nvcc.compiler_bindir, path_wrapper)])
# Do not run here the test program. It would run on the # Do not run here the test program. It would run on the
# default gpu, not the one selected by the user. If mixed # default gpu, not the one selected by the user. If mixed
# GPU are installed or if the GPUs are configured in # GPU are installed or if the GPUs are configured in
...@@ -129,9 +135,14 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) { ...@@ -129,9 +135,14 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
def _dnn_check_version(): def _dnn_check_version():
v = version() v = version()
if v < 4007: if v < 5000:
return False, "cuDNN version is too old. Update to v5, was %d." % v return False, "cuDNN version is too old. Update to v5, was %d." % v
# 5200 should not print warning with cudnn 5.1 final.
if version >= 5200:
warnings.warn("Your cuDNN version is more recent than "
"Theano. If you encounter problems, try "
"updating Theano or downgrading cuDNN to "
"version 5.1.")
return True, None return True, None
...@@ -209,14 +220,13 @@ class DnnBase(COp): ...@@ -209,14 +220,13 @@ class DnnBase(COp):
return node.outputs[0].type.context_name return node.outputs[0].type.context_name
def get_params(self, node): def get_params(self, node):
try: ctx_name = self.dnn_context(node)
return get_prop(self.dnn_context(node), 'cudnn_handle_param') ctx = get_context(ctx_name)
except KeyError: if not hasattr(ctx, 'cudnn_handle_param'):
pass ptr = ctx.cudnn_handle.value
ptr = get_prop(self.dnn_context(node), 'cudnn_handle').value res = handle_type.make_value(ptr)
res = handle_type.make_value(ptr) ctx.cudnn_handle_param = res
set_prop(self.dnn_context(node), 'cudnn_handle_param', res) return ctx.cudnn_handle_param
return res
def __init__(self, files=None, c_func=None): def __init__(self, files=None, c_func=None):
if files is None: if files is None:
...@@ -301,7 +311,7 @@ def version(raises=True): ...@@ -301,7 +311,7 @@ def version(raises=True):
""" """
if not dnn_present(): if not dnn_present():
if raises: if raises:
raise Exception( raise RuntimeError(
"We can't determine the cudnn version as it is not available", "We can't determine the cudnn version as it is not available",
dnn_available.msg) dnn_available.msg)
else: else:
...@@ -500,10 +510,6 @@ class GpuDnnConv(DnnBase): ...@@ -500,10 +510,6 @@ class GpuDnnConv(DnnBase):
if self.inplace: if self.inplace:
self.destroy_map = {0: [2]} self.destroy_map = {0: [2]}
if version() < 5000 and self.algo == 'winograd':
raise RuntimeError("cuDNN winograd convolution requires "
"cuDNN v5 or more recent")
assert self.algo in ['none', 'small', 'large', 'fft', 'fft_tiling', assert self.algo in ['none', 'small', 'large', 'fft', 'fft_tiling',
'winograd', 'guess_once', 'guess_on_shape_change', 'winograd', 'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change'] 'time_once', 'time_on_shape_change']
...@@ -524,9 +530,9 @@ class GpuDnnConv(DnnBase): ...@@ -524,9 +530,9 @@ class GpuDnnConv(DnnBase):
defs.append(('CONV_INPLACE', '1')) defs.append(('CONV_INPLACE', '1'))
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM' alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM'
if self.algo == 'none': # 3d (at least in v4) if self.algo == 'none': # 3d
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM' alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM'
elif self.algo == 'small': # 3d (at least in v4) elif self.algo == 'small': # 3d
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM' alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM'
elif self.algo == 'large': elif self.algo == 'large':
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_GEMM' alg = 'CUDNN_CONVOLUTION_FWD_ALGO_GEMM'
...@@ -534,10 +540,9 @@ class GpuDnnConv(DnnBase): ...@@ -534,10 +540,9 @@ class GpuDnnConv(DnnBase):
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_DIRECT' alg = 'CUDNN_CONVOLUTION_FWD_ALGO_DIRECT'
elif self.algo == 'fft': elif self.algo == 'fft':
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_FFT' alg = 'CUDNN_CONVOLUTION_FWD_ALGO_FFT'
elif self.algo == 'fft_tiling': # 3d (not in v4, in v5) elif self.algo == 'fft_tiling': # 3d
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING' alg = 'CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING'
elif self.algo == 'winograd': elif self.algo == 'winograd':
# need v5
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD' alg = 'CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD'
defs.append(('CONV_ALGO', alg)) defs.append(('CONV_ALGO', alg))
...@@ -571,10 +576,6 @@ class GpuDnnConv(DnnBase): ...@@ -571,10 +576,6 @@ class GpuDnnConv(DnnBase):
if img.type.ndim == 5 and self.algo in ['large', 'fft']: if img.type.ndim == 5 and self.algo in ['large', 'fft']:
raise ValueError("convolution algo %s can't be used for " raise ValueError("convolution algo %s can't be used for "
"3d convolutions", (self.algo,)) "3d convolutions", (self.algo,))
if (img.type.ndim == 5 and
self.algo in ['fft_tiling'] and
version() < 5000):
raise ValueError("3d convolution algo fft_tiling need cudnn v5")
if (not isinstance(desc.type, CDataType) or if (not isinstance(desc.type, CDataType) or
desc.type.ctype != 'cudnnConvolutionDescriptor_t'): desc.type.ctype != 'cudnnConvolutionDescriptor_t'):
...@@ -700,13 +701,13 @@ class GpuDnnConvGradW(DnnBase): ...@@ -700,13 +701,13 @@ class GpuDnnConvGradW(DnnBase):
defs.append(('CONV_INPLACE', '1')) defs.append(('CONV_INPLACE', '1'))
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0' alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0'
if self.algo == 'none': # 3d in at least v4 if self.algo == 'none': # 3d
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0' alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0'
if self.algo == 'deterministic': if self.algo == 'deterministic':
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1' alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1'
if self.algo == 'fft': if self.algo == 'fft':
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT' alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT'
if self.algo == 'small': # 3d in at least v4 if self.algo == 'small': # 3d
# non-deterministic, small workspace # non-deterministic, small workspace
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3' alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3'
if self.algo in ['guess_once', 'guess_on_shape_change', if self.algo in ['guess_once', 'guess_on_shape_change',
...@@ -793,10 +794,6 @@ class GpuDnnConvGradI(DnnBase): ...@@ -793,10 +794,6 @@ class GpuDnnConvGradI(DnnBase):
algo = config.dnn.conv.algo_bwd_data algo = config.dnn.conv.algo_bwd_data
self.algo = algo self.algo = algo
if version() < 5000 and self.algo == 'winograd':
raise RuntimeError("cuDNN's winograd convolution requires cuDNN "
"v5 or more recent")
assert self.algo in ['none', 'deterministic', 'fft', 'fft_tiling', assert self.algo in ['none', 'deterministic', 'fft', 'fft_tiling',
'winograd', 'guess_once', 'guess_on_shape_change', 'winograd', 'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change'] 'time_once', 'time_on_shape_change']
...@@ -832,17 +829,16 @@ class GpuDnnConvGradI(DnnBase): ...@@ -832,17 +829,16 @@ class GpuDnnConvGradI(DnnBase):
defs.append(('CONV_INPLACE', '1')) defs.append(('CONV_INPLACE', '1'))
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_0' alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_0'
if self.algo == 'none': # 3d at least v4 if self.algo == 'none': # 3d
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_0' alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_0'
elif self.algo == 'deterministic': # 3d at least v4 elif self.algo == 'deterministic': # 3d
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_1' alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_1'
elif self.algo == 'fft': elif self.algo == 'fft':
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT' alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT'
elif self.algo == 'fft_tiling': # 3d not v4, since v5 elif self.algo == 'fft_tiling': # 3d
# big workspace but less than fft # big workspace but less than fft
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING' alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING'
elif self.algo == 'winograd': elif self.algo == 'winograd':
# need v5
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD' alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD'
if self.algo in ['guess_once', 'guess_on_shape_change', if self.algo in ['guess_once', 'guess_on_shape_change',
...@@ -877,10 +873,6 @@ class GpuDnnConvGradI(DnnBase): ...@@ -877,10 +873,6 @@ class GpuDnnConvGradI(DnnBase):
if kern.type.ndim == 5 and self.algo in ['fft']: if kern.type.ndim == 5 and self.algo in ['fft']:
raise ValueError("convolution algo %s can't be used for " raise ValueError("convolution algo %s can't be used for "
"3d convolutions", (self.algo,)) "3d convolutions", (self.algo,))
if (kern.type.ndim == 5 and
self.algo == 'fft_tiling' and
version() < 5000):
raise ValueError("3d convolution algo fft_tiling need cudnn v5")
if (not isinstance(desc.type, CDataType) or if (not isinstance(desc.type, CDataType) or
desc.type.ctype != 'cudnnConvolutionDescriptor_t'): desc.type.ctype != 'cudnnConvolutionDescriptor_t'):
...@@ -1316,11 +1308,7 @@ class GpuDnnPoolDesc(Op): ...@@ -1316,11 +1308,7 @@ class GpuDnnPoolDesc(Op):
static const int pad[%(nd)d] = {%(pad)s}; static const int pad[%(nd)d] = {%(pad)s};
static const int str[%(nd)d] = {%(str)s}; static const int str[%(nd)d] = {%(str)s};
#if CUDNN_VERSION >= 5000
err = cudnnSetPoolingNdDescriptor(%(desc)s, %(mode_flag)s, CUDNN_PROPAGATE_NAN, %(nd)d, win, pad, str); err = cudnnSetPoolingNdDescriptor(%(desc)s, %(mode_flag)s, CUDNN_PROPAGATE_NAN, %(nd)d, win, pad, str);
#else
err = cudnnSetPoolingNdDescriptor(%(desc)s, %(mode_flag)s, %(nd)d, win, pad, str);
#endif
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: %%s", PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: %%s",
...@@ -1664,8 +1652,6 @@ class GpuDnnBatchNorm(DnnBase): ...@@ -1664,8 +1652,6 @@ class GpuDnnBatchNorm(DnnBase):
DnnBase.__init__(self, ['dnn_batchnorm_base.c', 'dnn_batchnorm.c'], DnnBase.__init__(self, ['dnn_batchnorm_base.c', 'dnn_batchnorm.c'],
'dnn_batchnorm_op') 'dnn_batchnorm_op')
if version() < 5000:
raise RuntimeError("cuDNN Batch Normalization requires cuDNN v5 or later")
assert (mode in ('per-activation', 'spatial')) assert (mode in ('per-activation', 'spatial'))
self.mode = mode self.mode = mode
...@@ -1724,8 +1710,6 @@ class GpuDnnBatchNormInference(DnnBase): ...@@ -1724,8 +1710,6 @@ class GpuDnnBatchNormInference(DnnBase):
DnnBase.__init__(self, ['dnn_batchnorm_base.c', 'dnn_batchnorm_inf.c'], DnnBase.__init__(self, ['dnn_batchnorm_base.c', 'dnn_batchnorm_inf.c'],
'dnn_batchnorm_op') 'dnn_batchnorm_op')
if version() < 5000:
raise RuntimeError("cuDNN Batch Normalization requires cuDNN v5 or later")
assert (mode in ('per-activation', 'spatial')) assert (mode in ('per-activation', 'spatial'))
self.mode = mode self.mode = mode
...@@ -1788,8 +1772,6 @@ class GpuDnnBatchNormGrad(DnnBase): ...@@ -1788,8 +1772,6 @@ class GpuDnnBatchNormGrad(DnnBase):
DnnBase.__init__(self, ['dnn_batchnorm_base.c', 'dnn_batchnorm_grad.c'], DnnBase.__init__(self, ['dnn_batchnorm_base.c', 'dnn_batchnorm_grad.c'],
'dnn_batchnorm_grad') 'dnn_batchnorm_grad')
if version() < 5000:
raise RuntimeError("cuDNN Batch Normalization requires cuDNN v5 or later")
assert (mode in ('per-activation', 'spatial')) assert (mode in ('per-activation', 'spatial'))
self.mode = mode self.mode = mode
......
...@@ -115,11 +115,7 @@ c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) { ...@@ -115,11 +115,7 @@ c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) {
if (nd < 3) if (nd < 3)
nd = 3; nd = 3;
#if CUDNN_VERSION >= 5000
err = cudnnSetFilterNdDescriptor(desc, dt, CUDNN_TENSOR_NCHW, nd, dims); err = cudnnSetFilterNdDescriptor(desc, dt, CUDNN_TENSOR_NCHW, nd, dims);
#else
err = cudnnSetFilterNdDescriptor(desc, dt, nd, dims);
#endif
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
......
...@@ -98,12 +98,37 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -98,12 +98,37 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
#endif #endif
if (!reuse_algo) { if (!reuse_algo) {
size_t free;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU");
cuda_exit(c->ctx);
return 1;
}
// Guess 4Mb if the info is not available
if (free == 0) free = 4 * 1024 * 1024;
#ifdef CHOOSE_TIME #ifdef CHOOSE_TIME
int count; int count;
cudnnConvolutionFwdAlgoPerf_t choice; cudnnConvolutionFwdAlgoPerf_t choice;
err = cudnnFindConvolutionForwardAlgorithm( gpudata *tmpmem;
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
desc, APPLY_SPECIFIC(output), 1, &count, &choice); tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
return -1;
}
// We don't sync the buffer as we don't care about the values.
err = cudnnFindConvolutionForwardAlgorithmEx(
_handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output),
1, &count, &choice, *(void **)tmpmem,
free);
gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
...@@ -114,16 +139,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -114,16 +139,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
} }
algo = choice.algo; algo = choice.algo;
#else #else
size_t free;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_FREE_GMEM, &free);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU");
cuda_exit(c->ctx);
return 1;
}
err = cudnnGetConvolutionForwardAlgorithm( err = cudnnGetConvolutionForwardAlgorithm(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns), _handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
desc, APPLY_SPECIFIC(output), desc, APPLY_SPECIFIC(output),
......
...@@ -140,13 +140,34 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -140,13 +140,34 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
#endif #endif
if (!reuse_algo) { if (!reuse_algo) {
size_t free;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU");
cuda_exit(c->ctx);
return 1;
}
// Guess 4Mb if the info is not available
if (free == 0) free = 4 * 1024 * 1024;
#ifdef CHOOSE_TIME #ifdef CHOOSE_TIME
int count; int count;
cudnnConvolutionBwdDataAlgoPerf_t choice; cudnnConvolutionBwdDataAlgoPerf_t choice;
gpudata *tmpmem;
err = cudnnFindConvolutionBackwardDataAlgorithm( tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
return -1;
}
err = cudnnFindConvolutionBackwardDataAlgorithmEx(
_handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc, _handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(input), 1, &count, &choice); APPLY_SPECIFIC(input), 1, &count, &choice, *(void **)tmpmem, free);
gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
...@@ -157,16 +178,6 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -157,16 +178,6 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
algo = choice.algo; algo = choice.algo;
#else #else
size_t free;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_FREE_GMEM, &free);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU");
cuda_exit(c->ctx);
return 1;
}
err = cudnnGetConvolutionBackwardDataAlgorithm( err = cudnnGetConvolutionBackwardDataAlgorithm(
_handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), _handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(input), desc, APPLY_SPECIFIC(input),
......
...@@ -140,13 +140,34 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -140,13 +140,34 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
#endif #endif
if (!reuse_algo) { if (!reuse_algo) {
size_t free;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU");
cuda_exit(c->ctx);
return 1;
}
// Guess 4Mb if the info is not available
if (free == 0) free = 4 * 1024 * 1024;
#ifdef CHOOSE_TIME #ifdef CHOOSE_TIME
int count; int count;
cudnnConvolutionBwdFilterAlgoPerf_t choice; cudnnConvolutionBwdFilterAlgoPerf_t choice;
gpudata *tmpmem;
err = cudnnFindConvolutionBackwardFilterAlgorithm( tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
return -1;
}
err = cudnnFindConvolutionBackwardFilterAlgorithmEx(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc, _handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(kerns), 1, &count, &choice); APPLY_SPECIFIC(kerns), 1, &count, &choice, *(void **)tmpmem, free);
gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
...@@ -158,16 +179,6 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -158,16 +179,6 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
algo = choice.algo; algo = choice.algo;
#else #else
size_t free;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_FREE_GMEM, &free);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU");
cuda_exit(c->ctx);
return 1;
}
err = cudnnGetConvolutionBackwardFilterAlgorithm( err = cudnnGetConvolutionBackwardFilterAlgorithm(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), _handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(kerns), desc, APPLY_SPECIFIC(kerns),
......
...@@ -71,11 +71,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img, ...@@ -71,11 +71,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i)); s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i));
} }
#if CUDNN_VERSION >= 5000
err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, CUDNN_PROPAGATE_NAN, ndims, w, p, s); err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, CUDNN_PROPAGATE_NAN, ndims, w, p, s);
#else
err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, ndims, w, p, s);
#endif
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err)); PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err));
......
...@@ -111,11 +111,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp, ...@@ -111,11 +111,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i)); s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i));
} }
#if CUDNN_VERSION >= 5000
err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, CUDNN_PROPAGATE_NAN, ndims, w, p, s); err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, CUDNN_PROPAGATE_NAN, ndims, w, p, s);
#else
err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, ndims, w, p, s);
#endif
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err)); PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err));
......
...@@ -604,9 +604,6 @@ class TestDnnInferShapes(utt.InferShapeTester): ...@@ -604,9 +604,6 @@ class TestDnnInferShapes(utt.InferShapeTester):
[conv_modes[0]])), [conv_modes[0]])),
testcase_func_name=utt.custom_name_func) testcase_func_name=utt.custom_name_func)
def test_conv(self, algo, border_mode, conv_mode): def test_conv(self, algo, border_mode, conv_mode):
if algo == 'winograd' and dnn.version(raises=False) < 5000:
raise SkipTest(dnn.dnn_available.msg)
self._test_conv(T.tensor4('img'), self._test_conv(T.tensor4('img'),
T.tensor4('kerns'), T.tensor4('kerns'),
T.tensor4('out'), T.tensor4('out'),
...@@ -1361,8 +1358,6 @@ class test_SoftMax(test_nnet.test_SoftMax): ...@@ -1361,8 +1358,6 @@ class test_SoftMax(test_nnet.test_SoftMax):
def test_dnn_batchnorm_train(): def test_dnn_batchnorm_train():
if not dnn.dnn_available(test_ctx_name): if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg) raise SkipTest(dnn.dnn_available.msg)
if dnn.version(raises=False) < 5000:
raise SkipTest("batch normalization requires cudnn v5+")
utt.seed_rng() utt.seed_rng()
for mode in ('per-activation', 'spatial'): for mode in ('per-activation', 'spatial'):
...@@ -1416,8 +1411,6 @@ def test_dnn_batchnorm_train(): ...@@ -1416,8 +1411,6 @@ def test_dnn_batchnorm_train():
def test_batchnorm_inference(): def test_batchnorm_inference():
if not dnn.dnn_available(test_ctx_name): if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg) raise SkipTest(dnn.dnn_available.msg)
if dnn.version(raises=False) < 5000:
raise SkipTest("batch normalization requires cudnn v5+")
utt.seed_rng() utt.seed_rng()
for mode in ('per-activation', 'spatial'): for mode in ('per-activation', 'spatial'):
......
...@@ -68,7 +68,6 @@ def reg_context(name, ctx): ...@@ -68,7 +68,6 @@ def reg_context(name, ctx):
if not isinstance(ctx, gpuarray.GpuContext): if not isinstance(ctx, gpuarray.GpuContext):
raise TypeError("context is not GpuContext") raise TypeError("context is not GpuContext")
_context_reg[name] = ctx _context_reg[name] = ctx
_props_map[ctx] = dict()
def get_context(name): def get_context(name):
...@@ -97,26 +96,6 @@ def list_contexts(): ...@@ -97,26 +96,6 @@ def list_contexts():
""" """
return _context_reg.keys() return _context_reg.keys()
# Mappings of properties to contexts. Please never use this if you
# can avoid it.
# This is basically a way to store "global" variables that depend on
# the context.
_props_map = {}
def _get_props(name):
ctx = get_context(name)
return _props_map[ctx]
def get_prop(name, k):
return _get_props(name)[k]
def set_prop(name, k, v):
_get_props(name)[k] = v
# Private method # Private method
def _name_for_ctx(ctx): def _name_for_ctx(ctx):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论