提交 c6ffa460 authored 作者: Pascal Lamblin's avatar Pascal Lamblin 提交者: GitHub

Merge pull request #5357 from nouiz/abergeron-dnn_mem

Select the dnn convolution algorithm using actually available memory
......@@ -8,7 +8,7 @@ set -x
# Anaconda python
export PATH=/usr/local/miniconda2/bin:$PATH
# CUDA
# CUDA
export PATH=/usr/local/cuda/bin:$PATH
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
export LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH
......@@ -38,13 +38,13 @@ echo "===== Testing gpuarray backend"
GPUARRAY_CONFIG="Release"
DEVICE=cuda0
LIBDIR=~/tmp/local
LIBDIR=${WORKSPACE}/local
# Make fresh clones of libgpuarray (with no history since we don't need it)
rm -rf libgpuarray
git clone --depth 1 "https://github.com/Theano/libgpuarray.git"
# Clean up previous installs (to make sure no old files are left)
# Clean up previous installs (to make sure no old files are left)
rm -rf $LIBDIR
mkdir $LIBDIR
......@@ -52,25 +52,25 @@ mkdir $LIBDIR
mkdir libgpuarray/build
(cd libgpuarray/build && cmake .. -DCMAKE_BUILD_TYPE=${GPUARRAY_CONFIG} -DCMAKE_INSTALL_PREFIX=$LIBDIR && make)
# Finally install
# Finally install
(cd libgpuarray/build && make install)
# Export paths
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$LIBDIR/lib64/
export LIBRARY_PATH=$LIBRARY_PATH:$LIBDIR/lib64/
export CPATH=$CPATH:$LIBDIR/include
export LIBRARY_PATH=$LIBRARY_PATH:$LIBDIR/lib
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$LIBDIR/lib
# Build the pygpu modules
# Build the pygpu modules
(cd libgpuarray && python setup.py build_ext --inplace -I$LIBDIR/include -L$LIBDIR/lib)
ls $LIBDIR
mkdir $LIBDIR/lib/python
export PYTHONPATH=${PYTHONPATH}:$LIBDIR/lib/python
# Then install
# Then install
(cd libgpuarray && python setup.py install --home=$LIBDIR)
# Testing theano (the gpuarray parts)
python -c 'import pygpu; print(pygpu.__file__)'
# Testing theano (the gpuarray parts)
THEANO_GPUARRAY_TESTS="theano/gpuarray/tests \
theano/sandbox/tests/test_rng_mrg.py:test_consistency_GPUA_serial \
theano/sandbox/tests/test_rng_mrg.py:test_consistency_GPUA_parallel \
......
......@@ -27,7 +27,7 @@ except ImportError:
# This is for documentation not to depend on the availability of pygpu
from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
GpuArraySharedVariable, gpuarray_shared_constructor,
reg_context, get_context, ContextNotDefined, _get_props)
reg_context, get_context, ContextNotDefined)
from .basic_ops import as_gpuarray_variable
from . import fft, dnn, opt, nerv, extra_ops, multinomial, reduction
......@@ -46,63 +46,66 @@ def init_dev(dev, name=None):
if not config.cxx:
raise RuntimeError("The new gpu-backend need a c++ compiler.")
if (pygpu.version.major, pygpu.version.minor) < (0, 6):
raise ValueError("Your installed version of pygpu is too old, please upgrade to 0.6 or later")
raise ValueError(
"Your installed version of pygpu is too old, please upgrade to 0.6 or later")
# This is for the C headers API
if pygpu.gpuarray.api_version()[0] < 0:
raise ValueError(
"Your installed libgpuarray is too old, please update")
if dev not in init_dev.devmap:
ctx = pygpu.init(dev,
disable_alloc_cache=config.gpuarray.preallocate < 0,
single_stream=config.gpuarray.single_stream,
sched=config.gpuarray.sched)
init_dev.devmap[dev] = ctx
context = pygpu.init(
dev,
disable_alloc_cache=config.gpuarray.preallocate < 0,
single_stream=config.gpuarray.single_stream,
sched=config.gpuarray.sched)
context.dev = dev
init_dev.devmap[dev] = context
reg_context(name, context)
if dev.startswith('cuda'):
avail = dnn.dnn_available(name)
if avail:
context.cudnn_handle = dnn._make_handle(context)
if config.print_active_device:
if avail:
print("Using cuDNN version %d on context %s" % (dnn.version(), name),
file=sys.stderr)
else:
print("Can not use cuDNN on context %s: %s" % (name, dnn.dnn_available.msg),
file=sys.stderr)
if config.gpuarray.preallocate < 0:
print("Disabling allocation cache on %s" % (dev,))
elif config.gpuarray.preallocate > 0:
MB = (1024 * 1024)
if config.gpuarray.preallocate <= 1:
gmem = min(config.gpuarray.preallocate, 0.95) * ctx.total_gmem
gmem = min(config.gpuarray.preallocate, 0.95) * context.total_gmem
else:
gmem = config.gpuarray.preallocate * MB
if gmem > context.free_gmem - 50 * MB:
print(
"WARNING: Preallocating too much memory can prevent cudnn and cublas from working properly")
# This will allocate and immediatly free an object of size gmem
# which will reserve that amount of memory on the GPU.
pygpu.empty((gmem,), dtype='int8', context=ctx)
pygpu.empty((gmem,), dtype='int8', context=context)
if config.print_active_device:
print("Preallocating %d/%d Mb (%f) on %s" %
(gmem//MB, ctx.total_gmem//MB, gmem/ctx.total_gmem, dev),
(gmem//MB, context.total_gmem//MB,
gmem/context.total_gmem, dev),
file=sys.stderr)
context = init_dev.devmap[dev]
else:
context = init_dev.devmap[dev]
# This will map the context name to the real context object.
reg_context(name, context)
if config.print_active_device:
try:
pcibusid = context.pcibusid
pcibusid = '(' + context.pcibusid + ')'
except pygpu.gpuarray.UnsupportedException:
pcibusid = '(unsupported for device %s)' % dev
except Exception:
warnings.warn('Unable to get PCI Bus ID. Please consider updating libgpuarray and pygpu.')
pcibusid = 'unknown'
pcibusid = ''
print("Mapped name %s to device %s: %s" %
(name, dev, context.devname),
print("Mapped name %s to device %s: %s %s" %
(name, dev, context.devname, pcibusid),
file=sys.stderr)
print("PCI Bus ID:", pcibusid, file=sys.stderr)
pygpu_activated = True
ctx_props = _get_props(name)
ctx_props['dev'] = dev
if dev.startswith('cuda'):
if 'cudnn_version' not in ctx_props:
try:
ctx_props['cudnn_version'] = dnn.version()
# 5200 should not print warning with cudnn 5.1 final.
if ctx_props['cudnn_version'] >= 5200:
warnings.warn("Your cuDNN version is more recent than "
"Theano. If you encounter problems, try "
"updating Theano or downgrading cuDNN to "
"version 5.1.")
if config.print_active_device:
print("Using cuDNN version %d on context %s" %
(ctx_props['cudnn_version'], name), file=sys.stderr)
ctx_props['cudnn_handle'] = dnn._make_handle(context)
except Exception:
pass
# This maps things like 'cuda0' to the context object on that device.
init_dev.devmap = {}
......@@ -119,7 +122,8 @@ if pygpu:
elif (config.init_gpu_device.startswith('cuda') or
config.init_gpu_device.startswith('opencl')):
if config.device != 'cpu':
raise ValueError('you must set device=cpu to use init_gpu_device.')
raise ValueError(
'you must set device=cpu to use init_gpu_device.')
if config.contexts != '':
print("Using contexts will make init_gpu_device act like device and move all computations by default, which might not be what you want.")
init_dev(config.init_gpu_device)
......@@ -147,4 +151,5 @@ else:
config.device.startswith('opencl') or
config.device.startswith('cuda') or
config.contexts != ''):
error("pygpu was configured but could not be imported or is too old (version 0.6 or higher required)", exc_info=True)
error("pygpu was configured but could not be imported or is too old (version 0.6 or higher required)",
exc_info=True)
......@@ -30,7 +30,7 @@ from theano.tensor.signal.pool import (
Pool, MaxPoolGrad, AveragePoolGrad)
from . import pygpu
from .type import (get_context, gpu_context_type, list_contexts,
get_prop, set_prop, GpuArraySharedVariable)
GpuArraySharedVariable)
from .basic_ops import (as_gpuarray_variable, infer_context_name,
gpu_contiguous, gpu_alloc_empty,
empty_like, GpuArrayType)
......@@ -59,12 +59,12 @@ def _dnn_lib():
lib_name = ctypes.util.find_library('cudnn')
if lib_name is None and sys.platform == 'win32':
# Update these names when new versions of cudnn are supported.
for name in ['cudnn64_5.dll', 'cudnn64_4.dll']:
for name in ['cudnn64_5.dll']:
lib_name = ctypes.util.find_library(name)
if lib_name:
break
if lib_name is None:
raise RuntimeError('Could not find cudnn library (looked for v4 and v5[.1])')
raise RuntimeError('Could not find cudnn library (looked for v5[.1])')
_dnn_lib.handle = ctypes.cdll.LoadLibrary(lib_name)
cudnn = _dnn_lib.handle
cudnn.cudnnCreate.argtypes = [ctypes.POINTER(ctypes.c_void_p)]
......@@ -109,10 +109,16 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
"""
params = ["-l", "cudnn", "-I" + os.path.dirname(__file__)]
path_wrapper = "\"" if os.name == 'nt' else ""
params = ["-l", "cudnn"]
params.extend(['-I%s%s%s' % (path_wrapper, os.path.dirname(__file__), path_wrapper)])
if config.dnn.include_path:
params.append("-I" + config.dnn.include_path)
params.extend(['-I%s%s%s' % (path_wrapper, config.dnn.include_path, path_wrapper)])
if config.dnn.library_path:
params.append("-L" + config.dnn.library_path)
params.extend(['-L%s%s%s' % (path_wrapper, config.dnn.library_path, path_wrapper)])
if config.nvcc.compiler_bindir:
params.extend(['--compiler-bindir',
'%s%s%s' % (path_wrapper, config.nvcc.compiler_bindir, path_wrapper)])
# Do not run here the test program. It would run on the
# default gpu, not the one selected by the user. If mixed
# GPU are installed or if the GPUs are configured in
......@@ -129,9 +135,14 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
def _dnn_check_version():
v = version()
if v < 4007:
if v < 5000:
return False, "cuDNN version is too old. Update to v5, was %d." % v
# 5200 should not print warning with cudnn 5.1 final.
if version >= 5200:
warnings.warn("Your cuDNN version is more recent than "
"Theano. If you encounter problems, try "
"updating Theano or downgrading cuDNN to "
"version 5.1.")
return True, None
......@@ -209,14 +220,13 @@ class DnnBase(COp):
return node.outputs[0].type.context_name
def get_params(self, node):
try:
return get_prop(self.dnn_context(node), 'cudnn_handle_param')
except KeyError:
pass
ptr = get_prop(self.dnn_context(node), 'cudnn_handle').value
res = handle_type.make_value(ptr)
set_prop(self.dnn_context(node), 'cudnn_handle_param', res)
return res
ctx_name = self.dnn_context(node)
ctx = get_context(ctx_name)
if not hasattr(ctx, 'cudnn_handle_param'):
ptr = ctx.cudnn_handle.value
res = handle_type.make_value(ptr)
ctx.cudnn_handle_param = res
return ctx.cudnn_handle_param
def __init__(self, files=None, c_func=None):
if files is None:
......@@ -301,7 +311,7 @@ def version(raises=True):
"""
if not dnn_present():
if raises:
raise Exception(
raise RuntimeError(
"We can't determine the cudnn version as it is not available",
dnn_available.msg)
else:
......@@ -500,10 +510,6 @@ class GpuDnnConv(DnnBase):
if self.inplace:
self.destroy_map = {0: [2]}
if version() < 5000 and self.algo == 'winograd':
raise RuntimeError("cuDNN winograd convolution requires "
"cuDNN v5 or more recent")
assert self.algo in ['none', 'small', 'large', 'fft', 'fft_tiling',
'winograd', 'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change']
......@@ -524,9 +530,9 @@ class GpuDnnConv(DnnBase):
defs.append(('CONV_INPLACE', '1'))
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM'
if self.algo == 'none': # 3d (at least in v4)
if self.algo == 'none': # 3d
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM'
elif self.algo == 'small': # 3d (at least in v4)
elif self.algo == 'small': # 3d
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM'
elif self.algo == 'large':
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_GEMM'
......@@ -534,10 +540,9 @@ class GpuDnnConv(DnnBase):
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_DIRECT'
elif self.algo == 'fft':
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_FFT'
elif self.algo == 'fft_tiling': # 3d (not in v4, in v5)
elif self.algo == 'fft_tiling': # 3d
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING'
elif self.algo == 'winograd':
# need v5
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD'
defs.append(('CONV_ALGO', alg))
......@@ -571,10 +576,6 @@ class GpuDnnConv(DnnBase):
if img.type.ndim == 5 and self.algo in ['large', 'fft']:
raise ValueError("convolution algo %s can't be used for "
"3d convolutions", (self.algo,))
if (img.type.ndim == 5 and
self.algo in ['fft_tiling'] and
version() < 5000):
raise ValueError("3d convolution algo fft_tiling need cudnn v5")
if (not isinstance(desc.type, CDataType) or
desc.type.ctype != 'cudnnConvolutionDescriptor_t'):
......@@ -700,13 +701,13 @@ class GpuDnnConvGradW(DnnBase):
defs.append(('CONV_INPLACE', '1'))
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0'
if self.algo == 'none': # 3d in at least v4
if self.algo == 'none': # 3d
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0'
if self.algo == 'deterministic':
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1'
if self.algo == 'fft':
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT'
if self.algo == 'small': # 3d in at least v4
if self.algo == 'small': # 3d
# non-deterministic, small workspace
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3'
if self.algo in ['guess_once', 'guess_on_shape_change',
......@@ -793,10 +794,6 @@ class GpuDnnConvGradI(DnnBase):
algo = config.dnn.conv.algo_bwd_data
self.algo = algo
if version() < 5000 and self.algo == 'winograd':
raise RuntimeError("cuDNN's winograd convolution requires cuDNN "
"v5 or more recent")
assert self.algo in ['none', 'deterministic', 'fft', 'fft_tiling',
'winograd', 'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change']
......@@ -832,17 +829,16 @@ class GpuDnnConvGradI(DnnBase):
defs.append(('CONV_INPLACE', '1'))
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_0'
if self.algo == 'none': # 3d at least v4
if self.algo == 'none': # 3d
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_0'
elif self.algo == 'deterministic': # 3d at least v4
elif self.algo == 'deterministic': # 3d
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_1'
elif self.algo == 'fft':
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT'
elif self.algo == 'fft_tiling': # 3d not v4, since v5
elif self.algo == 'fft_tiling': # 3d
# big workspace but less than fft
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING'
elif self.algo == 'winograd':
# need v5
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD'
if self.algo in ['guess_once', 'guess_on_shape_change',
......@@ -877,10 +873,6 @@ class GpuDnnConvGradI(DnnBase):
if kern.type.ndim == 5 and self.algo in ['fft']:
raise ValueError("convolution algo %s can't be used for "
"3d convolutions", (self.algo,))
if (kern.type.ndim == 5 and
self.algo == 'fft_tiling' and
version() < 5000):
raise ValueError("3d convolution algo fft_tiling need cudnn v5")
if (not isinstance(desc.type, CDataType) or
desc.type.ctype != 'cudnnConvolutionDescriptor_t'):
......@@ -1316,11 +1308,7 @@ class GpuDnnPoolDesc(Op):
static const int pad[%(nd)d] = {%(pad)s};
static const int str[%(nd)d] = {%(str)s};
#if CUDNN_VERSION >= 5000
err = cudnnSetPoolingNdDescriptor(%(desc)s, %(mode_flag)s, CUDNN_PROPAGATE_NAN, %(nd)d, win, pad, str);
#else
err = cudnnSetPoolingNdDescriptor(%(desc)s, %(mode_flag)s, %(nd)d, win, pad, str);
#endif
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: %%s",
......@@ -1664,8 +1652,6 @@ class GpuDnnBatchNorm(DnnBase):
DnnBase.__init__(self, ['dnn_batchnorm_base.c', 'dnn_batchnorm.c'],
'dnn_batchnorm_op')
if version() < 5000:
raise RuntimeError("cuDNN Batch Normalization requires cuDNN v5 or later")
assert (mode in ('per-activation', 'spatial'))
self.mode = mode
......@@ -1724,8 +1710,6 @@ class GpuDnnBatchNormInference(DnnBase):
DnnBase.__init__(self, ['dnn_batchnorm_base.c', 'dnn_batchnorm_inf.c'],
'dnn_batchnorm_op')
if version() < 5000:
raise RuntimeError("cuDNN Batch Normalization requires cuDNN v5 or later")
assert (mode in ('per-activation', 'spatial'))
self.mode = mode
......@@ -1788,8 +1772,6 @@ class GpuDnnBatchNormGrad(DnnBase):
DnnBase.__init__(self, ['dnn_batchnorm_base.c', 'dnn_batchnorm_grad.c'],
'dnn_batchnorm_grad')
if version() < 5000:
raise RuntimeError("cuDNN Batch Normalization requires cuDNN v5 or later")
assert (mode in ('per-activation', 'spatial'))
self.mode = mode
......
......@@ -115,11 +115,7 @@ c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) {
if (nd < 3)
nd = 3;
#if CUDNN_VERSION >= 5000
err = cudnnSetFilterNdDescriptor(desc, dt, CUDNN_TENSOR_NCHW, nd, dims);
#else
err = cudnnSetFilterNdDescriptor(desc, dt, nd, dims);
#endif
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
......
......@@ -98,12 +98,37 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
#endif
if (!reuse_algo) {
size_t free;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU");
cuda_exit(c->ctx);
return 1;
}
// Guess 4Mb if the info is not available
if (free == 0) free = 4 * 1024 * 1024;
#ifdef CHOOSE_TIME
int count;
cudnnConvolutionFwdAlgoPerf_t choice;
err = cudnnFindConvolutionForwardAlgorithm(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
desc, APPLY_SPECIFIC(output), 1, &count, &choice);
gpudata *tmpmem;
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
return -1;
}
// We don't sync the buffer as we don't care about the values.
err = cudnnFindConvolutionForwardAlgorithmEx(
_handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output),
1, &count, &choice, *(void **)tmpmem,
free);
gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
......@@ -114,16 +139,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
}
algo = choice.algo;
#else
size_t free;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_FREE_GMEM, &free);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU");
cuda_exit(c->ctx);
return 1;
}
err = cudnnGetConvolutionForwardAlgorithm(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
desc, APPLY_SPECIFIC(output),
......
......@@ -140,13 +140,34 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
#endif
if (!reuse_algo) {
size_t free;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU");
cuda_exit(c->ctx);
return 1;
}
// Guess 4Mb if the info is not available
if (free == 0) free = 4 * 1024 * 1024;
#ifdef CHOOSE_TIME
int count;
cudnnConvolutionBwdDataAlgoPerf_t choice;
gpudata *tmpmem;
err = cudnnFindConvolutionBackwardDataAlgorithm(
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
return -1;
}
err = cudnnFindConvolutionBackwardDataAlgorithmEx(
_handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(input), 1, &count, &choice);
APPLY_SPECIFIC(input), 1, &count, &choice, *(void **)tmpmem, free);
gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
......@@ -157,16 +178,6 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
algo = choice.algo;
#else
size_t free;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_FREE_GMEM, &free);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU");
cuda_exit(c->ctx);
return 1;
}
err = cudnnGetConvolutionBackwardDataAlgorithm(
_handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(input),
......
......@@ -140,13 +140,34 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
#endif
if (!reuse_algo) {
size_t free;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU");
cuda_exit(c->ctx);
return 1;
}
// Guess 4Mb if the info is not available
if (free == 0) free = 4 * 1024 * 1024;
#ifdef CHOOSE_TIME
int count;
cudnnConvolutionBwdFilterAlgoPerf_t choice;
gpudata *tmpmem;
err = cudnnFindConvolutionBackwardFilterAlgorithm(
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
return -1;
}
err = cudnnFindConvolutionBackwardFilterAlgorithmEx(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(kerns), 1, &count, &choice);
APPLY_SPECIFIC(kerns), 1, &count, &choice, *(void **)tmpmem, free);
gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
......@@ -158,16 +179,6 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
algo = choice.algo;
#else
size_t free;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_FREE_GMEM, &free);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU");
cuda_exit(c->ctx);
return 1;
}
err = cudnnGetConvolutionBackwardFilterAlgorithm(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(kerns),
......
......@@ -71,11 +71,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i));
}
#if CUDNN_VERSION >= 5000
err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, CUDNN_PROPAGATE_NAN, ndims, w, p, s);
#else
err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, ndims, w, p, s);
#endif
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err));
......
......@@ -111,11 +111,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i));
}
#if CUDNN_VERSION >= 5000
err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, CUDNN_PROPAGATE_NAN, ndims, w, p, s);
#else
err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, ndims, w, p, s);
#endif
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err));
......
......@@ -604,9 +604,6 @@ class TestDnnInferShapes(utt.InferShapeTester):
[conv_modes[0]])),
testcase_func_name=utt.custom_name_func)
def test_conv(self, algo, border_mode, conv_mode):
if algo == 'winograd' and dnn.version(raises=False) < 5000:
raise SkipTest(dnn.dnn_available.msg)
self._test_conv(T.tensor4('img'),
T.tensor4('kerns'),
T.tensor4('out'),
......@@ -1361,8 +1358,6 @@ class test_SoftMax(test_nnet.test_SoftMax):
def test_dnn_batchnorm_train():
if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg)
if dnn.version(raises=False) < 5000:
raise SkipTest("batch normalization requires cudnn v5+")
utt.seed_rng()
for mode in ('per-activation', 'spatial'):
......@@ -1416,8 +1411,6 @@ def test_dnn_batchnorm_train():
def test_batchnorm_inference():
if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg)
if dnn.version(raises=False) < 5000:
raise SkipTest("batch normalization requires cudnn v5+")
utt.seed_rng()
for mode in ('per-activation', 'spatial'):
......
......@@ -68,7 +68,6 @@ def reg_context(name, ctx):
if not isinstance(ctx, gpuarray.GpuContext):
raise TypeError("context is not GpuContext")
_context_reg[name] = ctx
_props_map[ctx] = dict()
def get_context(name):
......@@ -97,26 +96,6 @@ def list_contexts():
"""
return _context_reg.keys()
# Mappings of properties to contexts. Please never use this if you
# can avoid it.
# This is basically a way to store "global" variables that depend on
# the context.
_props_map = {}
def _get_props(name):
ctx = get_context(name)
return _props_map[ctx]
def get_prop(name, k):
return _get_props(name)[k]
def set_prop(name, k, v):
_get_props(name)[k] = v
# Private method
def _name_for_ctx(ctx):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论