提交 c6ffa460 authored 作者: Pascal Lamblin's avatar Pascal Lamblin 提交者: GitHub

Merge pull request #5357 from nouiz/abergeron-dnn_mem

Select the dnn convolution algorithm using actually available memory
......@@ -38,7 +38,7 @@ echo "===== Testing gpuarray backend"
GPUARRAY_CONFIG="Release"
DEVICE=cuda0
LIBDIR=~/tmp/local
LIBDIR=${WORKSPACE}/local
# Make fresh clones of libgpuarray (with no history since we don't need it)
rm -rf libgpuarray
......@@ -56,8 +56,6 @@ mkdir libgpuarray/build
(cd libgpuarray/build && make install)
# Export paths
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$LIBDIR/lib64/
export LIBRARY_PATH=$LIBRARY_PATH:$LIBDIR/lib64/
export CPATH=$CPATH:$LIBDIR/include
export LIBRARY_PATH=$LIBRARY_PATH:$LIBDIR/lib
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$LIBDIR/lib
......@@ -70,6 +68,8 @@ export PYTHONPATH=${PYTHONPATH}:$LIBDIR/lib/python
# Then install
(cd libgpuarray && python setup.py install --home=$LIBDIR)
python -c 'import pygpu; print(pygpu.__file__)'
# Testing theano (the gpuarray parts)
THEANO_GPUARRAY_TESTS="theano/gpuarray/tests \
theano/sandbox/tests/test_rng_mrg.py:test_consistency_GPUA_serial \
......
......@@ -27,7 +27,7 @@ except ImportError:
# This is for documentation not to depend on the availability of pygpu
from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
GpuArraySharedVariable, gpuarray_shared_constructor,
reg_context, get_context, ContextNotDefined, _get_props)
reg_context, get_context, ContextNotDefined)
from .basic_ops import as_gpuarray_variable
from . import fft, dnn, opt, nerv, extra_ops, multinomial, reduction
......@@ -46,63 +46,66 @@ def init_dev(dev, name=None):
if not config.cxx:
raise RuntimeError("The new gpu-backend need a c++ compiler.")
if (pygpu.version.major, pygpu.version.minor) < (0, 6):
raise ValueError("Your installed version of pygpu is too old, please upgrade to 0.6 or later")
raise ValueError(
"Your installed version of pygpu is too old, please upgrade to 0.6 or later")
# This is for the C headers API
if pygpu.gpuarray.api_version()[0] < 0:
raise ValueError(
"Your installed libgpuarray is too old, please update")
if dev not in init_dev.devmap:
ctx = pygpu.init(dev,
context = pygpu.init(
dev,
disable_alloc_cache=config.gpuarray.preallocate < 0,
single_stream=config.gpuarray.single_stream,
sched=config.gpuarray.sched)
init_dev.devmap[dev] = ctx
context.dev = dev
init_dev.devmap[dev] = context
reg_context(name, context)
if dev.startswith('cuda'):
avail = dnn.dnn_available(name)
if avail:
context.cudnn_handle = dnn._make_handle(context)
if config.print_active_device:
if avail:
print("Using cuDNN version %d on context %s" % (dnn.version(), name),
file=sys.stderr)
else:
print("Can not use cuDNN on context %s: %s" % (name, dnn.dnn_available.msg),
file=sys.stderr)
if config.gpuarray.preallocate < 0:
print("Disabling allocation cache on %s" % (dev,))
elif config.gpuarray.preallocate > 0:
MB = (1024 * 1024)
if config.gpuarray.preallocate <= 1:
gmem = min(config.gpuarray.preallocate, 0.95) * ctx.total_gmem
gmem = min(config.gpuarray.preallocate, 0.95) * context.total_gmem
else:
gmem = config.gpuarray.preallocate * MB
if gmem > context.free_gmem - 50 * MB:
print(
"WARNING: Preallocating too much memory can prevent cudnn and cublas from working properly")
# This will allocate and immediatly free an object of size gmem
# which will reserve that amount of memory on the GPU.
pygpu.empty((gmem,), dtype='int8', context=ctx)
pygpu.empty((gmem,), dtype='int8', context=context)
if config.print_active_device:
print("Preallocating %d/%d Mb (%f) on %s" %
(gmem//MB, ctx.total_gmem//MB, gmem/ctx.total_gmem, dev),
(gmem//MB, context.total_gmem//MB,
gmem/context.total_gmem, dev),
file=sys.stderr)
else:
context = init_dev.devmap[dev]
# This will map the context name to the real context object.
reg_context(name, context)
if config.print_active_device:
try:
pcibusid = context.pcibusid
pcibusid = '(' + context.pcibusid + ')'
except pygpu.gpuarray.UnsupportedException:
pcibusid = '(unsupported for device %s)' % dev
except Exception:
warnings.warn('Unable to get PCI Bus ID. Please consider updating libgpuarray and pygpu.')
pcibusid = 'unknown'
pcibusid = ''
print("Mapped name %s to device %s: %s" %
(name, dev, context.devname),
print("Mapped name %s to device %s: %s %s" %
(name, dev, context.devname, pcibusid),
file=sys.stderr)
print("PCI Bus ID:", pcibusid, file=sys.stderr)
pygpu_activated = True
ctx_props = _get_props(name)
ctx_props['dev'] = dev
if dev.startswith('cuda'):
if 'cudnn_version' not in ctx_props:
try:
ctx_props['cudnn_version'] = dnn.version()
# 5200 should not print warning with cudnn 5.1 final.
if ctx_props['cudnn_version'] >= 5200:
warnings.warn("Your cuDNN version is more recent than "
"Theano. If you encounter problems, try "
"updating Theano or downgrading cuDNN to "
"version 5.1.")
if config.print_active_device:
print("Using cuDNN version %d on context %s" %
(ctx_props['cudnn_version'], name), file=sys.stderr)
ctx_props['cudnn_handle'] = dnn._make_handle(context)
except Exception:
pass
# This maps things like 'cuda0' to the context object on that device.
init_dev.devmap = {}
......@@ -119,7 +122,8 @@ if pygpu:
elif (config.init_gpu_device.startswith('cuda') or
config.init_gpu_device.startswith('opencl')):
if config.device != 'cpu':
raise ValueError('you must set device=cpu to use init_gpu_device.')
raise ValueError(
'you must set device=cpu to use init_gpu_device.')
if config.contexts != '':
print("Using contexts will make init_gpu_device act like device and move all computations by default, which might not be what you want.")
init_dev(config.init_gpu_device)
......@@ -147,4 +151,5 @@ else:
config.device.startswith('opencl') or
config.device.startswith('cuda') or
config.contexts != ''):
error("pygpu was configured but could not be imported or is too old (version 0.6 or higher required)", exc_info=True)
error("pygpu was configured but could not be imported or is too old (version 0.6 or higher required)",
exc_info=True)
差异被折叠。
......@@ -115,11 +115,7 @@ c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) {
if (nd < 3)
nd = 3;
#if CUDNN_VERSION >= 5000
err = cudnnSetFilterNdDescriptor(desc, dt, CUDNN_TENSOR_NCHW, nd, dims);
#else
err = cudnnSetFilterNdDescriptor(desc, dt, nd, dims);
#endif
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
......
......@@ -98,12 +98,37 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
#endif
if (!reuse_algo) {
size_t free;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU");
cuda_exit(c->ctx);
return 1;
}
// Guess 4Mb if the info is not available
if (free == 0) free = 4 * 1024 * 1024;
#ifdef CHOOSE_TIME
int count;
cudnnConvolutionFwdAlgoPerf_t choice;
err = cudnnFindConvolutionForwardAlgorithm(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
desc, APPLY_SPECIFIC(output), 1, &count, &choice);
gpudata *tmpmem;
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
return -1;
}
// We don't sync the buffer as we don't care about the values.
err = cudnnFindConvolutionForwardAlgorithmEx(
_handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output),
1, &count, &choice, *(void **)tmpmem,
free);
gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
......@@ -114,16 +139,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
}
algo = choice.algo;
#else
size_t free;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_FREE_GMEM, &free);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU");
cuda_exit(c->ctx);
return 1;
}
err = cudnnGetConvolutionForwardAlgorithm(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
desc, APPLY_SPECIFIC(output),
......
......@@ -140,13 +140,34 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
#endif
if (!reuse_algo) {
size_t free;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU");
cuda_exit(c->ctx);
return 1;
}
// Guess 4Mb if the info is not available
if (free == 0) free = 4 * 1024 * 1024;
#ifdef CHOOSE_TIME
int count;
cudnnConvolutionBwdDataAlgoPerf_t choice;
gpudata *tmpmem;
err = cudnnFindConvolutionBackwardDataAlgorithm(
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
return -1;
}
err = cudnnFindConvolutionBackwardDataAlgorithmEx(
_handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(input), 1, &count, &choice);
APPLY_SPECIFIC(input), 1, &count, &choice, *(void **)tmpmem, free);
gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
......@@ -157,16 +178,6 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
algo = choice.algo;
#else
size_t free;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_FREE_GMEM, &free);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU");
cuda_exit(c->ctx);
return 1;
}
err = cudnnGetConvolutionBackwardDataAlgorithm(
_handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(input),
......
......@@ -140,13 +140,34 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
#endif
if (!reuse_algo) {
size_t free;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU");
cuda_exit(c->ctx);
return 1;
}
// Guess 4Mb if the info is not available
if (free == 0) free = 4 * 1024 * 1024;
#ifdef CHOOSE_TIME
int count;
cudnnConvolutionBwdFilterAlgoPerf_t choice;
gpudata *tmpmem;
err = cudnnFindConvolutionBackwardFilterAlgorithm(
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
return -1;
}
err = cudnnFindConvolutionBackwardFilterAlgorithmEx(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(kerns), 1, &count, &choice);
APPLY_SPECIFIC(kerns), 1, &count, &choice, *(void **)tmpmem, free);
gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
......@@ -158,16 +179,6 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
algo = choice.algo;
#else
size_t free;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_FREE_GMEM, &free);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU");
cuda_exit(c->ctx);
return 1;
}
err = cudnnGetConvolutionBackwardFilterAlgorithm(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(kerns),
......
......@@ -71,11 +71,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i));
}
#if CUDNN_VERSION >= 5000
err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, CUDNN_PROPAGATE_NAN, ndims, w, p, s);
#else
err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, ndims, w, p, s);
#endif
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err));
......
......@@ -111,11 +111,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i));
}
#if CUDNN_VERSION >= 5000
err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, CUDNN_PROPAGATE_NAN, ndims, w, p, s);
#else
err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, ndims, w, p, s);
#endif
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err));
......
......@@ -604,9 +604,6 @@ class TestDnnInferShapes(utt.InferShapeTester):
[conv_modes[0]])),
testcase_func_name=utt.custom_name_func)
def test_conv(self, algo, border_mode, conv_mode):
if algo == 'winograd' and dnn.version(raises=False) < 5000:
raise SkipTest(dnn.dnn_available.msg)
self._test_conv(T.tensor4('img'),
T.tensor4('kerns'),
T.tensor4('out'),
......@@ -1361,8 +1358,6 @@ class test_SoftMax(test_nnet.test_SoftMax):
def test_dnn_batchnorm_train():
if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg)
if dnn.version(raises=False) < 5000:
raise SkipTest("batch normalization requires cudnn v5+")
utt.seed_rng()
for mode in ('per-activation', 'spatial'):
......@@ -1416,8 +1411,6 @@ def test_dnn_batchnorm_train():
def test_batchnorm_inference():
if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg)
if dnn.version(raises=False) < 5000:
raise SkipTest("batch normalization requires cudnn v5+")
utt.seed_rng()
for mode in ('per-activation', 'spatial'):
......
......@@ -68,7 +68,6 @@ def reg_context(name, ctx):
if not isinstance(ctx, gpuarray.GpuContext):
raise TypeError("context is not GpuContext")
_context_reg[name] = ctx
_props_map[ctx] = dict()
def get_context(name):
......@@ -97,26 +96,6 @@ def list_contexts():
"""
return _context_reg.keys()
# Mappings of properties to contexts. Please never use this if you
# can avoid it.
# This is basically a way to store "global" variables that depend on
# the context.
_props_map = {}
def _get_props(name):
ctx = get_context(name)
return _props_map[ctx]
def get_prop(name, k):
return _get_props(name)[k]
def set_prop(name, k, v):
_get_props(name)[k] = v
# Private method
def _name_for_ctx(ctx):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论