Merge pull request #5357 from nouiz/abergeron-dnn_mem

Select the cuDNN convolution algorithm using the GPU memory that is actually available

Merge pull request #5357 from nouiz/abergeron-dnn_mem
c6ffa460 · Pascal Lamblin · GitHub · 59f671e2 · ec1ddad3 · c6ffa460
--- a/.jenkins/jenkins_test2.sh
+++ b/.jenkins/jenkins_test2.sh
@@ -8,7 +8,7 @@ set -x
 # Anaconda python
 export PATH=/usr/local/miniconda2/bin:$PATH
-# CUDA                                                                          
+# CUDA
 export PATH=/usr/local/cuda/bin:$PATH
 export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
 export LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH
@@ -38,13 +38,13 @@ echo "===== Testing gpuarray backend"
 GPUARRAY_CONFIG="Release"
 DEVICE=cuda0
-LIBDIR=~/tmp/local
+LIBDIR=${WORKSPACE}/local
 # Make fresh clones of libgpuarray (with no history since we don't need it)
 rm -rf libgpuarray
 git clone --depth 1 "https://github.com/Theano/libgpuarray.git"
-# Clean up previous installs (to make sure no old files are left) 
+# Clean up previous installs (to make sure no old files are left)
 rm -rf $LIBDIR
 mkdir $LIBDIR
@@ -52,25 +52,25 @@ mkdir $LIBDIR
 mkdir libgpuarray/build
 (cd libgpuarray/build && cmake .. -DCMAKE_BUILD_TYPE=${GPUARRAY_CONFIG} -DCMAKE_INSTALL_PREFIX=$LIBDIR && make)
-# Finally install                                                               
+# Finally install
 (cd libgpuarray/build && make install)
 # Export paths
-export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$LIBDIR/lib64/
-export LIBRARY_PATH=$LIBRARY_PATH:$LIBDIR/lib64/
 export CPATH=$CPATH:$LIBDIR/include
 export LIBRARY_PATH=$LIBRARY_PATH:$LIBDIR/lib
 export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$LIBDIR/lib
-# Build the pygpu modules                                                       
+# Build the pygpu modules
 (cd libgpuarray && python setup.py build_ext --inplace -I$LIBDIR/include -L$LIBDIR/lib)
 ls $LIBDIR
 mkdir $LIBDIR/lib/python
 export PYTHONPATH=${PYTHONPATH}:$LIBDIR/lib/python
-# Then install                                                                  
+# Then install
 (cd libgpuarray && python setup.py install --home=$LIBDIR)
-# Testing theano (the gpuarray parts)                                           
+python -c 'import pygpu; print(pygpu.__file__)'
+# Testing theano (the gpuarray parts)
 THEANO_GPUARRAY_TESTS="theano/gpuarray/tests \
                       theano/sandbox/tests/test_rng_mrg.py:test_consistency_GPUA_serial \
                       theano/sandbox/tests/test_rng_mrg.py:test_consistency_GPUA_parallel \

--- a/theano/gpuarray/__init__.py
+++ b/theano/gpuarray/__init__.py
@@ -27,7 +27,7 @@ except ImportError:
 # This is for documentation not to depend on the availability of pygpu
 from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
                   GpuArraySharedVariable, gpuarray_shared_constructor,
-                   reg_context, get_context, ContextNotDefined, _get_props)
+                   reg_context, get_context, ContextNotDefined)
 from .basic_ops import as_gpuarray_variable
 from . import fft, dnn, opt, nerv, extra_ops, multinomial, reduction
@@ -46,63 +46,66 @@ def init_dev(dev, name=None):
    if not config.cxx:
        raise RuntimeError("The new gpu-backend need a c++ compiler.")
    if (pygpu.version.major, pygpu.version.minor) < (0, 6):
-        raise ValueError("Your installed version of pygpu is too old, please upgrade to 0.6 or later")
+        raise ValueError(
+            "Your installed version of pygpu is too old, please upgrade to 0.6 or later")
+    # This is for the C headers API
+    if pygpu.gpuarray.api_version()[0] < 0:
+        raise ValueError(
+            "Your installed libgpuarray is too old, please update")
    if dev not in init_dev.devmap:
-        ctx = pygpu.init(dev,
+        context = pygpu.init(
-                         disable_alloc_cache=config.gpuarray.preallocate < 0,
+            dev,
-                         single_stream=config.gpuarray.single_stream,
+            disable_alloc_cache=config.gpuarray.preallocate < 0,
-                         sched=config.gpuarray.sched)
+            single_stream=config.gpuarray.single_stream,
-        init_dev.devmap[dev] = ctx
+            sched=config.gpuarray.sched)
+        context.dev = dev
+        init_dev.devmap[dev] = context
+        reg_context(name, context)
+        if dev.startswith('cuda'):
+            avail = dnn.dnn_available(name)
+            if avail:
+                context.cudnn_handle = dnn._make_handle(context)
+            if config.print_active_device:
+                if avail:
+                    print("Using cuDNN version %d on context %s" % (dnn.version(), name),
+                          file=sys.stderr)
+                else:
+                    print("Can not use cuDNN on context %s: %s" % (name, dnn.dnn_available.msg),
+                          file=sys.stderr)
        if config.gpuarray.preallocate < 0:
            print("Disabling allocation cache on %s" % (dev,))
        elif config.gpuarray.preallocate > 0:
            MB = (1024 * 1024)
            if config.gpuarray.preallocate <= 1:
-                gmem = min(config.gpuarray.preallocate, 0.95) * ctx.total_gmem
+                gmem = min(config.gpuarray.preallocate, 0.95) * context.total_gmem
            else:
                gmem = config.gpuarray.preallocate * MB
+            if gmem > context.free_gmem - 50 * MB:
+                print(
+                    "WARNING: Preallocating too much memory can prevent cudnn and cublas from working properly")
            # This will allocate and immediatly free an object of size gmem
            # which will reserve that amount of memory on the GPU.
-            pygpu.empty((gmem,), dtype='int8', context=ctx)
+            pygpu.empty((gmem,), dtype='int8', context=context)
            if config.print_active_device:
                print("Preallocating %d/%d Mb (%f) on %s" %
-                      (gmem//MB, ctx.total_gmem//MB, gmem/ctx.total_gmem, dev),
+                      (gmem//MB, context.total_gmem//MB,
+                       gmem/context.total_gmem, dev),
                      file=sys.stderr)
-    context = init_dev.devmap[dev]
+    else:
+        context = init_dev.devmap[dev]
    # This will map the context name to the real context object.
-    reg_context(name, context)
    if config.print_active_device:
        try:
-            pcibusid = context.pcibusid
+            pcibusid = '(' + context.pcibusid + ')'
        except pygpu.gpuarray.UnsupportedException:
-            pcibusid = '(unsupported for device %s)' % dev
+            pcibusid = ''
-        except Exception:
-            warnings.warn('Unable to get PCI Bus ID. Please consider updating libgpuarray and pygpu.')
-            pcibusid = 'unknown'
-        print("Mapped name %s to device %s: %s" %
+        print("Mapped name %s to device %s: %s %s" %
-              (name, dev, context.devname),
+              (name, dev, context.devname, pcibusid),
              file=sys.stderr)
-        print("PCI Bus ID:", pcibusid, file=sys.stderr)
    pygpu_activated = True
-    ctx_props = _get_props(name)
-    ctx_props['dev'] = dev
-    if dev.startswith('cuda'):
-        if 'cudnn_version' not in ctx_props:
-            try:
-                ctx_props['cudnn_version'] = dnn.version()
-                # 5200 should not print warning with cudnn 5.1 final.
-                if ctx_props['cudnn_version'] >= 5200:
-                    warnings.warn("Your cuDNN version is more recent than "
-                                  "Theano. If you encounter problems, try "
-                                  "updating Theano or downgrading cuDNN to "
-                                  "version 5.1.")
-                if config.print_active_device:
-                    print("Using cuDNN version %d on context %s" %
-                          (ctx_props['cudnn_version'], name), file=sys.stderr)
-                ctx_props['cudnn_handle'] = dnn._make_handle(context)
-            except Exception:
-                pass
 # This maps things like 'cuda0' to the context object on that device.
 init_dev.devmap = {}
@@ -119,7 +122,8 @@ if pygpu:
        elif (config.init_gpu_device.startswith('cuda') or
              config.init_gpu_device.startswith('opencl')):
            if config.device != 'cpu':
-                raise ValueError('you must set device=cpu to use init_gpu_device.')
+                raise ValueError(
+                    'you must set device=cpu to use init_gpu_device.')
            if config.contexts != '':
                print("Using contexts will make init_gpu_device act like device and move all computations by default, which might not be what you want.")
            init_dev(config.init_gpu_device)
@@ -147,4 +151,5 @@ else:
            config.device.startswith('opencl') or
            config.device.startswith('cuda') or
            config.contexts != ''):
-        error("pygpu was configured but could not be imported or is too old (version 0.6 or higher required)", exc_info=True)
+        error("pygpu was configured but could not be imported or is too old (version 0.6 or higher required)",
+              exc_info=True)
--- a/theano/gpuarray/dnn.py
+++ b/theano/gpuarray/dnn.py
--- a/theano/gpuarray/dnn_base.c
+++ b/theano/gpuarray/dnn_base.c
@@ -115,11 +115,7 @@ c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) {
  if (nd < 3)
    nd = 3;
-#if CUDNN_VERSION >= 5000
    err = cudnnSetFilterNdDescriptor(desc, dt, CUDNN_TENSOR_NCHW, nd, dims);
-#else
-    err = cudnnSetFilterNdDescriptor(desc, dt, nd, dims);
-#endif
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,

--- a/theano/gpuarray/dnn_fwd.c
+++ b/theano/gpuarray/dnn_fwd.c
@@ -98,12 +98,37 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
 #endif
  if (!reuse_algo) {
+    size_t free;
+    int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
+    if (err2 != GA_NO_ERROR) {
+      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
+                   "memory information on the GPU");
+      cuda_exit(c->ctx);
+      return 1;
+    }
+    // Fall back to a 4 MiB guess when the reported size is 0 (info not available)
+    if (free == 0) free = 4 * 1024 * 1024;
 #ifdef CHOOSE_TIME
    int count;
    cudnnConvolutionFwdAlgoPerf_t choice;
-    err = cudnnFindConvolutionForwardAlgorithm(
+    gpudata *tmpmem;
-      _handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
-      desc, APPLY_SPECIFIC(output), 1, &count, &choice);
+    tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
+    if (tmpmem == NULL) {
+      PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
+      return -1;
+    }
+    // We don't sync the buffer as we don't care about the values.
+    err = cudnnFindConvolutionForwardAlgorithmEx(
+      _handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
+      APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
+      desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output),
+      1, &count, &choice, *(void **)tmpmem,
+      free);
+    gpudata_release(tmpmem);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
@@ -114,16 +139,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
    }
    algo = choice.algo;
 #else
-    size_t free;
-    int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_FREE_GMEM, &free);
-    if (err2 != GA_NO_ERROR) {
-      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
-                   "memory information on the GPU");
-      cuda_exit(c->ctx);
-      return 1;
-    }
    err = cudnnGetConvolutionForwardAlgorithm(
      _handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
      desc, APPLY_SPECIFIC(output),

--- a/theano/gpuarray/dnn_gi.c
+++ b/theano/gpuarray/dnn_gi.c
@@ -140,13 +140,34 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
 #endif
  if (!reuse_algo) {
+    size_t free;
+    int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
+    if (err2 != GA_NO_ERROR) {
+      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
+                   "memory information on the GPU");
+      cuda_exit(c->ctx);
+      return 1;
+    }
+    // Fall back to a 4 MiB guess when the reported size is 0 (info not available)
+    if (free == 0) free = 4 * 1024 * 1024;
 #ifdef CHOOSE_TIME
    int count;
    cudnnConvolutionBwdDataAlgoPerf_t choice;
+    gpudata *tmpmem;
-    err = cudnnFindConvolutionBackwardDataAlgorithm(
+    tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
+    if (tmpmem == NULL) {
+      PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
+      return -1;
+    }
+    err = cudnnFindConvolutionBackwardDataAlgorithmEx(
      _handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
-      APPLY_SPECIFIC(input), 1, &count, &choice);
+      APPLY_SPECIFIC(input), 1, &count, &choice, *(void **)tmpmem, free);
+    gpudata_release(tmpmem);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
@@ -157,16 +178,6 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
    algo = choice.algo;
 #else
-    size_t free;
-    int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_FREE_GMEM, &free);
-    if (err2 != GA_NO_ERROR) {
-      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
-                   "memory information on the GPU");
-      cuda_exit(c->ctx);
-      return 1;
-    }
    err = cudnnGetConvolutionBackwardDataAlgorithm(
      _handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
      desc, APPLY_SPECIFIC(input),

--- a/theano/gpuarray/dnn_gw.c
+++ b/theano/gpuarray/dnn_gw.c
@@ -140,13 +140,34 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
 #endif
  if (!reuse_algo) {
+    size_t free;
+    int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
+    if (err2 != GA_NO_ERROR) {
+      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
+                   "memory information on the GPU");
+      cuda_exit(c->ctx);
+      return 1;
+    }
+    // Fall back to a 4 MiB guess when the reported size is 0 (info not available)
+    if (free == 0) free = 4 * 1024 * 1024;
 #ifdef CHOOSE_TIME
    int count;
    cudnnConvolutionBwdFilterAlgoPerf_t choice;
+    gpudata *tmpmem;
-    err = cudnnFindConvolutionBackwardFilterAlgorithm(
+    tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
+    if (tmpmem == NULL) {
+      PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
+      return -1;
+    }
+    err = cudnnFindConvolutionBackwardFilterAlgorithmEx(
      _handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
-      APPLY_SPECIFIC(kerns), 1, &count, &choice);
+      APPLY_SPECIFIC(kerns), 1, &count, &choice, *(void **)tmpmem, free);
+    gpudata_release(tmpmem);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
@@ -158,16 +179,6 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
    algo = choice.algo;
 #else
-    size_t free;
-    int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_FREE_GMEM, &free);
-    if (err2 != GA_NO_ERROR) {
-      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
-                   "memory information on the GPU");
-      cuda_exit(c->ctx);
-      return 1;
-    }
    err = cudnnGetConvolutionBackwardFilterAlgorithm(
      _handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
      desc, APPLY_SPECIFIC(kerns),

--- a/theano/gpuarray/dnn_pool.c
+++ b/theano/gpuarray/dnn_pool.c
@@ -71,11 +71,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
     s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i));
  }
-#if CUDNN_VERSION >= 5000
  err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, CUDNN_PROPAGATE_NAN, ndims, w, p, s);
-#else
-  err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, ndims, w, p, s);
-#endif
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err));

--- a/theano/gpuarray/dnn_pool_grad.c
+++ b/theano/gpuarray/dnn_pool_grad.c
@@ -111,11 +111,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
     s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i));
  }
-#if CUDNN_VERSION >= 5000
  err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, CUDNN_PROPAGATE_NAN, ndims, w, p, s);
-#else
-  err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, ndims, w, p, s);
-#endif
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err));

--- a/theano/gpuarray/tests/test_dnn.py
+++ b/theano/gpuarray/tests/test_dnn.py
@@ -604,9 +604,6 @@ class TestDnnInferShapes(utt.InferShapeTester):
                                        [conv_modes[0]])),
                          testcase_func_name=utt.custom_name_func)
    def test_conv(self, algo, border_mode, conv_mode):
-        if algo == 'winograd' and dnn.version(raises=False) < 5000:
-            raise SkipTest(dnn.dnn_available.msg)
        self._test_conv(T.tensor4('img'),
                        T.tensor4('kerns'),
                        T.tensor4('out'),
@@ -1361,8 +1358,6 @@ class test_SoftMax(test_nnet.test_SoftMax):
 def test_dnn_batchnorm_train():
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
-    if dnn.version(raises=False) < 5000:
-        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()
    for mode in ('per-activation', 'spatial'):
@@ -1416,8 +1411,6 @@ def test_dnn_batchnorm_train():
 def test_batchnorm_inference():
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
-    if dnn.version(raises=False) < 5000:
-        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()
    for mode in ('per-activation', 'spatial'):

--- a/theano/gpuarray/type.py
+++ b/theano/gpuarray/type.py
@@ -68,7 +68,6 @@ def reg_context(name, ctx):
    if not isinstance(ctx, gpuarray.GpuContext):
        raise TypeError("context is not GpuContext")
    _context_reg[name] = ctx
-    _props_map[ctx] = dict()
 def get_context(name):
@@ -97,26 +96,6 @@ def list_contexts():
    """
    return _context_reg.keys()
-# Mappings of properties to contexts.  Please never use this if you
-# can avoid it.
-# This is basically a way to store "global" variables that depend on
-# the context.
-_props_map = {}
-def _get_props(name):
-    ctx = get_context(name)
-    return _props_map[ctx]
-def get_prop(name, k):
-    return _get_props(name)[k]
-def set_prop(name, k, v):
-    _get_props(name)[k] = v
 # Private method
 def _name_for_ctx(ctx):