Merge pull request #5357 from nouiz/abergeron-dnn_mem

Select the dnn convolution algorithm using actually available memory

Merge pull request #5357 from nouiz/abergeron-dnn_mem
c6ffa460 · Pascal Lamblin · GitHub · 59f671e2 · ec1ddad3 · c6ffa460
--- a/.jenkins/jenkins_test2.sh
+++ b/.jenkins/jenkins_test2.sh
@@ -8,7 +8,7 @@ set -x
 # Anaconda python
 export PATH=/usr/local/miniconda2/bin:$PATH

-# CUDA                                                                          
+# CUDA
 export PATH=/usr/local/cuda/bin:$PATH
 export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
 export LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH
@@ -38,13 +38,13 @@ echo "===== Testing gpuarray backend"

 GPUARRAY_CONFIG="Release"
 DEVICE=cuda0
-LIBDIR=~/tmp/local
+LIBDIR=${WORKSPACE}/local

 # Make fresh clones of libgpuarray (with no history since we don't need it)
 rm -rf libgpuarray
 git clone --depth 1 "https://github.com/Theano/libgpuarray.git"

-# Clean up previous installs (to make sure no old files are left) 
+# Clean up previous installs (to make sure no old files are left)
 rm -rf $LIBDIR
 mkdir $LIBDIR

@@ -52,25 +52,25 @@ mkdir $LIBDIR
 mkdir libgpuarray/build
 (cd libgpuarray/build && cmake .. -DCMAKE_BUILD_TYPE=${GPUARRAY_CONFIG} -DCMAKE_INSTALL_PREFIX=$LIBDIR && make)

-# Finally install                                                               
+# Finally install
 (cd libgpuarray/build && make install)

 # Export paths
-export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$LIBDIR/lib64/
-export LIBRARY_PATH=$LIBRARY_PATH:$LIBDIR/lib64/
 export CPATH=$CPATH:$LIBDIR/include
 export LIBRARY_PATH=$LIBRARY_PATH:$LIBDIR/lib
 export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$LIBDIR/lib

-# Build the pygpu modules                                                       
+# Build the pygpu modules
 (cd libgpuarray && python setup.py build_ext --inplace -I$LIBDIR/include -L$LIBDIR/lib)
 ls $LIBDIR
 mkdir $LIBDIR/lib/python
 export PYTHONPATH=${PYTHONPATH}:$LIBDIR/lib/python
-# Then install                                                                  
+# Then install
 (cd libgpuarray && python setup.py install --home=$LIBDIR)

-# Testing theano (the gpuarray parts)                                           
+python -c 'import pygpu; print(pygpu.__file__)'
+
+# Testing theano (the gpuarray parts)
 THEANO_GPUARRAY_TESTS="theano/gpuarray/tests \
                       theano/sandbox/tests/test_rng_mrg.py:test_consistency_GPUA_serial \
                       theano/sandbox/tests/test_rng_mrg.py:test_consistency_GPUA_parallel \

--- a/theano/gpuarray/__init__.py
+++ b/theano/gpuarray/__init__.py
@@ -27,7 +27,7 @@ except ImportError:
 # This is for documentation not to depend on the availability of pygpu
 from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
                   GpuArraySharedVariable, gpuarray_shared_constructor,
-                   reg_context, get_context, ContextNotDefined, _get_props)
+                   reg_context, get_context, ContextNotDefined)
 from .basic_ops import as_gpuarray_variable
 from . import fft, dnn, opt, nerv, extra_ops, multinomial, reduction

@@ -46,63 +46,66 @@ def init_dev(dev, name=None):
    if not config.cxx:
        raise RuntimeError("The new gpu-backend need a c++ compiler.")
    if (pygpu.version.major, pygpu.version.minor) < (0, 6):
-        raise ValueError("Your installed version of pygpu is too old, please upgrade to 0.6 or later")
+        raise ValueError(
+            "Your installed version of pygpu is too old, please upgrade to 0.6 or later")
+    # This is for the C headers API
+    if pygpu.gpuarray.api_version()[0] < 0:
+        raise ValueError(
+            "Your installed libgpuarray is too old, please update")
    if dev not in init_dev.devmap:
-        ctx = pygpu.init(dev,
-                         disable_alloc_cache=config.gpuarray.preallocate < 0,
-                         single_stream=config.gpuarray.single_stream,
-                         sched=config.gpuarray.sched)
-        init_dev.devmap[dev] = ctx
+        context = pygpu.init(
+            dev,
+            disable_alloc_cache=config.gpuarray.preallocate < 0,
+            single_stream=config.gpuarray.single_stream,
+            sched=config.gpuarray.sched)
+        context.dev = dev
+        init_dev.devmap[dev] = context
+        reg_context(name, context)
+
+        if dev.startswith('cuda'):
+            avail = dnn.dnn_available(name)
+            if avail:
+                context.cudnn_handle = dnn._make_handle(context)
+            if config.print_active_device:
+                if avail:
+                    print("Using cuDNN version %d on context %s" % (dnn.version(), name),
+                          file=sys.stderr)
+                else:
+                    print("Can not use cuDNN on context %s: %s" % (name, dnn.dnn_available.msg),
+                          file=sys.stderr)
        if config.gpuarray.preallocate < 0:
            print("Disabling allocation cache on %s" % (dev,))
        elif config.gpuarray.preallocate > 0:
            MB = (1024 * 1024)
            if config.gpuarray.preallocate <= 1:
-                gmem = min(config.gpuarray.preallocate, 0.95) * ctx.total_gmem
+                gmem = min(config.gpuarray.preallocate, 0.95) * context.total_gmem
            else:
                gmem = config.gpuarray.preallocate * MB
+            if gmem > context.free_gmem - 50 * MB:
+                print(
+                    "WARNING: Preallocating too much memory can prevent cudnn and cublas from working properly")
+
            # This will allocate and immediatly free an object of size gmem
            # which will reserve that amount of memory on the GPU.
-            pygpu.empty((gmem,), dtype='int8', context=ctx)
+            pygpu.empty((gmem,), dtype='int8', context=context)
            if config.print_active_device:
                print("Preallocating %d/%d Mb (%f) on %s" %
-                      (gmem//MB, ctx.total_gmem//MB, gmem/ctx.total_gmem, dev),
+                      (gmem//MB, context.total_gmem//MB,
+                       gmem/context.total_gmem, dev),
                      file=sys.stderr)
-    context = init_dev.devmap[dev]
+    else:
+        context = init_dev.devmap[dev]
    # This will map the context name to the real context object.
-    reg_context(name, context)
    if config.print_active_device:
        try:
-            pcibusid = context.pcibusid
+            pcibusid = '(' + context.pcibusid + ')'
        except pygpu.gpuarray.UnsupportedException:
-            pcibusid = '(unsupported for device %s)' % dev
-        except Exception:
-            warnings.warn('Unable to get PCI Bus ID. Please consider updating libgpuarray and pygpu.')
-            pcibusid = 'unknown'
+            pcibusid = ''

-        print("Mapped name %s to device %s: %s" %
-              (name, dev, context.devname),
+        print("Mapped name %s to device %s: %s %s" %
+              (name, dev, context.devname, pcibusid),
              file=sys.stderr)
-        print("PCI Bus ID:", pcibusid, file=sys.stderr)
    pygpu_activated = True
-    ctx_props = _get_props(name)
-    ctx_props['dev'] = dev
-    if dev.startswith('cuda'):
-        if 'cudnn_version' not in ctx_props:
-            try:
-                ctx_props['cudnn_version'] = dnn.version()
-                # 5200 should not print warning with cudnn 5.1 final.
-                if ctx_props['cudnn_version'] >= 5200:
-                    warnings.warn("Your cuDNN version is more recent than "
-                                  "Theano. If you encounter problems, try "
-                                  "updating Theano or downgrading cuDNN to "
-                                  "version 5.1.")
-                if config.print_active_device:
-                    print("Using cuDNN version %d on context %s" %
-                          (ctx_props['cudnn_version'], name), file=sys.stderr)
-                ctx_props['cudnn_handle'] = dnn._make_handle(context)
-            except Exception:
-                pass

 # This maps things like 'cuda0' to the context object on that device.
 init_dev.devmap = {}
@@ -119,7 +122,8 @@ if pygpu:
        elif (config.init_gpu_device.startswith('cuda') or
              config.init_gpu_device.startswith('opencl')):
            if config.device != 'cpu':
-                raise ValueError('you must set device=cpu to use init_gpu_device.')
+                raise ValueError(
+                    'you must set device=cpu to use init_gpu_device.')
            if config.contexts != '':
                print("Using contexts will make init_gpu_device act like device and move all computations by default, which might not be what you want.")
            init_dev(config.init_gpu_device)
@@ -147,4 +151,5 @@ else:
            config.device.startswith('opencl') or
            config.device.startswith('cuda') or
            config.contexts != ''):
-        error("pygpu was configured but could not be imported or is too old (version 0.6 or higher required)", exc_info=True)
+        error("pygpu was configured but could not be imported or is too old (version 0.6 or higher required)",
+              exc_info=True)
--- a/theano/gpuarray/dnn.py
+++ b/theano/gpuarray/dnn.py
@@ -30,7 +30,7 @@ from theano.tensor.signal.pool import (
    Pool, MaxPoolGrad, AveragePoolGrad)
 from . import pygpu
 from .type import (get_context, gpu_context_type, list_contexts,
-                   get_prop, set_prop, GpuArraySharedVariable)
+                   GpuArraySharedVariable)
 from .basic_ops import (as_gpuarray_variable, infer_context_name,
                        gpu_contiguous, gpu_alloc_empty,
                        empty_like, GpuArrayType)
@@ -59,12 +59,12 @@ def _dnn_lib():
        lib_name = ctypes.util.find_library('cudnn')
        if lib_name is None and sys.platform == 'win32':
            # Update these names when new versions of cudnn are supported.
-            for name in ['cudnn64_5.dll', 'cudnn64_4.dll']:
+            for name in ['cudnn64_5.dll']:
                lib_name = ctypes.util.find_library(name)
                if lib_name:
                    break
        if lib_name is None:
-            raise RuntimeError('Could not find cudnn library (looked for v4 and v5[.1])')
+            raise RuntimeError('Could not find cudnn library (looked for v5[.1])')
        _dnn_lib.handle = ctypes.cdll.LoadLibrary(lib_name)
        cudnn = _dnn_lib.handle
        cudnn.cudnnCreate.argtypes = [ctypes.POINTER(ctypes.c_void_p)]
@@ -109,10 +109,15 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
 """

-    params = ["-l", "cudnn", "-I" + os.path.dirname(__file__)]
+    path_wrapper = "\"" if os.name == 'nt' else ""
+    params = ["-l", "cudnn"]
+    params.extend(['-I%s%s%s' % (path_wrapper, os.path.dirname(__file__), path_wrapper)])
    if config.dnn.include_path:
-        params.append("-I" + config.dnn.include_path)
+        params.extend(['-I%s%s%s' % (path_wrapper, config.dnn.include_path, path_wrapper)])
    if config.dnn.library_path:
-        params.append("-L" + config.dnn.library_path)
+        params.extend(['-L%s%s%s' % (path_wrapper, config.dnn.library_path, path_wrapper)])
+    if config.nvcc.compiler_bindir:
+        params.extend(['--compiler-bindir',
+                       '%s%s%s' % (path_wrapper, config.nvcc.compiler_bindir, path_wrapper)])
    # Do not run here the test program. It would run on the
    # default gpu, not the one selected by the user. If mixed
    # GPU are installed or if the GPUs are configured in
@@ -129,9 +135,14 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {

 def _dnn_check_version():
    v = version()
-    if v < 4007:
+    if v < 5000:
        return False, "cuDNN version is too old. Update to v5, was %d." % v
-
+    # Warn from 5200 upwards only, so that cuDNN 5.1 final stays silent.
+    if v >= 5200:
+        warnings.warn("Your cuDNN version is more recent than "
+                      "Theano. If you encounter problems, try "
+                      "updating Theano or downgrading cuDNN to "
+                      "version 5.1.")
    return True, None


@@ -209,14 +220,13 @@ class DnnBase(COp):
        return node.outputs[0].type.context_name

    def get_params(self, node):
-        try:
-            return get_prop(self.dnn_context(node), 'cudnn_handle_param')
-        except KeyError:
-            pass
-        ptr = get_prop(self.dnn_context(node), 'cudnn_handle').value
-        res = handle_type.make_value(ptr)
-        set_prop(self.dnn_context(node), 'cudnn_handle_param', res)
-        return res
+        # The handle param is built on first use and cached on the context.
+        context = get_context(self.dnn_context(node))
+        try:
+            return context.cudnn_handle_param
+        except AttributeError:
+            context.cudnn_handle_param = handle_type.make_value(context.cudnn_handle.value)
+        return context.cudnn_handle_param

    def __init__(self, files=None, c_func=None):
        if files is None:
@@ -301,7 +311,7 @@ def version(raises=True):
    """
    if not dnn_present():
        if raises:
-            raise Exception(
+            raise RuntimeError(
                "We can't determine the cudnn version as it is not available",
                dnn_available.msg)
        else:
@@ -500,10 +510,6 @@ class GpuDnnConv(DnnBase):
        if self.inplace:
            self.destroy_map = {0: [2]}

-        if version() < 5000 and self.algo == 'winograd':
-            raise RuntimeError("cuDNN winograd convolution requires "
-                               "cuDNN v5 or more recent")
-
        assert self.algo in ['none', 'small', 'large', 'fft', 'fft_tiling',
                             'winograd', 'guess_once', 'guess_on_shape_change',
                             'time_once', 'time_on_shape_change']
@@ -524,9 +530,9 @@ class GpuDnnConv(DnnBase):
            defs.append(('CONV_INPLACE', '1'))

        alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM'
-        if self.algo == 'none':  # 3d (at least in v4)
+        if self.algo == 'none':  # 3d
            alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM'
-        elif self.algo == 'small':  # 3d (at least in v4)
+        elif self.algo == 'small':  # 3d
            alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM'
        elif self.algo == 'large':
            alg = 'CUDNN_CONVOLUTION_FWD_ALGO_GEMM'
@@ -534,10 +540,9 @@ class GpuDnnConv(DnnBase):
            alg = 'CUDNN_CONVOLUTION_FWD_ALGO_DIRECT'
        elif self.algo == 'fft':
            alg = 'CUDNN_CONVOLUTION_FWD_ALGO_FFT'
-        elif self.algo == 'fft_tiling':  # 3d (not in v4, in v5)
+        elif self.algo == 'fft_tiling':  # 3d
            alg = 'CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING'
        elif self.algo == 'winograd':
-            # need v5
            alg = 'CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD'
        defs.append(('CONV_ALGO', alg))

@@ -571,10 +576,6 @@ class GpuDnnConv(DnnBase):
        if img.type.ndim == 5 and self.algo in ['large', 'fft']:
            raise ValueError("convolution algo %s can't be used for "
                             "3d convolutions", (self.algo,))
-        if (img.type.ndim == 5 and
-                self.algo in ['fft_tiling'] and
-                version() < 5000):
-            raise ValueError("3d convolution algo fft_tiling need cudnn v5")

        if (not isinstance(desc.type, CDataType) or
                desc.type.ctype != 'cudnnConvolutionDescriptor_t'):
@@ -700,13 +701,13 @@ class GpuDnnConvGradW(DnnBase):
            defs.append(('CONV_INPLACE', '1'))

        alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0'
-        if self.algo == 'none':  # 3d in at least v4
+        if self.algo == 'none':  # 3d
            alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0'
        if self.algo == 'deterministic':
            alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1'
        if self.algo == 'fft':
            alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT'
-        if self.algo == 'small':  # 3d in at least v4
+        if self.algo == 'small':  # 3d
            # non-deterministic, small workspace
            alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3'
        if self.algo in ['guess_once', 'guess_on_shape_change',
@@ -793,10 +794,6 @@ class GpuDnnConvGradI(DnnBase):
            algo = config.dnn.conv.algo_bwd_data
        self.algo = algo

-        if version() < 5000 and self.algo == 'winograd':
-            raise RuntimeError("cuDNN's winograd convolution requires cuDNN "
-                               "v5 or more recent")
-
        assert self.algo in ['none', 'deterministic', 'fft', 'fft_tiling',
                             'winograd', 'guess_once', 'guess_on_shape_change',
                             'time_once', 'time_on_shape_change']
@@ -832,17 +829,16 @@ class GpuDnnConvGradI(DnnBase):
            defs.append(('CONV_INPLACE', '1'))

        alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_0'
-        if self.algo == 'none':  # 3d at least v4
+        if self.algo == 'none':  # 3d
            alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_0'
-        elif self.algo == 'deterministic':  # 3d at least v4
+        elif self.algo == 'deterministic':  # 3d
            alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_1'
        elif self.algo == 'fft':
            alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT'
-        elif self.algo == 'fft_tiling':  # 3d not v4, since v5
+        elif self.algo == 'fft_tiling':  # 3d
            # big workspace but less than fft
            alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING'
        elif self.algo == 'winograd':
-            # need v5
            alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD'

        if self.algo in ['guess_once', 'guess_on_shape_change',
@@ -877,10 +873,6 @@ class GpuDnnConvGradI(DnnBase):
        if kern.type.ndim == 5 and self.algo in ['fft']:
            raise ValueError("convolution algo %s can't be used for "
                             "3d convolutions", (self.algo,))
-        if (kern.type.ndim == 5 and
-                self.algo == 'fft_tiling' and
-                version() < 5000):
-            raise ValueError("3d convolution algo fft_tiling need cudnn v5")

        if (not isinstance(desc.type, CDataType) or
                desc.type.ctype != 'cudnnConvolutionDescriptor_t'):
@@ -1316,11 +1308,7 @@ class GpuDnnPoolDesc(Op):
  static const int pad[%(nd)d] = {%(pad)s};
  static const int str[%(nd)d] = {%(str)s};

-#if CUDNN_VERSION >= 5000
    err = cudnnSetPoolingNdDescriptor(%(desc)s, %(mode_flag)s, CUDNN_PROPAGATE_NAN, %(nd)d, win, pad, str);
-#else
-    err = cudnnSetPoolingNdDescriptor(%(desc)s, %(mode_flag)s, %(nd)d, win, pad, str);
-#endif

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: %%s",
@@ -1664,8 +1652,6 @@ class GpuDnnBatchNorm(DnnBase):
        DnnBase.__init__(self, ['dnn_batchnorm_base.c', 'dnn_batchnorm.c'],
                         'dnn_batchnorm_op')

-        if version() < 5000:
-            raise RuntimeError("cuDNN Batch Normalization requires cuDNN v5 or later")
        assert (mode in ('per-activation', 'spatial'))
        self.mode = mode

@@ -1724,8 +1710,6 @@ class GpuDnnBatchNormInference(DnnBase):
        DnnBase.__init__(self, ['dnn_batchnorm_base.c', 'dnn_batchnorm_inf.c'],
                         'dnn_batchnorm_op')

-        if version() < 5000:
-            raise RuntimeError("cuDNN Batch Normalization requires cuDNN v5 or later")
        assert (mode in ('per-activation', 'spatial'))
        self.mode = mode

@@ -1788,8 +1772,6 @@ class GpuDnnBatchNormGrad(DnnBase):
        DnnBase.__init__(self, ['dnn_batchnorm_base.c', 'dnn_batchnorm_grad.c'],
                         'dnn_batchnorm_grad')

-        if version() < 5000:
-            raise RuntimeError("cuDNN Batch Normalization requires cuDNN v5 or later")
        assert (mode in ('per-activation', 'spatial'))
        self.mode = mode


--- a/theano/gpuarray/dnn_base.c
+++ b/theano/gpuarray/dnn_base.c
@@ -115,11 +115,7 @@ c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) {
  if (nd < 3)
    nd = 3;

-#if CUDNN_VERSION >= 5000
    err = cudnnSetFilterNdDescriptor(desc, dt, CUDNN_TENSOR_NCHW, nd, dims);
-#else
-    err = cudnnSetFilterNdDescriptor(desc, dt, nd, dims);
-#endif

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,

--- a/theano/gpuarray/dnn_fwd.c
+++ b/theano/gpuarray/dnn_fwd.c
@@ -98,12 +98,39 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
 #endif

  if (!reuse_algo) {
+    size_t free;
+
+    int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
+    if (err2 != GA_NO_ERROR) {
+      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
+                   "memory information on the GPU");
+      cuda_exit(c->ctx);
+      return 1;
+    }
+
+    // Guess 4Mb if the info is not available
+    if (free == 0) free = 4 * 1024 * 1024;
+
 #ifdef CHOOSE_TIME
    int count;
    cudnnConvolutionFwdAlgoPerf_t choice;
-    err = cudnnFindConvolutionForwardAlgorithm(
-      _handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
-      desc, APPLY_SPECIFIC(output), 1, &count, &choice);
+    gpudata *tmpmem;
+
+    tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
+    if (tmpmem == NULL) {
+      PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
+      // Release the context here too, as the error path above does.
+      cuda_exit(c->ctx);
+      return -1;
+    }
+    // We don't sync the buffer as we don't care about the values.
+    err = cudnnFindConvolutionForwardAlgorithmEx(
+      _handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
+      APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
+      desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output),
+      1, &count, &choice, *(void **)tmpmem,
+      free);
+    gpudata_release(tmpmem);

    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
@@ -114,16 +141,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
    }
    algo = choice.algo;
 #else
-    size_t free;
-    int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_FREE_GMEM, &free);
-
-    if (err2 != GA_NO_ERROR) {
-      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
-                   "memory information on the GPU");
-      cuda_exit(c->ctx);
-      return 1;
-    }
-
    err = cudnnGetConvolutionForwardAlgorithm(
      _handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
      desc, APPLY_SPECIFIC(output),

--- a/theano/gpuarray/dnn_gi.c
+++ b/theano/gpuarray/dnn_gi.c
@@ -140,13 +140,38 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
 #endif

  if (!reuse_algo) {
+    size_t free;
+    int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
+
+    if (err2 != GA_NO_ERROR) {
+      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
+                   "memory information on the GPU");
+      cuda_exit(c->ctx);
+      return 1;
+    }
+
+    // Guess 4Mb if the info is not available
+    if (free == 0) free = 4 * 1024 * 1024;
+
 #ifdef CHOOSE_TIME
    int count;
    cudnnConvolutionBwdDataAlgoPerf_t choice;
+    gpudata *tmpmem;

-    err = cudnnFindConvolutionBackwardDataAlgorithm(
-      _handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
-      APPLY_SPECIFIC(input), 1, &count, &choice);
+    tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
+    if (tmpmem == NULL) {
+      PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
+      // Release the context here too, as the error path above does.
+      cuda_exit(c->ctx);
+      return -1;
+    }
+
+    err = cudnnFindConvolutionBackwardDataAlgorithmEx(
+      _handle, APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
+      APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
+      APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input),
+      1, &count, &choice, *(void **)tmpmem, free);
+    gpudata_release(tmpmem);

    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
@@ -157,16 +182,6 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,

    algo = choice.algo;
 #else
-    size_t free;
-    int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_FREE_GMEM, &free);
-
-    if (err2 != GA_NO_ERROR) {
-      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
-                   "memory information on the GPU");
-      cuda_exit(c->ctx);
-      return 1;
-    }
-
    err = cudnnGetConvolutionBackwardDataAlgorithm(
      _handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
      desc, APPLY_SPECIFIC(input),

--- a/theano/gpuarray/dnn_gw.c
+++ b/theano/gpuarray/dnn_gw.c
@@ -140,13 +140,38 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
 #endif

  if (!reuse_algo) {
+    size_t free;
+
+    int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
+    if (err2 != GA_NO_ERROR) {
+      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
+                   "memory information on the GPU");
+      cuda_exit(c->ctx);
+      return 1;
+    }
+
+    // Guess 4Mb if the info is not available
+    if (free == 0) free = 4 * 1024 * 1024;
+
 #ifdef CHOOSE_TIME
    int count;
    cudnnConvolutionBwdFilterAlgoPerf_t choice;
+    gpudata *tmpmem;

-    err = cudnnFindConvolutionBackwardFilterAlgorithm(
-      _handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
-      APPLY_SPECIFIC(kerns), 1, &count, &choice);
+    tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
+    if (tmpmem == NULL) {
+      PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
+      // Release the context here too, as the error path above does.
+      cuda_exit(c->ctx);
+      return -1;
+    }
+
+    err = cudnnFindConvolutionBackwardFilterAlgorithmEx(
+      _handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
+      APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
+      APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns),
+      1, &count, &choice, *(void **)tmpmem, free);
+    gpudata_release(tmpmem);

    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
@@ -158,16 +183,6 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,

    algo = choice.algo;
 #else
-    size_t free;
-    int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_FREE_GMEM, &free);
-
-    if (err2 != GA_NO_ERROR) {
-      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
-                   "memory information on the GPU");
-      cuda_exit(c->ctx);
-      return 1;
-    }
-
    err = cudnnGetConvolutionBackwardFilterAlgorithm(
      _handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
      desc, APPLY_SPECIFIC(kerns),

--- a/theano/gpuarray/dnn_pool.c
+++ b/theano/gpuarray/dnn_pool.c
@@ -71,11 +71,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
     s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i));
  }

-#if CUDNN_VERSION >= 5000
  err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, CUDNN_PROPAGATE_NAN, ndims, w, p, s);
-#else
-  err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, ndims, w, p, s);
-#endif

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err));

--- a/theano/gpuarray/dnn_pool_grad.c
+++ b/theano/gpuarray/dnn_pool_grad.c
@@ -111,11 +111,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
     s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i));
  }

-#if CUDNN_VERSION >= 5000
  err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, CUDNN_PROPAGATE_NAN, ndims, w, p, s);
-#else
-  err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, ndims, w, p, s);
-#endif

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err));

--- a/theano/gpuarray/tests/test_dnn.py
+++ b/theano/gpuarray/tests/test_dnn.py
@@ -604,9 +604,6 @@ class TestDnnInferShapes(utt.InferShapeTester):
                                        [conv_modes[0]])),
                          testcase_func_name=utt.custom_name_func)
    def test_conv(self, algo, border_mode, conv_mode):
-        if algo == 'winograd' and dnn.version(raises=False) < 5000:
-            raise SkipTest(dnn.dnn_available.msg)
-
        self._test_conv(T.tensor4('img'),
                        T.tensor4('kerns'),
                        T.tensor4('out'),
@@ -1361,8 +1358,6 @@ class test_SoftMax(test_nnet.test_SoftMax):
 def test_dnn_batchnorm_train():
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
-    if dnn.version(raises=False) < 5000:
-        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()

    for mode in ('per-activation', 'spatial'):
@@ -1416,8 +1411,6 @@ def test_dnn_batchnorm_train():
 def test_batchnorm_inference():
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
-    if dnn.version(raises=False) < 5000:
-        raise SkipTest("batch normalization requires cudnn v5+")
    utt.seed_rng()

    for mode in ('per-activation', 'spatial'):

--- a/theano/gpuarray/type.py
+++ b/theano/gpuarray/type.py
@@ -68,7 +68,6 @@ def reg_context(name, ctx):
    if not isinstance(ctx, gpuarray.GpuContext):
        raise TypeError("context is not GpuContext")
    _context_reg[name] = ctx
-    _props_map[ctx] = dict()


 def get_context(name):
@@ -97,26 +96,6 @@ def list_contexts():
    """
    return _context_reg.keys()

-# Mappings of properties to contexts.  Please never use this if you
-# can avoid it.
-
-# This is basically a way to store "global" variables that depend on
-# the context.
-_props_map = {}
-
-
-def _get_props(name):
-    ctx = get_context(name)
-    return _props_map[ctx]
-
-
-def get_prop(name, k):
-    return _get_props(name)[k]
-
-
-def set_prop(name, k, v):
-    _get_props(name)[k] = v
-

 # Private method
 def _name_for_ctx(ctx):