Merge pull request #4344 from mgermain/cudnn5

Added cuDNN v5 support & Added optional dependencies to setup.py Conflicts: .travis.yml doc/install.txt

Merge pull request #4344 from mgermain/cudnn5
ef0fc56e · Frédéric Bastien · Frederic Bastien · 02453383 · ef0fc56e · ef0fc56e
--- a/.travis.yml
+++ b/.travis.yml
@@ -37,6 +37,7 @@ install:
  - source activate pyenv
  - if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then pip install pydot; fi
  - pip install . --no-deps
+  - pip install nose-parameterized==0.5.0
 # command to run tests
 env:

--- a/doc/install.txt
+++ b/doc/install.txt
@@ -49,7 +49,7 @@ instructions below for detailed installation steps):
 The following libraries and software are optional:
-    `nose <http://somethingaboutorange.com/mrl/projects/nose/>`_ >= 1.3.0
+    `nose <http://nose.readthedocs.org/en/latest/>`_ >= 1.3.0 and `nose-parameterized <https://pypi.python.org/pypi/nose-parameterized/>`_ >= 0.5.0
        Recommended, to run Theano's test-suite.
    `Sphinx <http://sphinx.pocoo.org/>`_ >= 0.5.1, `pygments <http://pygments.org/>`_

--- a/requirement-rtd.txt
+++ b/requirement-rtd.txt
 sphinx>=1.3.0
+pygments
 nose>=1.3.0
+nose-parameterized>=0.5.0
--- a/setup.py
+++ b/setup.py
@@ -163,6 +163,11 @@ def do_setup():
          packages=find_packages(),
          # 1.7.0 give too much warning related to numpy.diagonal.
          install_requires=['numpy>=1.7.1', 'scipy>=0.11', 'six>=1.9.0'],
+          # pygments is a dependency for Sphinx code highlight
+          extras_require={
+              'test': ['nose>=1.3.0', 'nose-parameterized>=0.5.0'],
+              'doc': ['Sphinx>=0.5.1', 'pygments']
+          },
          package_data={
              '': ['*.txt', '*.rst', '*.cu', '*.cuh', '*.c', '*.sh', '*.pkl',
                   '*.h', '*.cpp', 'ChangeLog'],

--- a/theano/configdefaults.py
+++ b/theano/configdefaults.py
@@ -286,6 +286,20 @@ def safe_no_dnn_algo_bwd(algo):
            '`dnn.conv.algo_bwd_filter` and `dnn.conv.algo_bwd_data` instead.')
    return True
+# Those are the supported algorithm by Theano,
+# The tests will reference those lists.
+SUPPORTED_DNN_CONV_ALGO_FWD = ('small', 'none', 'large', 'fft', 'fft_tiling',
+                               'winograd', 'guess_once', 'guess_on_shape_change',
+                               'time_once', 'time_on_shape_change')
+SUPPORTED_DNN_CONV_ALGO_BWD_DATA = ('none', 'deterministic', 'fft', 'fft_tiling',
+                                    'winograd', 'guess_once', 'guess_on_shape_change',
+                                    'time_once', 'time_on_shape_change')
+SUPPORTED_DNN_CONV_ALGO_BWD_FILTER = ('none', 'deterministic', 'fft', 'small',
+                                      'guess_once', 'guess_on_shape_change',
+                                      'time_once', 'time_on_shape_change')
 AddConfigVar('dnn.conv.algo_bwd',
             "This flag is deprecated; use dnn.conv.algo_bwd_data and "
             "dnn.conv.algo_bwd_filter.",
@@ -295,26 +309,20 @@ AddConfigVar('dnn.conv.algo_bwd',
 AddConfigVar('dnn.conv.algo_fwd',
             "Default implementation to use for CuDNN forward convolution.",
-             EnumStr('small', 'none', 'large', 'fft', 'fft_tiling',
+             EnumStr(*SUPPORTED_DNN_CONV_ALGO_FWD),
-                     'guess_once', 'guess_on_shape_change',
-                     'time_once', 'time_on_shape_change'),
             in_c_key=False)
 AddConfigVar('dnn.conv.algo_bwd_data',
             "Default implementation to use for CuDNN backward convolution to "
             "get the gradients of the convolution with regard to the inputs.",
-             EnumStr('none', 'deterministic', 'fft', 'fft_tiling',
+             EnumStr(*SUPPORTED_DNN_CONV_ALGO_BWD_DATA),
-                     'guess_once', 'guess_on_shape_change', 'time_once',
-                     'time_on_shape_change'),
             in_c_key=False)
 AddConfigVar('dnn.conv.algo_bwd_filter',
             "Default implementation to use for CuDNN backward convolution to "
             "get the gradients of the convolution with regard to the "
             "filters.",
-             EnumStr('none', 'deterministic', 'fft', 'small', 'guess_once',
+             EnumStr(*SUPPORTED_DNN_CONV_ALGO_BWD_FILTER),
-                     'guess_on_shape_change', 'time_once',
-                     'time_on_shape_change'),
             in_c_key=False)
 AddConfigVar('dnn.conv.precision',

--- a/theano/sandbox/cuda/dnn.py
+++ b/theano/sandbox/cuda/dnn.py
--- a/theano/sandbox/cuda/dnn_base.c
+++ b/theano/sandbox/cuda/dnn_base.c
@@ -54,8 +54,11 @@ c_set_filterNd(CudaNdarray *var, cudnnFilterDescriptor_t desc) {
    return -1;
  }
  int dim = CudaNdarray_NDIM(var);
-  cudnnStatus_t err = cudnnSetFilterNdDescriptor(desc, CUDNN_DATA_FLOAT, dim,
+  cudnnStatus_t err = cudnnSetFilterNdDescriptor_v4(desc,
-                                                 CudaNdarray_HOST_DIMS(var));
+                                                    CUDNN_DATA_FLOAT,
+                                                    CUDNN_TENSOR_NCHW,
+                                                    dim,
+                                                    CudaNdarray_HOST_DIMS(var));
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
 		 "Could not set filter descriptor: %s."

--- a/theano/sandbox/cuda/dnn_fwd.c
+++ b/theano/sandbox/cuda/dnn_fwd.c
@@ -179,8 +179,8 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
      int upscale[2];
      cudnnConvolutionMode_t mode;
      cudnnDataType_t data_type;
-      err = cudnnGetConvolutionNdDescriptor_v3(desc, 2, &nd, pad, stride,
+      err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
-                                               upscale, &mode, &data_type);
+                                            upscale, &mode, &data_type);
      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError,
@@ -224,6 +224,19 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
                                                  APPLY_SPECIFIC(output),
                                                  chosen_algo,
                                                  &worksize);
+    if (err == CUDNN_STATUS_NOT_SUPPORTED) {
+      // Fallback to none algo if not supported
+      // TODO: Print a warning
+      chosen_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
+      err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
+                                                    APPLY_SPECIFIC(input),
+                                                    APPLY_SPECIFIC(kerns),
+                                                    desc,
+                                                    APPLY_SPECIFIC(output),
+                                                    chosen_algo,
+                                                    &worksize);
+    }
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "GpuDnnConv: error getting worksize: %s",

--- a/theano/sandbox/cuda/dnn_gi.c
+++ b/theano/sandbox/cuda/dnn_gi.c
@@ -178,8 +178,8 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
      int upscale[2];
      cudnnConvolutionMode_t mode;
      cudnnDataType_t data_type;
-      err = cudnnGetConvolutionNdDescriptor_v3(desc, 2, &nd, pad, stride,
+      err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
-                                               upscale, &mode, &data_type);
+                                            upscale, &mode, &data_type);
      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError,
@@ -237,7 +237,7 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
      return 1;
    // Perform the convolution
-    err = cudnnConvolutionBackwardData_v3(
+    err = cudnnConvolutionBackwardData(
      _handle,
      (void *)&alpha,
      APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(kerns),

--- a/theano/sandbox/cuda/dnn_gw.c
+++ b/theano/sandbox/cuda/dnn_gw.c
@@ -173,8 +173,8 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
      int upscale[2];
      cudnnConvolutionMode_t mode;
      cudnnDataType_t data_type;
-      err = cudnnGetConvolutionNdDescriptor_v3(desc, 2, &nd, pad, stride,
+      err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
-                                               upscale, &mode, &data_type);
+                                            upscale, &mode, &data_type);
      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError,
@@ -221,7 +221,7 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
      return 1;
    // Perform the convolution
-    err = cudnnConvolutionBackwardFilter_v3(
+    err = cudnnConvolutionBackwardFilter(
      _handle,
      (void *)&alpha,
      APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(input),

--- a/theano/sandbox/cuda/tests/test_dnn.py
+++ b/theano/sandbox/cuda/tests/test_dnn.py
@@ -392,8 +392,9 @@ def test_pooling_with_tensor_vars():
 def test_old_pool_interface():
-    if not cuda.dnn.dnn_available():
+    if not cuda.dnn.dnn_available() or cuda.dnn.version() > (5000, 5000):
        raise SkipTest(cuda.dnn.dnn_available.msg)
    testfile_dir = os.path.dirname(os.path.realpath(__file__))
    fname = 'old_pool_interface.pkl'
    with open(os.path.join(testfile_dir, fname), 'rb') as fp:

--- a/theano/sandbox/gpuarray/conv_desc.c
+++ b/theano/sandbox/gpuarray/conv_desc.c
@@ -35,7 +35,7 @@ int APPLY_SPECIFIC(conv_desc)(PyArrayObject *filt_shp,
    return -1;
  }
-  err = cudnnSetConvolutionNdDescriptor_v3(*desc, NB_DIMS, pad, strides,
+  err = cudnnSetConvolutionNdDescriptor(*desc, NB_DIMS, pad, strides,
-                                           upscale, CONV_MODE, PRECISION);
+                                        upscale, CONV_MODE, PRECISION);
  return 0;
 }
--- a/theano/sandbox/gpuarray/dnn.py
+++ b/theano/sandbox/gpuarray/dnn.py
@@ -33,6 +33,8 @@ from .nnet import GpuSoftmax
 from .opt import gpu_seqopt, register_opt, conv_groupopt, op_lifter
 from .opt_util import alpha_merge, output_merge, inplace_allocempty
+from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_BWD_FILTER
 def raise_no_cudnn(msg="CuDNN is required for convolution and pooling"):
    raise RuntimeError(msg)
@@ -232,6 +234,7 @@ def version(raises=True):
    :raises: If True, raise an exception if CuDNN is not present or badly installed.
        Otherwise, return -1.
    """
    if not dnn_present():
        if raises:
@@ -397,9 +400,9 @@ class GpuDnnConv(DnnBase):
    ----------
    image
    kernel
-    descr
+    descr :
        The convolution descriptor.
-    algo : {'small', 'none', 'large', 'fft', 'fft_tiling', 'guess_once',
+    algo : {'small', 'none', 'large', 'fft', 'fft_tiling', 'winograd', 'guess_once',
            'guess_on_shape_change', 'time_once', 'time_on_shape_change'}
        Default is the value of :attr:`config.dnn.conv.algo_fwd`.
@@ -435,8 +438,12 @@ class GpuDnnConv(DnnBase):
                raise RuntimeError("CuDNN tiled-FFT convolution requires "
                                   "CuDNN v4 or more recent")
+        if version() < 5000 and self.algo == 'winograd':
+            raise RuntimeError("CuDNN winograd convolution requires "
+                               "CuDNN v5 or more recent")
        assert self.algo in ['none', 'small', 'large', 'fft', 'fft_tiling',
-                             'guess_once', 'guess_on_shape_change',
+                             'winograd', 'guess_once', 'guess_on_shape_change',
                             'time_once', 'time_on_shape_change']
    def __setstate__(self, d):
@@ -468,6 +475,9 @@ class GpuDnnConv(DnnBase):
        elif self.algo == 'fft_tiling':
            # need v4
            alg = 'CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING'
+        elif self.algo == 'winograd':
+            # need v5
+            alg = 'CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD'
        defs.append(('CONV_ALGO', alg))
        if self.algo in ['guess_once', 'guess_on_shape_change',
@@ -565,8 +575,11 @@ class GpuDnnConvGradW(DnnBase):
    ----------
    image
    kernel
-    descr
+    descr :
        The convolution descriptor.
+    algo : {'none', 'deterministic', 'fft', 'small', 'guess_once',
+            'guess_on_shape_change', 'time_once', 'time_on_shape_change'}
+        Default is the value of :attr:`config.dnn.conv.algo_bwd_filter`.
    """
@@ -582,9 +595,7 @@ class GpuDnnConvGradW(DnnBase):
            algo = config.dnn.conv.algo_bwd_filter
        self.algo = algo
-        assert self.algo in ['none', 'deterministic', 'fft', 'small',
+        assert self.algo in SUPPORTED_DNN_CONV_ALGO_BWD_FILTER
-                             'guess_once', 'guess_on_shape_change',
-                             'time_once', 'time_on_shape_change']
    def __setstate__(self, d):
        self.__dict__.update(d)
@@ -688,6 +699,9 @@ class GpuDnnConvGradI(DnnBase):
    kernel
    descr
        The convolution descriptor.
+    algo : {'none', 'deterministic', 'fft', 'fft_tiling', 'winograd', 'guess_once',
+            'guess_on_shape_change', 'time_once', 'time_on_shape_change'}
+        Default is the value of :attr:`config.dnn.conv.algo_bwd_data`.
    """
@@ -708,9 +722,12 @@ class GpuDnnConvGradI(DnnBase):
        if version() < 4000 and self.algo == 'fft_tiling':
            raise RuntimeError("CuDNN's tiled-FFT convolution requires CuDNN "
                               "v4 or more recent")
+        if version() < 5000 and self.algo == 'winograd':
+            raise RuntimeError("CuDNN's winograd convolution requires CuDNN "
+                               "v5 or more recent")
        assert self.algo in ['none', 'deterministic', 'fft', 'fft_tiling',
-                             'guess_once', 'guess_on_shape_change',
+                             'winograd', 'guess_once', 'guess_on_shape_change',
                             'time_once', 'time_on_shape_change']
    def __setstate__(self, d):
@@ -749,13 +766,16 @@ class GpuDnnConvGradI(DnnBase):
            alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_0'
            if self.algo == 'none':
                alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_0'
-            if self.algo == 'deterministic':
+            elif self.algo == 'deterministic':
                alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_1'
-            if self.algo == 'fft':
+            elif self.algo == 'fft':
                alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT'
-            if self.algo == 'fft_tiling':
+            elif self.algo == 'fft_tiling':
                # big workspace but less than fft
                alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING'
+            elif self.algo == 'winograd':
+                # need v5
+                alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD'
            if self.algo in ['guess_once', 'guess_on_shape_change',
                             'time_once', 'time_on_shape_change']:
@@ -1047,9 +1067,13 @@ class GpuDnnPoolDesc(Op):
  static const int win[%(nd)d] = {%(win)s};
  static const int pad[%(nd)d] = {%(pad)s};
  static const int str[%(nd)d] = {%(str)s};
-  err = cudnnSetPoolingNdDescriptor(
-    %(desc)s, %(mode_flag)s, %(nd)d,
+#if CUDNN_VERSION >= 5000
-    win, pad, str);
+    err = cudnnSetPoolingNdDescriptor(%(desc)s, %(mode_flag)s, CUDNN_PROPAGATE_NAN, %(nd)d, win, pad, str);
+#else
+    err = cudnnSetPoolingNdDescriptor(%(desc)s, %(mode_flag)s, %(nd)d, win, pad, str);
+#endif
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: %%s",
                 cudnnGetErrorString(err));
@@ -1062,7 +1086,7 @@ class GpuDnnPoolDesc(Op):
           str=', '.join(map(str, self.stride)))
    def c_code_cache_version(self):
-        return (3, version())
+        return (4, version())
 class GpuDnnPool(DnnBase):
@@ -1070,18 +1094,17 @@ class GpuDnnPool(DnnBase):
    """
    Parameters
    ----------
-    img
+    img : tensor
        The image 4d or 5d tensor.
-    Parameters
+    ws : tensor
-    ----------
-    ws : tensor variable
        Window size.
-    stride : tensor variable
+    stride : tensor
        (dx, dy) or (dx, dy, dz).
    mode : {'max', 'average_inc_pad', 'average_exc_pad'}
        The old deprecated name 'average' corresponds to 'average_inc_pad'.
    pad : tensor
        (padX, padY) or (padX, padY, padZ)
    """
    __props__ = ('mode',)
@@ -1255,14 +1278,12 @@ class GpuDnnSoftmaxBase(DnnBase):
    Parameters
    ----------
-    algo
+    algo : {'fast', 'accurate', 'log'}
-        'fast', 'accurate' or 'log' indicating whether, respectively,
+        Indicating whether, respectively, computations should be optimized for
-        computations should be optimized for speed, for accuracy, or if CuDNN
+        speed, for accuracy, or if CuDNN should rather compute the log-softmax instead.
-        should rather compute the log-softmax instead.
+    mode : {'instance', 'channel'}
-    mode
+        Indicating whether the softmax should be computed per image across 'c01'
-        'instance' or 'channel' indicating whether the softmax should be
+        or per spatial location '01' per image across 'c'.
-        computed per image across 'c01' or per spatial location '01' per
-        image across 'c'.
    """
@@ -1306,14 +1327,12 @@ class GpuDnnSoftmax(GpuDnnSoftmaxBase):
    """
    Op for the cuDNN Softmax.
-    algo
+    algo : {'fast', 'accurate', 'log'}
-        'fast', 'accurate' or 'log' indicating whether, respectively,
+        Indicating whether, respectively, computations should be optimized for
-        computations should be optimized for speed, for accuracy, or if CuDNN
+        speed, for accuracy, or if CuDNN should rather compute the log-softmax instead.
-        should rather compute the log-softmax instead.
+    mode : {'instance', 'channel'}
-    mode
+        Indicating whether the softmax should be computed per image across 'c01'
-        'instance' or 'channel' indicating whether the softmax should be
+        or per spatial location '01' per image across 'c'.
-        computed per image across 'c01' or per spatial location '01' per
-        image across 'c'.
    """
    direction = "forward"

--- a/theano/sandbox/gpuarray/dnn_base.c
+++ b/theano/sandbox/gpuarray/dnn_base.c
@@ -51,6 +51,8 @@ c_set_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) {
 static int
 c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) {
  cudnnDataType_t dt;
+  cudnnStatus_t err;
  if (!GpuArray_IS_C_CONTIGUOUS(&var->ga)) {
    PyErr_SetString(PyExc_ValueError,
 		    "Only contiguous filters (kernels) are supported.");
@@ -86,7 +88,12 @@ c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) {
    dims[i] = PyGpuArray_DIM(var, i);
  }
-  cudnnStatus_t err = cudnnSetFilterNdDescriptor(desc, dt, nd, dims);
+#if CUDNN_VERSION >= 5000
+    err = cudnnSetFilterNdDescriptor(desc, dt, CUDNN_TENSOR_NCHW, nd, dims);
+#else
+    err = cudnnSetFilterNdDescriptor(desc, dt, nd, dims);
+#endif
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
 		 "Could not set filter descriptor: %s.",

--- a/theano/sandbox/gpuarray/dnn_fwd.c
+++ b/theano/sandbox/gpuarray/dnn_fwd.c
@@ -92,12 +92,12 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
    }
    algo = choice.algo;
 #else
-    size_t free = 0, total = 0;
+    size_t free;
-    cudaError_t err2 = cudaMemGetInfo(&free, &total);
+    int err2 = c->ops->property(c->ctx, NULL, NULL, GA_CTX_PROP_FREE_GMEM, &free);
-    if (err2 != cudaSuccess) {
+    if (err2 != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
-                   "memory information on the GPU: %s\n",
+                   "memory information on the GPU");
-                   cudaGetErrorString(err2));
      cuda_exit(c->ctx);
      return 1;
    }
@@ -154,7 +154,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
    int upscale[2];
    cudnnConvolutionMode_t mode;
    cudnnDataType_t data_type;
-    err = cudnnGetConvolutionNdDescriptor_v3(desc, 2, &nd, pad, stride,
+    err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
                                             upscale, &mode, &data_type);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
@@ -193,6 +193,21 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
                                                  APPLY_SPECIFIC(output),
                                                  algo,
                                                  &worksize);
+    if (err == CUDNN_STATUS_NOT_SUPPORTED) {
+      // Fallback to none algo if not supported
+      // TODO: Print a warning
+      algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
+      err = cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle),
+                                                    APPLY_SPECIFIC(input),
+                                                    APPLY_SPECIFIC(kerns),
+                                                    desc,
+                                                    APPLY_SPECIFIC(output),
+                                                    algo,
+                                                    &worksize);
+    }
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error getting worksize: %s",

--- a/theano/sandbox/gpuarray/dnn_gi.c
+++ b/theano/sandbox/gpuarray/dnn_gi.c
@@ -91,12 +91,12 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
    algo = choice.algo;
 #else
-    size_t free = 0, total = 0;
+    size_t free;
-    cudaError_t err2 = cudaMemGetInfo(&free, &total);
+    int err2 = c->ops->property(c->ctx, NULL, NULL, GA_CTX_PROP_FREE_GMEM, &free);
-    if (err2 != cudaSuccess){
-      cudaGetLastError();
+    if (err2 != GA_NO_ERROR) {
-      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the memory "
+      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
-                   "information on the GPU: %s\n", cudaGetErrorString(err2));
+                   "memory information on the GPU");
      cuda_exit(c->ctx);
      return 1;
    }
@@ -146,7 +146,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
    int upscale[2];
    cudnnConvolutionMode_t mode;
    cudnnDataType_t data_type;
-    err = cudnnGetConvolutionNdDescriptor_v3(desc, 2, &nd, pad, stride,
+    err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
                                             upscale, &mode, &data_type);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
@@ -203,7 +203,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
  cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_wait((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
-  err = cudnnConvolutionBackwardData_v3(
+  err = cudnnConvolutionBackwardData(
    APPLY_SPECIFIC(_handle),
    alpha_p,
    APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),

--- a/theano/sandbox/gpuarray/dnn_gw.c
+++ b/theano/sandbox/gpuarray/dnn_gw.c
@@ -92,12 +92,12 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
    algo = choice.algo;
 #else
-    size_t free = 0, total = 0;
+    size_t free;
-    cudaError_t err2 = cudaMemGetInfo(&free, &total);
+    int err2 = c->ops->property(c->ctx, NULL, NULL, GA_CTX_PROP_FREE_GMEM, &free);
-    if (err2 != cudaSuccess){
-      cudaGetLastError();
+    if (err2 != GA_NO_ERROR) {
-      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the memory "
+      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
-                   "information on the GPU: %s\n", cudaGetErrorString(err2));
+                   "memory information on the GPU");
      cuda_exit(c->ctx);
      return 1;
    }
@@ -146,7 +146,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
    int upscale[2];
    cudnnConvolutionMode_t mode;
    cudnnDataType_t data_type;
-    err = cudnnGetConvolutionNdDescriptor_v3(desc, 2, &nd, pad, stride,
+    err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
                                             upscale, &mode, &data_type);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
@@ -190,7 +190,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
  cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
-  err = cudnnConvolutionBackwardFilter_v3(
+  err = cudnnConvolutionBackwardFilter(
    APPLY_SPECIFIC(_handle),
    alpha_p,
    APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),

--- a/theano/sandbox/gpuarray/dnn_pool.c
+++ b/theano/sandbox/gpuarray/dnn_pool.c
@@ -24,7 +24,7 @@ if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output)))
 }
 if ((APPLY_SPECIFIC(err) = cudnnCreatePoolingDescriptor(&APPLY_SPECIFIC(pool))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate pooling descriptor"
-                "(pool): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));  
+                "(pool): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
 }
@@ -38,7 +38,7 @@ if (APPLY_SPECIFIC(pool) != NULL) { cudnnDestroyPoolingDescriptor(APPLY_SPECIFIC
 #section support_code_struct
 int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
-                             PyArrayObject *ws, 
+                             PyArrayObject *ws,
                             PyArrayObject *stride,
                             PyArrayObject *pad,
                             PyGpuArrayObject **out,
@@ -69,7 +69,12 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
  for(int i = 0; i < ndims; i++) {
     s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i));
  }
+#if CUDNN_VERSION >= 5000
+  err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, CUDNN_PROPAGATE_NAN, ndims, w, p, s);
+#else
  err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, ndims, w, p, s);
+#endif
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err));

--- a/theano/sandbox/gpuarray/dnn_pool_grad.c
+++ b/theano/sandbox/gpuarray/dnn_pool_grad.c
@@ -42,7 +42,7 @@ APPLY_SPECIFIC(pool) = NULL;
  }
  if ((err = cudnnCreatePoolingDescriptor(&APPLY_SPECIFIC(pool))) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError, "could not allocate pooling descriptor"
-                "(pool): %s", cudnnGetErrorString(err));  
+                "(pool): %s", cudnnGetErrorString(err));
    FAIL;
  }
 }
@@ -60,7 +60,7 @@ if (APPLY_SPECIFIC(pool) != NULL) { cudnnDestroyPoolingDescriptor(APPLY_SPECIFIC
 int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
                                  PyGpuArrayObject *out,
                                  PyGpuArrayObject *out_grad,
-                                  PyArrayObject *ws, 
+                                  PyArrayObject *ws,
                                  PyArrayObject *stride,
                                  PyArrayObject *pad,
                                  PyGpuArrayObject **inp_grad,
@@ -109,7 +109,12 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
  for(int i = 0; i < ndims; i++) {
     s[i] = *((npy_intp*)PyArray_GETPTR1(stride, i));
  }
+#if CUDNN_VERSION >= 5000
+  err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, CUDNN_PROPAGATE_NAN, ndims, w, p, s);
+#else
  err = cudnnSetPoolingNdDescriptor(APPLY_SPECIFIC(pool), MODE_FLAG, ndims, w, p, s);
+#endif
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err));

--- a/theano/sandbox/gpuarray/tests/test_dnn.py
+++ b/theano/sandbox/gpuarray/tests/test_dnn.py
 import logging
 from nose.plugins.skip import SkipTest
+from nose_parameterized import parameterized
 import numpy
-from itertools import product
+from itertools import product, chain
 import theano
 from six import StringIO
@@ -18,6 +19,8 @@ from ..basic_ops import GpuAllocEmpty
 from .config import mode_with_gpu, mode_without_gpu, test_ctx_name
 from . import test_nnet
+from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_FWD
 def test_dnn_conv_desc_merge():
    if not dnn.dnn_available(test_ctx_name):
@@ -393,6 +396,9 @@ def test_dnn_tag():
 class TestDnnInferShapes(utt.InferShapeTester):
+    border_modes = ['valid', 'full', 'half']
+    conv_modes = ['conv', 'cross']
    def setUp(self):
        super(TestDnnInferShapes, self).setUp()
        self.mode = mode_with_gpu
@@ -427,37 +433,25 @@ class TestDnnInferShapes(utt.InferShapeTester):
            dnn.GpuDnnSoftmaxGrad
        )
-    def test_conv(self):
+    def _test_conv(self, img, kerns, out, img_val, kern_vals, border_mode, conv_mode, subsamples, algo):
        if not dnn.dnn_available(test_ctx_name):
            raise SkipTest(dnn.dnn_available.msg)
-        img = T.ftensor4('img')
-        kerns = T.ftensor4('kerns')
-        out = T.ftensor4('out')
-        img_val = numpy.asarray(
-            numpy.random.rand(7, 2, 6, 4),
-            dtype='float32'
-        )
-        kern_vals = numpy.asarray(
-            numpy.random.rand(8, 2, 4, 3),
-            dtype='float32'
-        )
-        for params in product(
+        img_val = numpy.asarray(img_val, dtype='float32')
-            ['valid', 'full', 'half'],
+        kern_vals = numpy.asarray(kern_vals, dtype='float32')
-            [(1, 1), (2, 2)],
-            ['conv', 'cross']
+        for subsample in subsamples:
-        ):
            out_vals = numpy.zeros(
                dnn.GpuDnnConv.get_out_shape(img_val.shape, kern_vals.shape,
-                                             border_mode=params[0],
+                                             border_mode=border_mode,
-                                             subsample=params[1]),
+                                             subsample=subsample),
                dtype='float32')
            desc = dnn.GpuDnnConvDesc(
-                border_mode=params[0],
+                border_mode=border_mode,
-                subsample=params[1],
+                subsample=subsample,
-                conv_mode=params[2]
+                conv_mode=conv_mode
            )(kerns.shape)
-            conv = dnn.GpuDnnConv()(img, kerns, out, desc)
+            conv = dnn.GpuDnnConv(algo=algo)(img, kerns, out, desc)
            self._compile_and_check(
                [img, kerns, out],
                [conv],
@@ -465,54 +459,92 @@ class TestDnnInferShapes(utt.InferShapeTester):
                dnn.GpuDnnConv
            )
-    def test_conv_gradw(self):
+    @parameterized.expand(chain(product([SUPPORTED_DNN_CONV_ALGO_FWD[0]],
+                                        border_modes,
+                                        conv_modes),
+                                product(SUPPORTED_DNN_CONV_ALGO_FWD[1:],
+                                        [border_modes[0]],
+                                        [conv_modes[0]])),
+                          testcase_func_name=utt.custom_name_func)
+    def test_conv(self, algo, border_mode, conv_mode):
+        if algo == 'winograd' and dnn.version() < 5000:
+            raise SkipTest(dnn.dnn_available.msg)
+        self._test_conv(T.ftensor4('img'),
+                        T.ftensor4('kerns'),
+                        T.ftensor4('out'),
+                        numpy.random.rand(7, 2, 8, 4),
+                        numpy.random.rand(8, 2, 4, 3),
+                        border_mode,
+                        conv_mode,
+                        [(1, 1), (2, 2)],
+                        algo)
+    @parameterized.expand(product(border_modes, conv_modes), utt.custom_name_func)
+    def test_conv3d_none(self, border_mode, conv_mode):
+        ftensor5 = T.TensorType(dtype="float32", broadcastable=(False,) * 5)
+        self._test_conv(ftensor5('img'),
+                        ftensor5('kerns'),
+                        ftensor5('out'),
+                        numpy.random.rand(10, 2, 6, 4, 11),
+                        numpy.random.rand(8, 2, 4, 3, 1),
+                        border_mode,
+                        conv_mode,
+                        [(1, 1, 1), (2, 2, 2)],
+                        'none')
+    def _test_conv_gradw(self, img, kerns, out, img_val, kern_vals, border_mode, conv_mode, subsample):
        if not dnn.dnn_available(test_ctx_name):
            raise SkipTest(dnn.dnn_available.msg)
-        img = T.ftensor4('img')
-        kerns = T.ftensor4('kerns')
-        out = T.ftensor4('out')
        img_val = numpy.asarray(
-            numpy.random.rand(2, 5, 6, 8),
+            img_val,
            dtype='float32'
        )
        kern_vals = numpy.asarray(
-            numpy.random.rand(2, 1, 5, 6),
+            kern_vals,
            dtype='float32'
        )
-        for params in product(
+        temp_img = img.dimshuffle(1, 0, 2, 3)
-            ['valid', 'full', 'half'],
+        temp_kerns = kerns
-            [(1, 1)],  # strides besides (1, 1)
+        if conv_mode == 'conv':
-            ['conv', 'cross']
+            temp_kerns = temp_kerns[:, :, ::-1, ::-1]
-        ):
+        temp_kerns = temp_kerns.dimshuffle(1, 0, 2, 3)
-            temp_img = img.dimshuffle(1, 0, 2, 3)
+        shape = (
-            temp_kerns = kerns
+            kern_vals.shape[1], img_val.shape[1],
-            if params[2] == 'conv':
+            img_val.shape[2] - kern_vals.shape[2] + 1,
-                temp_kerns = temp_kerns[:, :, ::-1, ::-1]
+            img_val.shape[3] - kern_vals.shape[3] + 1
-            temp_kerns = temp_kerns.dimshuffle(1, 0, 2, 3)
+        )
-            shape = (
+        out_vals = numpy.zeros(shape, dtype='float32')
-                kern_vals.shape[1], img_val.shape[1],
+        desc = dnn.GpuDnnConvDesc(
-                img_val.shape[2] - kern_vals.shape[2] + 1,
+            border_mode=border_mode,
-                img_val.shape[3] - kern_vals.shape[3] + 1
+            subsample=subsample,
-            )
+            conv_mode=conv_mode
-            out_vals = numpy.zeros(shape, dtype='float32')
+        )(out.shape)
-            desc = dnn.GpuDnnConvDesc(
+        conv_grad_w = dnn.GpuDnnConvGradW()(
-                border_mode=params[0],
+            temp_img,
-                subsample=params[1],
+            temp_kerns,
-                conv_mode=params[2]
+            out,
-            )(out.shape)
+            desc,
-            conv_grad_w = dnn.GpuDnnConvGradW()(
+        )
-                temp_img,
+        self._compile_and_check(
-                temp_kerns,
+            [temp_img, temp_kerns, out],
-                out,
+            [conv_grad_w],
-                desc,
+            [img_val, kern_vals, out_vals],
-            )
+            dnn.GpuDnnConvGradW
-            self._compile_and_check(
+        )
-                [temp_img, temp_kerns, out],
-                [conv_grad_w],
+    @parameterized.expand(product(border_modes, conv_modes), utt.custom_name_func)
-                [img_val, kern_vals, out_vals],
+    def test_conv_gradw(self, border_mode, conv_mode):
-                dnn.GpuDnnConvGradW
+        self._test_conv_gradw(T.ftensor4('img'),
-            )
+                              T.ftensor4('kerns'),
+                              T.ftensor4('out'),
+                              numpy.random.rand(2, 5, 6, 8),
+                              numpy.random.rand(2, 1, 5, 6),
+                              border_mode,
+                              conv_mode,
+                              (1, 1))
    def test_conv_gradi(self):
        if not dnn.dnn_available(test_ctx_name):

--- a/theano/tests/unittest_tools.py
+++ b/theano/tests/unittest_tools.py
@@ -4,6 +4,7 @@ from functools import wraps
 import logging
 import sys
 import unittest
+from nose_parameterized import parameterized
 from six import integer_types
 from six.moves import StringIO
@@ -31,6 +32,13 @@ except ImportError:
 _logger = logging.getLogger("theano.tests.unittest_tools")
+def custom_name_func(testcase_func, param_num, param):
+    return "%s_%s" % (
+        testcase_func.__name__,
+        parameterized.to_safe_name("_".join(str(x) for x in param.args)),
+    )
 def fetch_seed(pseed=None):
    """
    Returns the seed to use for running the unit tests.
@@ -96,6 +104,7 @@ verify_grad.E_grad = T.verify_grad.E_grad
 class TestOptimizationMixin(object):
    def assertFunctionContains(self, f, op, min=1, max=sys.maxsize):
        toposort = f.maker.fgraph.toposort()
        matches = [node for node in toposort if node.op == op]
@@ -172,6 +181,7 @@ class T_OpContractMixin(object):
 class InferShapeTester(unittest.TestCase):
    def setUp(self):
        seed_rng()
        # Take into account any mode that may be defined in a child class
@@ -311,6 +321,7 @@ def str_diagnostic(expected, value, rtol, atol):
 class WrongValue(Exception):
    def __init__(self, expected_val, val, rtol, atol):
        Exception.__init__(self)  # to be compatible with python2.4
        self.val1 = expected_val