Commit 8cb9d50e authored by Frédéric Bastien

Merge pull request #3245 from carriepl/v3

CuDNN v3
......@@ -36,12 +36,61 @@ To get an error if Theano can not use cuDNN, use this Theano flag:
.. note::
CuDNN v2 is now released, if you used any v2 release candidate, we
strongly suggest that you update it to the final version. From now
on, we only support the final release.
CuDNN v3 has now been released. CuDNN v2 remains supported, but CuDNN v3 is
faster and offers many more options. We recommend that everyone update to
v3.
.. note::
Starting with CuDNN v3, multiple convolution implementations are offered,
and heuristics can be used to automatically choose a convolution
implementation well suited to the parameters of the convolution.
The Theano flag ``dnn.conv.algo_fwd`` specifies the CuDNN convolution
implementation that Theano should use for forward convolutions.
Possible values include:
* ``small`` (default) : use a convolution implementation with small memory
usage
* ``none`` : use a slower implementation with minimal memory usage
* ``large`` : use a faster implementation with large memory usage
* ``fft`` : use the Fast Fourier Transform implementation of convolution
(very high memory usage)
* ``guess_once`` : the first time a convolution is executed, the
implementation to use is chosen according to CuDNN's heuristics and reused
for every subsequent execution of the convolution.
* ``guess_on_shape_change`` : like ``guess_once``, but a new convolution
implementation is selected every time the shapes of the inputs and kernels
don't match the shapes from the last execution.
* ``time_once`` : the first time a convolution is executed, every convolution
implementation offered by CuDNN is executed and timed. The fastest is
reused for every subsequent execution of the convolution.
* ``time_on_shape_change`` : like ``time_once``, but a new convolution
implementation is selected every time the shapes of the inputs and kernels
don't match the shapes from the last execution.
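The difference between the ``*_once`` modes and their ``*_on_shape_change``
variants can be sketched in plain Python. This is an illustrative model only,
not Theano code; ``choose_algo`` is a hypothetical callback standing in for
CuDNN's heuristic or timing run:

```python
class AlgoCache(object):
    """Sketch of the guess/time selection policy described above.

    'once' picks an algorithm on the first call and keeps it forever;
    'on_shape_change' re-runs selection whenever the input or kernel
    shapes differ from the previous call.
    """
    def __init__(self, mode, choose_algo):
        assert mode in ('once', 'on_shape_change')
        self.mode = mode
        self.choose_algo = choose_algo  # heuristic or timing callback
        self.cached = None
        self.last_shapes = None

    def get(self, img_shape, kern_shape):
        shapes = (img_shape, kern_shape)
        redo = (self.cached is None or
                (self.mode == 'on_shape_change' and
                 shapes != self.last_shapes))
        if redo:
            # Run the (expensive) selection and remember the result.
            self.cached = self.choose_algo(img_shape, kern_shape)
            self.last_shapes = shapes
        return self.cached
```

With ``mode='once'`` the selection cost is paid exactly once per Op instance;
with ``mode='on_shape_change'`` it is paid again for every new shape pair.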
The Theano flag ``dnn.conv.algo_bwd`` specifies the CuDNN convolution
implementation that Theano should use for gradient convolutions.
Possible values include:
* ``none`` (default) : use the default non-deterministic convolution
implementation
* ``deterministic`` : use a slower but deterministic implementation
* ``fft`` : use the Fast Fourier Transform implementation of convolution
(very high memory usage)
* ``guess_once`` : the first time a convolution is executed, the
implementation to use is chosen according to CuDNN's heuristics and reused
for every subsequent execution of the convolution.
* ``guess_on_shape_change`` : like ``guess_once``, but a new convolution
implementation is selected every time the shapes of the inputs and kernels
don't match the shapes from the last execution.
* ``time_once`` : the first time a convolution is executed, every convolution
implementation offered by CuDNN is executed and timed. The fastest is
reused for every subsequent execution of the convolution.
* ``time_on_shape_change`` : like ``time_once``, but a new convolution
implementation is selected every time the shapes of the inputs and kernels
don't match the shapes from the last execution.
CuDNN v2 is much faster than v1. We recommend that everybody
updates to v2.
.. note::
......@@ -51,13 +100,16 @@ To get an error if Theano can not use cuDNN, use this Theano flag:
.. note::
The documentation of CUDNN R1 and R2 tells that, for the following
2 operations, the reproducibility is not guaranteed:
The CUDNN documentation states that, for the following 2 operations,
reproducibility is not guaranteed with the default implementation:
`cudnnConvolutionBackwardFilter` and `cudnnConvolutionBackwardData`.
Those correspond to the gradient wrt the weights and the gradient wrt the
input of the convolution. They are also used sometimes in the forward
input of the convolution. They are also used sometimes in the forward
pass, when they give a speed up.
The Theano flag ``dnn.conv.algo_bwd`` can be used to force the use of a
slower but deterministic convolution implementation.
.. note::
There is a problem we do not understand yet when cudnn paths are
......@@ -79,7 +131,8 @@ Convolution Ops
===============
.. automodule:: theano.sandbox.cuda.dnn
:members: GpuDnnConvDesc, GpuDnnConv, GpuDnnConvGradW, GpuDnnConvGradI
:members: GpuDnnConvDesc, GpuDnnConv, GpuDnnConv3d, GpuDnnConvGradW,
GpuDnnConv3dGradW, GpuDnnConvGradI, GpuDnnConv3dGradI
Pooling Ops
===========
......
......@@ -41,6 +41,20 @@ static inline const char *cudnnGetErrorString(cudnnStatus_t err) {
typedef cudnnTensor4dDescriptor_t cudnnTensorDescriptor_t;
static inline cudnnStatus_t
cudnnSetTensorNdDescriptor(
cudnnTensorDescriptor_t tensorDesc,
cudnnDataType_t dataType,
int nbDims,
const int dimA[],
const int strideA[]) {
if (nbDims != 4) return CUDNN_STATUS_NOT_SUPPORTED;
return cudnnSetTensor4dDescriptorEx(
tensorDesc, dataType,
dimA[0], dimA[1], dimA[2], dimA[3],
strideA[0], strideA[1], strideA[2], strideA[3]);
}
static inline cudnnStatus_t
cudnnGetConvolution2dForwardOutputDim(
const cudnnConvolutionDescriptor_t convDesc,
......@@ -183,6 +197,85 @@ cudnnConvolutionBackwardData_v2(
#define cudnnConvolutionBackwardData cudnnConvolutionBackwardData_v2
static inline cudnnStatus_t
cudnnSetPoolingNdDescriptor(
cudnnPoolingDescriptor_t poolingDesc,
const cudnnPoolingMode_t mode,
int nbDims,
const int windowDimA[],
const int paddingA[],
const int strideA[]) {
if (nbDims != 2) return CUDNN_STATUS_NOT_SUPPORTED;
if (paddingA[0] != 0 || paddingA[1] != 0) return CUDNN_STATUS_NOT_SUPPORTED;
return cudnnSetPoolingDescriptor(poolingDesc, mode,
windowDimA[0], windowDimA[1],
strideA[0], strideA[1]);
}
static inline cudnnStatus_t
cudnnGetPoolingNdDescriptor(
const cudnnPoolingDescriptor_t poolingDesc,
const int nbDimsRequested,
cudnnPoolingMode_t *mode,
int *nbDims,
int windowA[],
int paddingA[],
int strideA[]) {
int win0, win1, str0, str1;
cudnnStatus_t err;
if (nbDimsRequested < 2) return CUDNN_STATUS_NOT_SUPPORTED;
err = cudnnGetPoolingDescriptor(poolingDesc, mode, &win0, &win1,
&str0, &str1);
if (err != CUDNN_STATUS_SUCCESS) return err;
*nbDims = 2;
paddingA[0] = 0;
paddingA[1] = 0;
windowA[0] = win0;
windowA[1] = win1;
strideA[0] = str0;
strideA[1] = str1;
return CUDNN_STATUS_SUCCESS;
}
static inline cudnnStatus_t
cudnnPoolingForward_v2(
cudnnHandle_t handle,
const cudnnPoolingDescriptor_t poolingDesc,
const void *alpha,
const cudnnTensorDescriptor_t srcDesc,
const void *srcData,
const void *beta,
const cudnnTensorDescriptor_t destDesc,
void *destData) {
if (*(float*)alpha != 1.0 || *(float *)beta != 0.0) return CUDNN_STATUS_NOT_SUPPORTED;
return cudnnPoolingForward(handle, poolingDesc, srcDesc, srcData,
destDesc, destData);
}
#define cudnnPoolingForward cudnnPoolingForward_v2
static inline cudnnStatus_t
cudnnPoolingBackward_v2(
cudnnHandle_t handle,
const cudnnPoolingDescriptor_t poolingDesc,
const void *alpha,
const cudnnTensorDescriptor_t srcDesc,
const void *srcData,
const cudnnTensorDescriptor_t srcDiffDesc,
const void *srcDiffData,
const cudnnTensorDescriptor_t destDesc,
const void *destData,
const void *beta,
const cudnnTensorDescriptor_t destDiffDesc,
void *destDiffData) {
if (*(float*)alpha != 1.0 || *(float *)beta != 0.0) return CUDNN_STATUS_NOT_SUPPORTED;
return cudnnPoolingBackward(handle, poolingDesc,
srcDesc, srcData,
srcDiffDesc, srcDiffData,
destDesc, destData,
destDiffDesc, destDiffData);
}
#define cudnnPoolingBackward cudnnPoolingBackward_v2
//Needed for R2 rc2
# define CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING CUDNN_POOLING_AVERAGE
#else
......
import os
import numpy
import warnings
import theano
from theano import Apply, gof, tensor, config, Variable
from theano.scalar import as_scalar, constant
from theano import Apply, tensor, config, Variable
from theano.scalar import as_scalar, constant, Log
from theano.gradient import DisconnectedType, grad_not_implemented
from theano.gof import Optimizer, local_optimizer, COp
from theano.gof.type import CDataType, Generic
......@@ -17,7 +18,8 @@ from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
host_from_gpu,
gpu_contiguous, HostFromGpu,
gpu_alloc_empty, GpuAllocEmpty)
gpu_alloc_empty, GpuAllocEmpty,
GpuElemwise)
from theano.sandbox.cuda.blas import (GpuConv, GpuDownsampleFactorMax,
GpuDownsampleFactorMaxGrad)
from theano.sandbox.cuda.nnet import GpuSoftmax
......@@ -236,10 +238,10 @@ class GpuDnnConvDesc(GpuOp):
def __init__(self, border_mode, subsample=(1, 1), conv_mode='conv'):
if isinstance(border_mode, int):
border_mode = (border_mode, border_mode)
border_mode = (border_mode,) * len(subsample)
if isinstance(border_mode, tuple):
pad_h, pad_w = map(int, border_mode)
border_mode = (pad_h, pad_w)
assert len(border_mode) == len(subsample)
border_mode = tuple(map(int, border_mode))
if not ((isinstance(border_mode, tuple) and min(border_mode) >= 0) or
border_mode in ('valid', 'full')):
raise ValueError(
......@@ -247,7 +249,7 @@ class GpuDnnConvDesc(GpuOp):
'"valid", "full", an integer or a pair of'
' integers'.format(border_mode))
self.border_mode = border_mode
assert len(subsample) == 2
assert len(subsample) in [2, 3]
self.subsample = subsample
assert conv_mode in ('conv', 'cross')
self.conv_mode = conv_mode
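The ``border_mode`` normalization performed in ``__init__`` above (broadcast
an integer pad to every spatial dimension, validate a tuple against the
subsample length, accept ``'valid'``/``'full'``) can be restated as a small
standalone helper. This is a sketch of the same checks, independent of the Op:

```python
def normalize_border_mode(border_mode, subsample):
    """Sketch of GpuDnnConvDesc's border_mode handling (not Theano code).

    An integer pad is broadcast to one pad per spatial dimension; a
    tuple must have one entry per subsample dimension; otherwise the
    strings 'valid' and 'full' are accepted as-is.
    """
    if isinstance(border_mode, int):
        border_mode = (border_mode,) * len(subsample)
    if isinstance(border_mode, tuple):
        if len(border_mode) != len(subsample):
            raise ValueError("expected %d pad values, got %d"
                             % (len(subsample), len(border_mode)))
        border_mode = tuple(map(int, border_mode))
    if not ((isinstance(border_mode, tuple) and min(border_mode) >= 0) or
            border_mode in ('valid', 'full')):
        raise ValueError("invalid border_mode: %r" % (border_mode,))
    return border_mode
```

Note that the same helper works for both the 2d case (``subsample`` of
length 2) and the 3d case (length 3).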
......@@ -266,12 +268,14 @@ class GpuDnnConvDesc(GpuOp):
img_shape, kern_shape = inputs
desc, = outputs
nb_dim = len(self.subsample)
if isinstance(self.border_mode, tuple):
pad_h_spec, pad_w_spec = map(int, self.border_mode)
assert pad_h_spec >= 0 and pad_w_spec >= 0
pad_desc = map(int, self.border_mode)
assert min(pad_desc) >= 0
bmode = 2
else:
pad_h_spec = pad_w_spec = 0
pad_desc = [0] * nb_dim
if self.border_mode == "valid":
bmode = 1
......@@ -284,11 +288,13 @@ class GpuDnnConvDesc(GpuOp):
else:
conv_flag = 'CUDNN_CROSS_CORRELATION'
pad_str = ", ".join([str(s) for s in pad_desc])
subsample_str = ", ".join([str(s) for s in self.subsample])
upscale_str = ", ".join(["1"] * nb_dim)
return """
{
cudnnStatus_t err;
int pad_h%(name)s;
int pad_w%(name)s;
if ((err = cudnnCreateConvolutionDescriptor(&%(desc)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate convolution "
......@@ -296,42 +302,29 @@ class GpuDnnConvDesc(GpuOp):
%(fail)s
}
if (%(bmode)d == 2) {
pad_h%(name)s = %(pad_h_spec)d;
pad_w%(name)s = %(pad_w_spec)d;
} else if (%(bmode)d == 1) {
pad_h%(name)s = 0;
pad_w%(name)s = 0;
} else if (%(bmode)d == 0) {
pad_h%(name)s = *(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 2) - 1;
pad_w%(name)s = *(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 3) - 1;
} else {
PyErr_SetString(PyExc_ValueError, "bad border mode");
%(fail)s
#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 30
int pad[%(nb_dim)d] = {%(pad_str)s};
int subsample[%(nb_dim)d] = {%(subsample_str)s};
int upscale[%(nb_dim)d] = {%(upscale_str)s};
// Adjust padding values if using full convolution
if (%(bmode)d == 0) {
pad[0] = *(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 2) - 1;
pad[1] = *(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 3) - 1;
if (%(nb_dim)d >= 3) {
pad[2] = *(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 4) - 1;
}
}
#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 20
err = cudnnSetConvolution2dDescriptor(
err = cudnnSetConvolutionNdDescriptor(
%(desc)s,
pad_h%(name)s,
pad_w%(name)s,
%(subsx)d, %(subsy)d, 1, 1,
%(nb_dim)d,
pad, subsample, upscale,
%(conv_flag)s
);
#else
err = cudnnSetConvolutionDescriptorEx(
%(desc)s,
*(npy_int64 *)PyArray_GETPTR1(%(img_shape)s, 0),
*(npy_int64 *)PyArray_GETPTR1(%(img_shape)s, 1),
*(npy_int64 *)PyArray_GETPTR1(%(img_shape)s, 2),
*(npy_int64 *)PyArray_GETPTR1(%(img_shape)s, 3),
*(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 0),
*(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 2),
*(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 3),
pad_h%(name)s,
pad_w%(name)s,
%(subsx)d, %(subsy)d, 1, 1,
%(conv_flag)s
);
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: CUDNN_VERSION must be >= 30");
#endif
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: %%s",
......@@ -341,21 +334,43 @@ class GpuDnnConvDesc(GpuOp):
}
""" % dict(name=name, img_shape=img_shape, kern_shape=kern_shape, desc=desc,
bmode=bmode, conv_flag=conv_flag, fail=sub['fail'],
subsx=self.subsample[0], subsy=self.subsample[1],
pad_h_spec=pad_h_spec, pad_w_spec=pad_w_spec)
pad_str=pad_str, subsample_str=subsample_str,
upscale_str=upscale_str, nb_dim=nb_dim)
def c_code_cache_version(self):
return (2, version())
AddConfigVar('dnn.conv.workmem',
"Default value for the workmem attribute of cudnn convolutions.",
EnumStr('small', 'none', 'large'),
"This flag is deprecated; use dnn.conv.algo_fwd.",
EnumStr(''),
in_c_key=False)
AddConfigVar('dnn.conv.workmem_bwd',
"This flag is deprecated; use dnn.conv.algo_bwd.",
EnumStr(''),
in_c_key=False)
AddConfigVar('dnn.conv.algo_fwd',
"Default implementation to use for CuDNN forward convolution.",
EnumStr('small', 'none', 'large', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change'),
in_c_key=False)
AddConfigVar('dnn.conv.algo_bwd',
"Default implementation to use for CuDNN backward convolution.",
EnumStr('none', 'deterministic', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change'),
in_c_key=False)
# scalar constants
_zero = constant(numpy.asarray(0.0, dtype='float32'))
_one = constant(numpy.asarray(1.0, dtype='float32'))
_ifour = constant(numpy.asarray(4, dtype='int32'))
_ifive = constant(numpy.asarray(5, dtype='int32'))
def ensure_float(val, default, name):
......@@ -372,6 +387,20 @@ def ensure_float(val, default, name):
return val
def ensure_int(val, default, name):
if val is None:
return default.clone()
if not isinstance(val, Variable):
val = constant(val)
if hasattr(val, 'ndim') and val.ndim == 0:
val = as_scalar(val)
if not isinstance(val.type, theano.scalar.Scalar):
raise TypeError("%s: expected a scalar value" % (name,))
if not val.type.dtype == 'int32':
raise TypeError("%s: type is not int32" % (name,))
return val
class GpuDnnConv(DnnBase, COp):
"""
The forward convolution.
......@@ -380,29 +409,59 @@ class GpuDnnConv(DnnBase, COp):
:param kernel:
:param descr: the convolution descriptor
"""
__props__ = ('workmem', 'inplace')
__props__ = ('algo', 'inplace')
__input_name__ = ('image', 'kernel', 'output',
'descriptor', 'alpha', 'beta')
def __init__(self, workmem=None, inplace=False):
def __init__(self, workmem=None, inplace=False, algo=None):
"""
:param workmem: either 'none', 'small' or 'large'. Default is
the value of :attr:`config.dnn.conv.workmem`.
:param workmem: *deprecated*, use param algo instead
:param algo: either 'small', 'none', 'large', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once' or 'time_on_shape_change'.
Default is the value of :attr:`config.dnn.conv.algo_fwd`.
"""
COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_fwd.c"],
"APPLY_SPECIFIC(conv_fwd)")
if workmem is None:
workmem = config.dnn.conv.workmem
self.workmem = workmem
if workmem is not None:
warnings.warn(("GpuDnnConv: parameter 'workmem' is deprecated. "
"Use 'algo' instead."), stacklevel=3)
assert algo is None
self.algo = workmem
else:
if algo is None:
algo = config.dnn.conv.algo_fwd
self.algo = algo
self.inplace = inplace
if self.inplace:
self.destroy_map = {0: [2]}
assert self.workmem in ['none', 'small', 'large']
# In CuDNN version older than V3, the FFT implementation and the
# option to time the different implementations to get the fastest
# are both unavailable.
if version() < (3000, 3000):
if self.algo == 'fft':
raise RuntimeError("CuDNN FFT convolution requires CuDNN v3")
elif self.algo in ['guess_once', 'guess_on_shape_change']:
raise RuntimeError("CuDNN selection of convolution "
"implementation based on heuristics "
"requires CuDNN v3")
elif self.algo in ['time_once', 'time_on_shape_change']:
raise RuntimeError("CuDNN convolution timing requires CuDNN "
"v3")
assert self.algo in ['none', 'small', 'large', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change']
def __setstate__(self, d):
self.__dict__.update(d)
if not hasattr(self, 'workmem'):
self.workmem = 'none'
if not hasattr(self, 'algo'):
if hasattr(self, 'workmem'):
self.algo = self.workmem
else:
self.algo = config.dnn.conv.algo_fwd
if not hasattr(self, 'inplace'):
self.inplace = False
......@@ -411,17 +470,44 @@ class GpuDnnConv(DnnBase, COp):
inpl_def = [('CONV_INPLACE', '1')]
else:
inpl_def = []
choose_alg = '0'
choose_alg_once = '0'
choose_alg_time = '0'
if version() == -1:
alg_def = ('CONV_ALGO', "0")
alg = "0"
else:
if self.workmem == 'none':
if self.algo == 'none':
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM'
elif self.workmem == 'small':
elif self.algo == 'small':
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM'
elif self.workmem == 'large':
elif self.algo == 'large':
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_GEMM'
alg_def = ('CONV_ALGO', alg)
return [alg_def] + inpl_def
elif self.algo == 'fft':
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_FFT'
elif self.algo in ['guess_once', 'guess_on_shape_change']:
# The convolution implementation should be chosen according
# to a heuristic
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM'
choose_alg = '1'
if self.algo == 'guess_once':
choose_alg_once = '1'
elif self.algo in ['time_once', 'time_on_shape_change']:
# The convolution implementation should be chosen by timing
# every available implementation
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM'
choose_alg = '1'
choose_alg_time = '1'
if self.algo == 'time_once':
choose_alg_once = '1'
alg_def = ('CONV_ALGO', alg)
alg_choose_def = ('CHOOSE_ALGO', choose_alg)
alg_choose_once_def = ('CHOOSE_ALGO_ONCE', choose_alg_once)
alg_choose_time_def = ('CHOOSE_ALGO_TIME', choose_alg_time)
return [alg_def, alg_choose_def, alg_choose_once_def,
alg_choose_time_def] + inpl_def
def make_node(self, img, kern, output, desc, alpha=None, beta=None):
img = as_cuda_ndarray_variable(img)
......@@ -450,8 +536,10 @@ class GpuDnnConv(DnnBase, COp):
top = gpu_contiguous(top)
d_img = GpuDnnConvGradI()(kerns, top, gpu_alloc_empty(*img.shape), desc)
d_kerns = GpuDnnConvGradW()(img, top, gpu_alloc_empty(*kerns.shape), desc)
d_img = GpuDnnConvGradI()(kerns, top, gpu_alloc_empty(*img.shape),
desc)
d_kerns = GpuDnnConvGradW()(img, top, gpu_alloc_empty(*kerns.shape),
desc)
d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta)
......@@ -497,6 +585,108 @@ class GpuDnnConv(DnnBase, COp):
return [shape[2]]
class GpuDnnConv3d(GpuDnnConv):
"""
The forward convolution.
:param image:
:param kernel:
:param descr: the convolution descriptor
"""
__props__ = ('algo', 'inplace')
__input_name__ = ('image', 'kernel', 'output',
'descriptor', 'alpha', 'beta')
def __init__(self, workmem=None, inplace=False, algo=None):
"""
:param workmem: *deprecated*, use param algo instead
:param algo: either 'none', 'guess_once', 'guess_on_shape_change',
'time_once' or 'time_on_shape_change'.
Default is the value of :attr:`config.dnn.conv.algo_fwd`.
"""
if workmem is not None:
warnings.warn(("GpuDnnConv3d: parameter 'workmem' is deprecated. "
"Use 'algo' instead."), stacklevel=3)
assert algo is None
algo = workmem
super(GpuDnnConv3d, self).__init__(inplace=inplace,
algo=algo if algo is not None else 'none')
assert self.algo in ['none', 'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change']
def make_node(self, img, kern, output, desc, alpha=None, beta=None):
img = as_cuda_ndarray_variable(img)
kern = as_cuda_ndarray_variable(kern)
output = as_cuda_ndarray_variable(output)
if img.type.ndim != 5:
raise TypeError('img must be 5D tensor')
if kern.type.ndim != 5:
raise TypeError('kern must be 5D tensor')
if output.type.ndim != 5:
raise TypeError('output must be a 5D tensor')
if not isinstance(desc.type, CDataType) \
or desc.type.ctype != 'cudnnConvolutionDescriptor_t':
raise TypeError('desc must be cudnnConvolutionDescriptor_t')
alpha = ensure_float(alpha, _one, 'alpha')
beta = ensure_float(beta, _zero, 'beta')
return Apply(self, [img, kern, output, desc, alpha, beta],
[output.type()])
def grad(self, inp, grads):
img, kerns, output, desc, alpha, beta = inp
top, = grads
top = gpu_contiguous(top)
d_img = GpuDnnConv3dGradI()(kerns, top, gpu_alloc_empty(*img.shape),
desc)
d_kerns = GpuDnnConv3dGradW()(img, top, gpu_alloc_empty(*kerns.shape),
desc)
d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta)
return [d_img * alpha, d_kerns * alpha, top * beta,
DisconnectedType()(), d_alpha, d_beta]
@staticmethod
def get_out_shape(ishape, kshape, border_mode, subsample):
"""
This function computes the output shape for a convolution with
the specified parameters. `ishape` and `kshape` can be symbolic
or scalar.
"""
b = ishape[0] # Number of inputs
d = ishape[2] # Depth of input feature maps
h = ishape[3] # Height of input feature maps
w = ishape[4] # Width of input feature maps
nb = kshape[0] # Number of output feature maps
kd = kshape[2] # Depth of each filter
kh = kshape[3] # Height of each filter
kw = kshape[4] # Width of each filter
sd, sh, sw = subsample
if border_mode == 'full':
padd = kd - 1
padh = kh - 1
padw = kw - 1
elif isinstance(border_mode, tuple):
padd, padh, padw = border_mode
else:
assert border_mode == 'valid'
padd = 0
padh = 0
padw = 0
return (
b, nb,
(d + 2*padd - kd)//sd + 1,
(h + 2*padh - kh)//sh + 1,
(w + 2*padw - kw)//sw + 1
)
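As a quick sanity check of the arithmetic in ``get_out_shape``, here is the
same formula restated as a standalone function over concrete integer shapes
(a sketch only; shapes follow the (batch, channels, depth, height, width)
layout documented above):

```python
def conv3d_out_shape(ishape, kshape, border_mode, subsample):
    """Mirror of GpuDnnConv3d.get_out_shape for concrete integers."""
    b, _, d, h, w = ishape        # batch, channels, spatial dims
    nb, _, kd, kh, kw = kshape    # output maps, channels, filter dims
    sd, sh, sw = subsample
    if border_mode == 'full':
        padd, padh, padw = kd - 1, kh - 1, kw - 1
    elif isinstance(border_mode, tuple):
        padd, padh, padw = border_mode
    else:
        assert border_mode == 'valid'
        padd = padh = padw = 0
    # Standard convolution output size: (in + 2*pad - kernel)//stride + 1
    return (b, nb,
            (d + 2 * padd - kd) // sd + 1,
            (h + 2 * padh - kh) // sh + 1,
            (w + 2 * padw - kw) // sw + 1)
```

For example, a 'valid' convolution of an 8x8x8 volume with a 3x3x3 filter and
unit strides yields 6x6x6 output maps, while 'full' mode yields 10x10x10.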
class GpuDnnConvGradW(DnnBase, COp):
"""
The convolution gradient with respect to the weights.
......@@ -506,18 +696,43 @@ class GpuDnnConvGradW(DnnBase, COp):
:param descr: the convolution descriptor
"""
__props__ = ('inplace',)
__props__ = ('algo', 'inplace',)
__input_name__ = ('image', 'grad', 'output', 'descriptor', 'alpha', 'beta')
def __init__(self, inplace=False):
def __init__(self, inplace=False, workmem=None, algo=None):
"""
:param workmem: *deprecated*, use param algo instead
:param algo: either 'none', 'deterministic', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once' or 'time_on_shape_change'.
Default is the value of :attr:`config.dnn.conv.algo_bwd`.
"""
COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_gw.c"],
"APPLY_SPECIFIC(conv_gw)")
if workmem is not None:
warnings.warn(("GpuDnnConvGradW: parameter 'workmem' is "
"deprecated. Use 'algo' instead."), stacklevel=3)
assert algo is None
self.algo = workmem
else:
if algo is None:
algo = config.dnn.conv.algo_bwd
self.algo = algo
self.inplace = inplace
if self.inplace:
self.destroy_map = {0: [2]}
assert self.algo in ['none', 'deterministic', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change']
def __setstate__(self, d):
self.__dict__.update(d)
if not hasattr(self, 'algo'):
if hasattr(self, 'workmem'):
self.algo = self.workmem
else:
self.algo = config.dnn.conv.algo_bwd
if not hasattr(self, 'inplace'):
self.inplace = False
......@@ -527,7 +742,8 @@ class GpuDnnConvGradW(DnnBase, COp):
kerns = gpu_contiguous(kerns)
d_img = GpuDnnConvGradI()(kerns, top, gpu_alloc_empty(*img.shape), desc)
d_img = GpuDnnConvGradI()(kerns, top, gpu_alloc_empty(*img.shape),
desc)
d_top = GpuDnnConv()(img, kerns, gpu_alloc_empty(*top.shape), desc)
d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta)
......@@ -541,9 +757,46 @@ class GpuDnnConvGradW(DnnBase, COp):
def get_op_params(self):
if self.inplace:
return [('CONV_INPLACE', '1')]
inplace_def = [('CONV_INPLACE', '1')]
else:
return []
inplace_def = []
choose_alg = '0'
choose_alg_once = '0'
choose_alg_time = '0'
if version() == -1 or version() < (3000, 3000):
alg = "0"
else:
if self.algo == 'none':
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0'
elif self.algo == 'deterministic':
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1'
elif self.algo == 'fft':
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT'
elif self.algo in ['guess_once', 'guess_on_shape_change']:
# The convolution implementation should be chosen according
# to a heuristic
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0'
choose_alg = '1'
if self.algo == 'guess_once':
choose_alg_once = '1'
elif self.algo in ['time_once', 'time_on_shape_change']:
# The convolution implementation should be chosen according
# to timing
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0'
choose_alg = '1'
choose_alg_time = '1'
if self.algo == 'time_once':
choose_alg_once = '1'
alg_def = ('CONV_ALGO', alg)
alg_choose_def = ('CHOOSE_ALGO', choose_alg)
alg_choose_once_def = ('CHOOSE_ALGO_ONCE', choose_alg_once)
alg_choose_time_def = ('CHOOSE_ALGO_TIME', choose_alg_time)
return inplace_def + [alg_def, alg_choose_def, alg_choose_once_def,
alg_choose_time_def]
def make_node(self, img, topgrad, output, desc, alpha=None, beta=None):
img = as_cuda_ndarray_variable(img)
......@@ -570,6 +823,73 @@ class GpuDnnConvGradW(DnnBase, COp):
return [shape[2]]
class GpuDnnConv3dGradW(GpuDnnConvGradW):
"""
The convolution gradient with respect to the weights.
:param image:
:param kernel:
:param descr: the convolution descriptor
"""
__props__ = ('algo', 'inplace',)
__input_name__ = ('image', 'grad', 'output', 'descriptor', 'alpha', 'beta')
def __init__(self, inplace=False, workmem=None, algo=None):
"""
:param workmem: *deprecated*, use param algo instead
:param algo: either 'none', 'guess_once', 'guess_on_shape_change',
'time_once' or 'time_on_shape_change'.
Default is the value of :attr:`config.dnn.conv.algo_bwd`.
"""
if workmem is not None:
warnings.warn(("GpuDnnConv3dGradW: parameter 'workmem' is "
"deprecated. Use 'algo' instead."), stacklevel=3)
assert algo is None
algo = workmem
super(GpuDnnConv3dGradW, self).__init__(inplace=inplace,
algo=algo if algo is not None else 'none')
assert self.algo in ['none', 'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change']
def grad(self, inp, grads):
img, top, output, desc, alpha, beta = inp
kerns, = grads
kerns = gpu_contiguous(kerns)
d_img = GpuDnnConv3dGradI()(kerns, top, gpu_alloc_empty(*img.shape),
desc)
d_top = GpuDnnConv3d()(img, kerns, gpu_alloc_empty(*top.shape), desc)
d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta)
return (d_img * alpha, d_top * alpha, kerns * beta,
DisconnectedType()(), d_alpha, d_beta)
def make_node(self, img, topgrad, output, desc, alpha=None, beta=None):
img = as_cuda_ndarray_variable(img)
topgrad = as_cuda_ndarray_variable(topgrad)
output = as_cuda_ndarray_variable(output)
if img.type.ndim != 5:
raise TypeError('img must be 5D tensor')
if topgrad.type.ndim != 5:
raise TypeError('topgrad must be 5D tensor')
if output.type.ndim != 5:
raise TypeError('output must be 5D tensor')
if not isinstance(desc.type, CDataType) \
or desc.type.ctype != 'cudnnConvolutionDescriptor_t':
raise TypeError('desc must be cudnnConvolutionDescriptor_t')
alpha = ensure_float(alpha, _one, 'alpha')
beta = ensure_float(beta, _zero, 'beta')
return Apply(self, [img, topgrad, output, desc, alpha, beta],
[output.type()])
class GpuDnnConvGradI(DnnBase, COp):
"""
The convolution gradient with respect to the inputs.
......@@ -579,16 +899,46 @@ class GpuDnnConvGradI(DnnBase, COp):
:param descr: the convolution descriptor
"""
__props__ = ('inplace',)
__input_name__ = ('kernel', 'grad', 'output',
'descriptor', 'alpha', 'beta')
__props__ = ('algo', 'inplace',)
__input_name__ = ('kernel', 'grad', 'output', 'descriptor', 'alpha',
'beta')
def __init__(self, inplace=False):
def __init__(self, inplace=False, workmem=None, algo=None):
"""
:param workmem: *deprecated*, use param algo instead
:param algo: either 'none', 'deterministic', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once' or 'time_on_shape_change'.
Default is the value of :attr:`config.dnn.conv.algo_bwd`.
"""
COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_gi.c"],
"APPLY_SPECIFIC(conv_gi)")
if workmem is not None:
warnings.warn(("GpuDnnConvGradI: parameter 'workmem' is "
"deprecated. Use 'algo' instead."), stacklevel=3)
assert algo is None
self.algo = workmem
else:
if algo is None:
algo = config.dnn.conv.algo_bwd
self.algo = algo
self.inplace = inplace
if self.inplace:
self.destroy_map = {0: [2]}
assert self.algo in ['none', 'deterministic', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change']
def __setstate__(self, d):
self.__dict__.update(d)
if not hasattr(self, 'algo'):
if hasattr(self, 'workmem'):
self.algo = self.workmem
else:
self.algo = config.dnn.conv.algo_bwd
if not hasattr(self, 'inplace'):
self.inplace = False
def grad(self, inp, grads):
kerns, top, output, desc, alpha, beta = inp
......@@ -596,7 +946,8 @@ class GpuDnnConvGradI(DnnBase, COp):
img = gpu_contiguous(img)
d_kerns = GpuDnnConvGradW()(img, top, gpu_alloc_empty(*kerns.shape), desc)
d_kerns = GpuDnnConvGradW()(img, top, gpu_alloc_empty(*kerns.shape),
desc)
d_top = GpuDnnConv()(img, kerns, gpu_alloc_empty(*top.shape), desc)
d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta)
......@@ -610,9 +961,46 @@ class GpuDnnConvGradI(DnnBase, COp):
def get_op_params(self):
if self.inplace:
return [('CONV_INPLACE', '1')]
inplace_def = [('CONV_INPLACE', '1')]
else:
return []
inplace_def = []
choose_alg = '0'
choose_alg_once = '0'
choose_alg_time = '0'
if version() == -1 or version() < (3000, 3000):
alg = "0"
else:
if self.algo == 'none':
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_0'
elif self.algo == 'deterministic':
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_1'
elif self.algo == 'fft':
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT'
elif self.algo in ['guess_once', 'guess_on_shape_change']:
# The convolution implementation should be chosen according
# to a heuristic
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_0'
choose_alg = '1'
if self.algo == 'guess_once':
choose_alg_once = '1'
elif self.algo in ['time_once', 'time_on_shape_change']:
# The convolution implementation should be chosen according
# to timing
alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_0'
choose_alg = '1'
choose_alg_time = '1'
if self.algo == 'time_once':
choose_alg_once = '1'
alg_def = ('CONV_ALGO', alg)
alg_choose_def = ('CHOOSE_ALGO', choose_alg)
alg_choose_once_def = ('CHOOSE_ALGO_ONCE', choose_alg_once)
alg_choose_time_def = ('CHOOSE_ALGO_TIME', choose_alg_time)
return inplace_def + [alg_def, alg_choose_def, alg_choose_once_def,
alg_choose_time_def]
def make_node(self, kern, topgrad, output, desc, alpha=None, beta=None):
kern = as_cuda_ndarray_variable(kern)
......@@ -639,8 +1027,76 @@ class GpuDnnConvGradI(DnnBase, COp):
return [shape[2]]
class GpuDnnConv3dGradI(GpuDnnConvGradI):
"""
The convolution gradient with respect to the inputs.
:param image:
:param kernel:
:param descr: the convolution descriptor
"""
__props__ = ('algo', 'inplace',)
__input_name__ = ('kernel', 'grad', 'output', 'descriptor', 'alpha',
'beta')
def __init__(self, inplace=False, workmem=None, algo=None):
"""
:param workmem: *deprecated*, use param algo instead
:param algo: either 'none', 'guess_once', 'guess_on_shape_change',
'time_once' or 'time_on_shape_change'.
Default is the value of :attr:`config.dnn.conv.algo_bwd`.
"""
if workmem is not None:
warnings.warn(("GpuDnnConv3dGradI: parameter 'workmem' is "
"deprecated. Use 'algo' instead."), stacklevel=3)
assert algo is None
algo = workmem
super(GpuDnnConv3dGradI, self).__init__(inplace=inplace,
algo=algo if algo is not None else 'none')
assert self.algo in ['none', 'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change']
def grad(self, inp, grads):
kerns, top, output, desc, alpha, beta = inp
img, = grads
img = gpu_contiguous(img)
d_kerns = GpuDnnConv3dGradW()(img, top, gpu_alloc_empty(*kerns.shape),
desc)
d_top = GpuDnnConv3d()(img, kerns, gpu_alloc_empty(*top.shape), desc)
d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta)
return (d_kerns * alpha, d_top * alpha, img * beta,
DisconnectedType()(), d_alpha, d_beta)
def make_node(self, kern, topgrad, output, desc, alpha=None, beta=None):
kern = as_cuda_ndarray_variable(kern)
topgrad = as_cuda_ndarray_variable(topgrad)
output = as_cuda_ndarray_variable(output)
if kern.type.ndim != 5:
raise TypeError('kern must be 5D tensor')
if topgrad.type.ndim != 5:
raise TypeError('topgrad must be 5D tensor')
if output.type.ndim != 5:
raise TypeError('output must be 5D tensor')
if not isinstance(desc.type, CDataType) \
or desc.type.ctype != 'cudnnConvolutionDescriptor_t':
raise TypeError('desc must be cudnnConvolutionDescriptor_t')
alpha = ensure_float(alpha, _one, 'alpha')
beta = ensure_float(beta, _zero, 'beta')
return Apply(self, [kern, topgrad, output, desc, alpha, beta],
[output.type()])
def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
conv_mode='conv', direction_hint=None, workmem=None, algo=None):
"""
GPU convolution using cuDNN from NVIDIA.
......@@ -662,19 +1118,31 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
*not* 'forward!', it will use GpuDnnConvGradI.
This parameter is used internally by graph optimizers and may be
removed at any time without a deprecation period. You have been warned.
:param workmem: *deprecated*, use param algo instead
:param algo: convolution implementation to use. One of 'none', 'small',
'large', 'fft', 'guess_once', 'guess_on_shape_change', 'time_once' or
'time_on_shape_change'. Some of these values may require certain
versions of CuDNN to be installed. Default is the value of
:attr:`config.dnn.conv.algo_fwd`.
:warning: The cuDNN library only works with GPUs that have a compute
capability of 3.0 or higher. This means that older GPUs will not
work with this Op.
"""
# Check if deprecated param 'workmem' is used
if workmem is not None:
warnings.warn(("dnn_conv: parameter 'workmem' is deprecated. Use "
"'algo' instead."), stacklevel=3)
assert algo is None
algo = workmem
# Ensure the value of direction_hint is supported
assert direction_hint in [None, 'bprop weights', 'forward']
fgraph = getattr(img, 'fgraph', None) or getattr(kerns, 'fgraph', None)
if (border_mode == 'valid' and subsample == (1, 1) and
direction_hint == 'bprop weights'):
# Special case: We are asked to use GpuDnnConvGradW. We need to set
# up a suitable 'fake' convolution to compute the gradient for.
img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3))
......@@ -686,24 +1154,25 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
shape2 = shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1
shape3 = shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1
out = gpu_alloc_empty(shape_i(kerns, 1, fgraph),
shape_i(img, 1, fgraph), shape2, shape3)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode='cross')(img.shape, out.shape)
conv = GpuDnnConvGradW()(img, kerns, out, desc)
return as_cuda_ndarray_variable(conv.dimshuffle(1, 0, 2, 3))
elif (border_mode == 'full' and subsample == (1, 1) and
direction_hint != 'forward!' and version() == -1):
# Special case: In CuDNN v1, we can be faster by using GpuDnnConvGradI
# to compute the full convolution as the backward pass of a valid
# convolution. We just need to set up a suitable 'fake' valid
# convolution.
img = gpu_contiguous(img) # cudnn v1 and v2 rc3 need contiguous data
kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1
shape3 = shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1
out = gpu_alloc_empty(shape_i(img, 0, fgraph),
shape_i(kerns, 1, fgraph), shape2, shape3)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode=conv_mode)(out.shape, kerns.shape)
return GpuDnnConvGradI()(kerns, img, out, desc)
......@@ -720,7 +1189,87 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
desc_op.border_mode,
desc_op.subsample)
out = gpu_alloc_empty(*out_shp)
return GpuDnnConv(algo=algo)(img, kerns, out, desc)
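The 'valid' and 'full' output-shape arithmetic used by the special cases above (`i - k + 1` and `i + k - 1` per spatial dimension) can be sanity-checked in plain Python. `conv2d_out_shape` is a hypothetical helper written for illustration, not a Theano function:

```python
def conv2d_out_shape(img_shape, kern_shape, border_mode):
    """Spatial output sizes of a 2d convolution in 'bc01' layout for
    the two classic border modes, with subsample (1, 1)."""
    b, _, h, w = img_shape
    n, _, kh, kw = kern_shape
    if border_mode == 'valid':
        return (b, n, h - kh + 1, w - kw + 1)
    if border_mode == 'full':
        return (b, n, h + kh - 1, w + kw - 1)
    raise ValueError(border_mode)

print(conv2d_out_shape((8, 3, 32, 32), (16, 3, 5, 5), 'valid'))  # (8, 16, 28, 28)
print(conv2d_out_shape((8, 3, 32, 32), (16, 3, 5, 5), 'full'))   # (8, 16, 36, 36)
```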
def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1),
conv_mode='conv', direction_hint=None, workmem=None,
algo=None):
"""
GPU convolution using cuDNN from NVIDIA.
The memory layout to use is 'bct01', that is 'batch', 'channel',
'first dim', 'second dim', 'third dim' in that order.
:param img: images to do the convolution over
:param kerns: convolution filters
:param border_mode: one of 'valid', 'full'; additionally, the padding size
could be directly specified by an integer or a pair of integers
:param subsample: perform subsampling of the output (default: (1, 1, 1))
:param conv_mode: perform convolution (kernels flipped) or
cross-correlation. One of 'conv', 'cross'. (default: 'conv')
:param direction_hint: Used by graph optimizers to change algorithm choice.
By default, GpuDnnConv will be used to carry out the convolution.
If border_mode is 'valid', subsample is (1,1,1) and direction_hint is
'bprop weights', it will use GpuDnnConvGradW.
This parameter is used internally by graph optimizers and may be
removed at any time without a deprecation period. You have been warned.
:param workmem: *deprecated*, use param algo instead
:param algo: convolution implementation to use. Only 'none' is implemented
for the conv3d. Default is the value of
:attr:`config.dnn.conv.algo_fwd`.
:warning: The cuDNN library only works with GPUs that have a compute
capability of 3.0 or higher. This means that older GPUs will not
work with this Op.
:warning: dnn_conv3d only works with cuDNN library 3.0
"""
# Check if deprecated param 'workmem' is used
if workmem is not None:
warnings.warn(("dnn_conv3d: parameter 'workmem' is deprecated. Use "
"'algo' instead."), stacklevel=3)
assert algo is None
algo = workmem
# Ensure the value of direction_hint is supported
assert direction_hint in [None, 'bprop weights', 'forward']
fgraph = getattr(img, 'fgraph', None) or getattr(kerns, 'fgraph', None)
if (border_mode == 'valid' and subsample == (1, 1, 1) and
direction_hint == 'bprop weights'):
# Special case: We are asked to use GpuDnnConvGradW. We need to set
# up a suitable 'fake' convolution to compute the gradient for.
img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3, 4))
if conv_mode == 'conv':
# We need to flip manually. These 'kerns' are not the kernels
# that would be flipped by conv_mode='conv' in GpuDnnConvGradW.
kerns = kerns[:, :, ::-1, ::-1, ::-1]
kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3, 4))
shape2 = shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1
shape3 = shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1
shape4 = shape_i(img, 4, fgraph) - shape_i(kerns, 4, fgraph) + 1
out = gpu_alloc_empty(shape_i(kerns, 1, fgraph),
shape_i(img, 1, fgraph), shape2, shape3, shape4)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1, 1),
conv_mode='cross')(img.shape, out.shape)
conv = GpuDnnConv3dGradW()(img, kerns, out, desc)
return as_cuda_ndarray_variable(conv.dimshuffle(1, 0, 2, 3, 4))
# Standard case: We use GpuDnnConv3d with suitable padding.
# gpu_contiguous will return a contiguous copy
# if the img contains negative strides
img = gpu_contiguous(img)
kerns = gpu_contiguous(kerns)
desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)(img.shape, kerns.shape)
desc_op = desc.owner.op
out_shp = GpuDnnConv3d.get_out_shape(img.shape, kerns.shape,
desc_op.border_mode,
desc_op.subsample)
out = gpu_alloc_empty(*out_shp)
return GpuDnnConv3d(algo=algo)(img, kerns, out, desc)
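Assuming the standard cuDNN forward-convolution shape rule per spatial dimension, `(i + 2*p - k) // s + 1` (the authoritative value comes from `GpuDnnConv3d.get_out_shape`), the 'bct01' output shape can be sketched as:

```python
def conv3d_out_shape(img_shape, kern_shape, pad=(0, 0, 0), subsample=(1, 1, 1)):
    """Hypothetical shape helper for a 5d 'bct01' convolution; not part
    of Theano, just the usual (i + 2*p - k) // s + 1 rule."""
    spatial = tuple((i + 2 * p - k) // s + 1
                    for i, k, p, s in zip(img_shape[2:], kern_shape[2:],
                                          pad, subsample))
    return (img_shape[0], kern_shape[0]) + spatial

print(conv3d_out_shape((2, 3, 16, 16, 16), (4, 3, 3, 3, 3)))  # (2, 4, 14, 14, 14)
```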
class GpuDnnPoolDesc(GpuOp):
......@@ -758,14 +1307,20 @@ class GpuDnnPoolDesc(GpuOp):
mode = 'average_inc_pad'
assert mode in ('max', 'average_inc_pad', 'average_exc_pad')
self.mode = mode
assert len(ws) == len(stride) and len(stride) == len(pad)
assert len(ws) in (2, 3)
self.ws = ws
self.stride = stride
self.pad = pad
if (pad[0] != 0 or pad[1] != 0) and version() == -1:
raise RuntimeError("CuDNN pooling with padding requires CuDNN v2")
if self.get_ndim() == 3 and version() < (3000, 3000):
raise RuntimeError("CuDNN 3d pooling requires CuDNN v3")
def get_ndim(self):
return len(self.ws)
def __setstate__(self, d):
self.__dict__.update(d)
......@@ -803,22 +1358,14 @@ class GpuDnnPoolDesc(GpuOp):
"descriptor: %%s", cudnnGetErrorString(err));
%(fail)s
}
{
int win[%(nd)d] = {%(win)s};
int pad[%(nd)d] = {%(pad)s};
int str[%(nd)d] = {%(str)s};
err = cudnnSetPoolingNdDescriptor(
%(desc)s, %(mode_flag)s, %(nd)d,
win, pad, str);
}
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: %%s",
cudnnGetErrorString(err));
......@@ -826,46 +1373,49 @@ class GpuDnnPoolDesc(GpuOp):
}
}
""" % dict(name=name, desc=desc, mode_flag=mode_flag, fail=sub['fail'],
nd=self.get_ndim(), win=', '.join(str(w) for w in self.ws),
pad=', '.join(str(p) for p in self.pad),
str=', '.join(str(s) for s in self.stride))
def c_code_cache_version(self):
return (3, version())
class GpuDnnPool(DnnBase):
"""
Pooling.
:param img: the image 4d or 5d tensor.
:param desc: the pooling descriptor.
"""
__props__ = ()
def make_node(self, img, desc):
img = as_cuda_ndarray_variable(img)
if not isinstance(desc.type, CDataType) \
or desc.type.ctype != 'cudnnPoolingDescriptor_t':
raise TypeError('desc must be cudnnPoolingDescriptor_t')
dop = desc.owner.op
e_ndim = dop.get_ndim() + 2 # 4 or 5
if img.type.ndim != e_ndim:
raise TypeError('img must be %dD tensor' % e_ndim)
return Apply(self, [img, desc], [img.type()])
def infer_shape(self, node, shape):
desc = node.inputs[1].owner.op
nd = desc.get_ndim()
w = desc.ws
s = desc.stride
p = desc.pad
ret = [shape[0][0], shape[0][1],
(shape[0][2] + 2 * p[0] - w[0]) // s[0] + 1,
(shape[0][3] + 2 * p[1] - w[1]) // s[1] + 1]
if nd == 3:
ret.append((shape[0][4] + 2 * p[2] - w[2]) // s[2] + 1)
return [ret]
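The pooling shape rule in `infer_shape`, `(i + 2*p - w) // s + 1` per spatial dimension, is easy to check in isolation; `pool_out_shape` below is an illustrative mirror of that logic, not Theano API:

```python
def pool_out_shape(in_shape, ws, stride, pad):
    """Batch and channel dims pass through; each spatial dim shrinks
    to (i + 2*p - w) // s + 1, as in GpuDnnPool.infer_shape."""
    out = list(in_shape[:2])
    for i, w, s, p in zip(in_shape[2:], ws, stride, pad):
        out.append((i + 2 * p - w) // s + 1)
    return tuple(out)

# 2d: 3x3 window, stride 2, no padding
print(pool_out_shape((1, 1, 8, 8), (3, 3), (2, 2), (0, 0)))  # (1, 1, 3, 3)
# 3d pooling simply adds one more spatial dim
print(pool_out_shape((1, 1, 8, 8, 8), (2, 2, 2), (2, 2, 2), (0, 0, 0)))  # (1, 1, 4, 4, 4)
```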
def c_support_code_struct(self, node, name):
return """
......@@ -879,12 +1429,12 @@ cudnnStatus_t err%(name)s;
input%(name)s = NULL;
output%(name)s = NULL;
if ((err%(name)s = cudnnCreateTensorDescriptor(&input%(name)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
"(inp): %%s", cudnnGetErrorString(err%(name)s));
%(fail)s
}
if ((err%(name)s = cudnnCreateTensorDescriptor(&output%(name)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
"(out): %%s", cudnnGetErrorString(err%(name)s));
%(fail)s
}
......@@ -900,65 +1450,51 @@ if (output%(name)s != NULL) { cudnnDestroyTensorDescriptor(output%(name)s); }
desc = inputs[1]
out, = outputs
set_in = c_set_tensor4d(inputs[0], "input" + str(name),
'err' + name, sub['fail'])
set_out = c_set_tensor4d(out, "output" + str(name),
'err' + name, sub['fail'])
return """
cudnnStatus_t err%(name)s;
int %(out)s_dims[5];
if (!CudaNdarray_is_c_contiguous(%(input)s)) {
PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
%(fail)s
}
if (c_set_tensorNd(%(input)s, %(input_desc)s) != 0)
%(fail)s
cudnnPoolingMode_t mode;
int win[3];
int pad[3];
int str[3];
int ndims;
err%(name)s = cudnnGetPoolingNdDescriptor(
%(desc)s, 3,
&mode, &ndims,
win, pad, str);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnPool: error doing cudnnGetPoolingDescriptor operation: %%s",
"GpuDnnPool: error doing cudnnGetPoolingNdDescriptor operation: %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
%(out)s_dims[0] = CudaNdarray_HOST_DIMS(%(input)s)[0];
%(out)s_dims[1] = CudaNdarray_HOST_DIMS(%(input)s)[1];
%(out)s_dims[2] = (CudaNdarray_HOST_DIMS(%(input)s)[2] + (pad[0]*2) - win[0]) / str[0] + 1;
%(out)s_dims[3] = (CudaNdarray_HOST_DIMS(%(input)s)[3] + (pad[1]*2) - win[1]) / str[1] + 1;
if (ndims == 3)
%(out)s_dims[4] = (CudaNdarray_HOST_DIMS(%(input)s)[4] + (pad[2]*2) - win[2]) / str[2] + 1;
if (CudaNdarray_prep_output(&%(out)s, ndims+2, %(out)s_dims) != 0)
{
%(fail)s
}
if (c_set_tensorNd(%(out)s, %(output_desc)s) != 0)
%(fail)s
{
const float alpha = 1;
const float beta = 0;
......@@ -971,7 +1507,6 @@ _handle,
%(output_desc)s, CudaNdarray_DEV_DATA(%(out)s)
);
}
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnPool: error doing cudnnPoolingForward operation: %%s",
......@@ -979,8 +1514,7 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
%(fail)s
}
""" % dict(out=out, desc=desc, fail=sub['fail'],
name=name, input=inputs[0],
input_desc="input"+name,
output_desc="output"+name)
......@@ -1001,7 +1535,7 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
return [[1], [0]]
def c_code_cache_version(self):
return (7, version())
class GpuDnnPoolGrad(DnnBase):
......@@ -1010,27 +1544,30 @@ class GpuDnnPoolGrad(DnnBase):
:param inp: the input of the pooling.
:param out: the output of the pooling in the forward.
:param inp_grad: same size as out, but is the corresponding gradient
information.
:param desc: The pooling descriptor.
"""
__props__ = ()
def make_node(self, inp, out, inp_grad, desc):
if not isinstance(desc.type, CDataType) \
or desc.type.ctype != 'cudnnPoolingDescriptor_t':
raise TypeError('desc must be cudnnPoolingDescriptor_t')
nd = desc.owner.op.get_ndim() + 2 # 4 or 5
inp = as_cuda_ndarray_variable(inp)
if inp.type.ndim != nd:
raise TypeError('inp must be %dD tensor' % (nd,))
inp_grad = as_cuda_ndarray_variable(inp_grad)
if inp_grad.type.ndim != nd:
raise TypeError('inp_grad must be %dD tensor' % (nd,))
out = as_cuda_ndarray_variable(out)
if out.type.ndim != nd:
raise TypeError('out must be %dD tensor' % (nd,))
return Apply(self, [inp, out, inp_grad, desc],
[inp.type()])
......@@ -1091,18 +1628,6 @@ if (output_grad%(name)s != NULL) { cudnnDestroyTensorDescriptor(output_grad%(nam
out, inp, inp_grad, desc = inputs
out_grad, = outputs
set_in = "\n".join([
c_set_tensor4d(inp, "input" + name,
'err' + name, sub['fail']),
c_set_tensor4d(inp_grad, "input_grad" + name,
'err' + name, sub['fail']),
c_set_tensor4d(out, "output" + name,
'err' + name, sub['fail'])
])
set_out = c_set_tensor4d(out, "output_grad" + name,
'err' + name, sub['fail'])
return """
cudnnStatus_t err%(name)s;
......@@ -1124,25 +1649,35 @@ if (!CudaNdarray_is_c_contiguous(%(output)s)) {
%(fail)s
}
if (c_set_tensorNd(%(input)s, %(input_desc)s) != 0)
%(fail)s
if (c_set_tensorNd(%(input_grad)s, %(input_grad_desc)s) != 0)
%(fail)s
if (c_set_tensorNd(%(output)s, %(output_desc)s) != 0)
%(fail)s
if (CudaNdarray_prep_output(&%(output_grad)s,
%(output)s->nd,
CudaNdarray_HOST_DIMS(%(output)s)) != 0)
{
%(fail)s
}
// Get the pooling_mode to be used. Variable 'tmp' is used because we don't
// care about the other outputs of the function
cudnnPoolingMode_t pooling_mode;
int tmp;
err%(name)s = cudnnGetPoolingNdDescriptor(%(desc)s, 0, &pooling_mode, &tmp,
&tmp, &tmp, &tmp);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnPoolGrad: could not obtain pooling mode");
%(fail)s
}
if (c_set_tensorNd(%(output_grad)s, %(output_grad_desc)s) != 0)
%(fail)s
{
const float alpha = 1;
const float beta = 0;
......@@ -1157,45 +1692,22 @@ _handle,
%(output_grad_desc)s, CudaNdarray_DEV_DATA(%(output_grad)s)
);
}
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnPoolGrad: error doing operation: %%s. "
"input.shape=(%%d, %%d, %%d, %%d) "
"input_grad.shape=(%%d, %%d, %%d, %%d) "
"output.shape=(%%d, %%d, %%d, %%d) "
"output_grad.shape=(%%d, %%d, %%d, %%d)",
cudnnGetErrorString(err%(name)s),
CudaNdarray_HOST_DIMS(%(input)s)[0],
CudaNdarray_HOST_DIMS(%(input)s)[1],
CudaNdarray_HOST_DIMS(%(input)s)[2],
CudaNdarray_HOST_DIMS(%(input)s)[3],
CudaNdarray_HOST_DIMS(%(input_grad)s)[0],
CudaNdarray_HOST_DIMS(%(input_grad)s)[1],
CudaNdarray_HOST_DIMS(%(input_grad)s)[2],
CudaNdarray_HOST_DIMS(%(input_grad)s)[3],
CudaNdarray_HOST_DIMS(%(output)s)[0],
CudaNdarray_HOST_DIMS(%(output)s)[1],
CudaNdarray_HOST_DIMS(%(output)s)[2],
CudaNdarray_HOST_DIMS(%(output)s)[3],
CudaNdarray_HOST_DIMS(%(output_grad)s)[0],
CudaNdarray_HOST_DIMS(%(output_grad)s)[1],
CudaNdarray_HOST_DIMS(%(output_grad)s)[2],
CudaNdarray_HOST_DIMS(%(output_grad)s)[3]
);
%(fail)s
"GpuDnnPoolGrad: error doing operation: %%s.",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
""" % dict(output_grad=out_grad, desc=desc,
fail=sub['fail'], name=name,
input=inp, input_grad=inp_grad, output=out,
input_desc="input"+name,
input_grad_desc="input_grad"+name,
output_desc="output"+name,
output_grad_desc="output_grad"+name)
def c_code_cache_version(self):
return (7, version())
def infer_shape(self, node, shape):
return [shape[0]]
......@@ -1216,6 +1728,9 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
:param pad: (padX, padY) padding information.
padX is the size of the left and right borders,
padY is the size of the top and bottom borders.
:param nd: dimensionality of the pooling: 2 for 2d or 3 for 3d pooling.
If set to 3, all other parameters (except mode) must have an extra
dimension to match. 3 is only available with cuDNN v3.
:warning: The cuDNN library only works with GPUs that have a compute
capability of 3.0 or higher. This means that older GPUs will not
......@@ -1232,8 +1747,9 @@ class GpuDnnSoftmaxBase(DnnBase):
Op for the cuDNN Softmax.
:param tensor_format: Whether the data format is 'bc01' or 'b01c'.
:param algo: 'fast', 'accurate' or 'log' indicating whether, respectively,
computations should be optimized for speed, for accuracy, or if CuDNN
should rather compute the log-softmax instead.
:param mode: 'instance' or 'channel' indicating whether the softmax should
be computed per image across 'c01' or per spatial location '01' per
image across 'c'.
......@@ -1246,7 +1762,10 @@ class GpuDnnSoftmaxBase(DnnBase):
DnnBase.__init__(self)
self.tensor_format = tensor_format
if algo == 'log' and version() < (3000, 3000):
raise RuntimeError("CuDNN log-softmax requires CuDNN v3")
assert(algo in ('fast', 'accurate', 'log'))
self.algo = algo
assert(mode in ('instance', 'channel'))
......@@ -1319,9 +1838,11 @@ cudnnStatus_t err%(name)s;
mode = 0
if self.algo == 'fast':
algo = "CUDNN_SOFTMAX_FAST"
elif self.algo == "log":
algo = "CUDNN_SOFTMAX_LOG"
else:
algo = "CUDNN_SOFTMAX_ACCURATE"
# Setup configuration variables.
result = """
......@@ -1330,9 +1851,7 @@ cudnnTensorFormat_t format%(name)s = CUDNN_TENSOR_NCHW;
if (%(tensor_format)d == 1)
format%(name)s = CUDNN_TENSOR_NHWC;
cudnnSoftmaxAlgorithm_t algo%(name)s = %(algo)s;
cudnnSoftmaxMode_t mode%(name)s = CUDNN_SOFTMAX_MODE_CHANNEL;
if (%(mode)d == 1)
......@@ -1557,7 +2076,7 @@ if True:
isinstance(dest.owner.op, GpuAllocEmpty) and
len(dest.clients) > 1):
inputs[2] = gpu_alloc_empty(*dest.owner.inputs)
return [GpuDnnConv(algo=node.op.algo, inplace=True)(*inputs)]
@local_optimizer([GpuDnnConvGradW], inplace=True)
def local_dnn_convgw_inplace(node):
......@@ -1595,7 +2114,7 @@ if True:
def local_dnn_conv_alpha_merge(node, *inputs):
if not dnn_available() or version() == -1:
return None
return [GpuDnnConv(algo=node.op.algo)(*inputs)]
@register_opt('cudnn')
@alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, nd=4)
......@@ -1615,7 +2134,7 @@ if True:
@output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2, nd=4)
def local_dnn_conv_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConv(algo=node.op.algo)(*inputs)]
@register_opt('cudnn')
@output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2, nd=4)
......@@ -1713,6 +2232,26 @@ if True:
out = as_cuda_ndarray_variable(out.dimshuffle(0, 1))
return [out]
@register_opt('cudnn')
@local_optimizer([GpuElemwise])
def local_log_softmax_dnn(node):
# The log-softmax implementation is only available starting at CuDNN V3
if not dnn_available() or version() < (3000, 3000):
return
if (isinstance(node.op, GpuElemwise) and
isinstance(node.op.scalar_op, Log) and
node.inputs[0].owner and
isinstance(node.inputs[0].owner.op, GpuDnnSoftmax) and
len(node.inputs[0].owner.out.clients) == 1):
log_input = node.inputs[0]
softmax_node = log_input.owner
new_softmax_node = GpuDnnSoftmax(softmax_node.op.tensor_format,
'log', softmax_node.op.mode)
new_log_softmax = new_softmax_node(softmax_node.inputs[0])
return [new_log_softmax]
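The rewrite above is valid because log-softmax can be computed directly (and more stably) instead of as log applied to a softmax. A pure-Python check of that identity, independent of Theano and CuDNN:

```python
import math

def softmax(xs):
    m = max(xs)
    exps = [math.exp(x - m) for x in xs]
    total = sum(exps)
    return [e / total for e in exps]

def log_softmax(xs):
    # Stable form: x - max(xs) - log(sum(exp(x - max(xs))))
    m = max(xs)
    lse = math.log(sum(math.exp(x - m) for x in xs))
    return [x - m - lse for x in xs]

xs = [0.5, -1.0, 2.0]
assert all(math.isclose(a, math.log(b))
           for a, b in zip(log_softmax(xs), softmax(xs)))
```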
class NoCuDNNRaise(Optimizer):
def apply(self, fgraph):
""" Raise a RuntimeError if cudnn can't be used"""
......@@ -1730,8 +2269,8 @@ if True:
def local_softmax_dnn_grad(node):
if (isinstance(node.op, SoftmaxGrad) and
((node.inputs[0].owner and
isinstance(node.inputs[0].owner.op, HostFromGpu)) or
(node.inputs[1].owner and
isinstance(node.inputs[1].owner.op, HostFromGpu)))):
if not dnn_available():
return
......
#section support_code
static cudnnHandle_t _handle = NULL;
static int
c_set_tensorNd(CudaNdarray *var, cudnnTensorDescriptor_t desc) {
int dim = CudaNdarray_NDIM(var);
int strides[dim];
int default_str = 1;
for (int i = dim-1; i >= 0; i--)
{
if (CudaNdarray_HOST_STRIDES(var)[i])
strides[i] = CudaNdarray_HOST_STRIDES(var)[i];
else
strides[i] = default_str;
default_str *= CudaNdarray_HOST_DIMS(var)[i];
}
cudnnStatus_t err = cudnnSetTensorNdDescriptor(desc, CUDNN_DATA_FLOAT, dim,
CudaNdarray_HOST_DIMS(var),
strides);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not set tensor4d descriptor: %s"
"shapes=%d %d %d %d strides=%d %d %d %d",
cudnnGetErrorString(err),
CudaNdarray_HOST_DIMS(var)[0],
CudaNdarray_HOST_DIMS(var)[1],
CudaNdarray_HOST_DIMS(var)[2],
CudaNdarray_HOST_DIMS(var)[3],
CudaNdarray_HOST_STRIDES(var)[0]?CudaNdarray_HOST_STRIDES(var)[0]:CudaNdarray_HOST_DIMS(var)[2]*CudaNdarray_HOST_DIMS(var)[3]*CudaNdarray_HOST_DIMS(var)[1],
CudaNdarray_HOST_STRIDES(var)[1]?CudaNdarray_HOST_STRIDES(var)[1]:CudaNdarray_HOST_DIMS(var)[2]*CudaNdarray_HOST_DIMS(var)[3],
CudaNdarray_HOST_STRIDES(var)[2]?CudaNdarray_HOST_STRIDES(var)[2]:CudaNdarray_HOST_DIMS(var)[3],
CudaNdarray_HOST_STRIDES(var)[3]?CudaNdarray_HOST_STRIDES(var)[3]:1
);
"Could not set tensorNd descriptor: %s"
"dim=%d",
cudnnGetErrorString(err), dim);
return -1;
}
return 0;
}
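The default-stride loop in `c_set_tensorNd` above fills in, right to left, the stride a C-contiguous array would have wherever the stored stride is 0 (broadcastable dims). A Python sketch of the same loop; `default_strides` is an illustrative name only:

```python
def default_strides(dims, strides):
    """Walk dims right-to-left; replace each zero stride with the
    running C-contiguous stride, as in the loop in c_set_tensorNd."""
    out = list(strides)
    default_str = 1
    for i in range(len(dims) - 1, -1, -1):
        if not out[i]:
            out[i] = default_str
        default_str *= dims[i]
    return out

print(default_strides((2, 3, 4, 5), (0, 0, 0, 0)))  # [60, 20, 5, 1]
print(default_strides((4, 4), (16, 0)))             # [16, 1]
```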
static int
c_set_filterNd(CudaNdarray *var, cudnnFilterDescriptor_t desc) {
if (!CudaNdarray_is_c_contiguous(var)) {
PyErr_SetString(PyExc_ValueError,
"Only contiguous filters (kernels) are supported.");
return -1;
}
int dim = CudaNdarray_NDIM(var);
cudnnStatus_t err = cudnnSetFilterNdDescriptor(desc, CUDNN_DATA_FLOAT, dim,
CudaNdarray_HOST_DIMS(var));
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not set filter descriptor: %s."
" dims= %d %d %d %d",
cudnnGetErrorString(err),
CudaNdarray_HOST_DIMS(var)[0],
CudaNdarray_HOST_DIMS(var)[1],
CudaNdarray_HOST_DIMS(var)[2],
CudaNdarray_HOST_DIMS(var)[3]);
" dims= %d",
cudnnGetErrorString(err), dim);
return -1;
}
return 0;
......
......@@ -3,6 +3,24 @@ cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
cudnnFilterDescriptor_t APPLY_SPECIFIC(kerns);
/* Keep track, from one execution to another, of the dimension of the data
and the algorithms, if any, that were selected according to these dimensions
and according to the amount of memory available at that time.
Note: implementation selection for backward convolution only exists
starting at cuDNN v3.
*/
int APPLY_SPECIFIC(previous_input_shape)[5];
int APPLY_SPECIFIC(previous_kerns_shape)[5];
int APPLY_SPECIFIC(previous_output_shape)[5];
bool APPLY_SPECIFIC(previous_algo_set);
cudnnConvolutionFwdAlgo_t APPLY_SPECIFIC(previous_algo);
#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 3000
cudnnConvolutionBwdFilterAlgo_t APPLY_SPECIFIC(previous_bwd_f_algo);
cudnnConvolutionBwdDataAlgo_t APPLY_SPECIFIC(previous_bwd_d_algo);
#endif
#section init_code_struct
cudnnStatus_t APPLY_SPECIFIC(err);
......@@ -10,21 +28,38 @@ APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(output) = NULL;
APPLY_SPECIFIC(kerns) = NULL;
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
"(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL;
}
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
"(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL;
}
if ((APPLY_SPECIFIC(err) = cudnnCreateFilterDescriptor(&APPLY_SPECIFIC(kerns))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate filter descriptor: %s",
PyErr_Format(PyExc_MemoryError, "could not allocate filter descriptor: %s",
cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL;
}
for (int i = 0; i < 5; i++)
{
APPLY_SPECIFIC(previous_input_shape)[i] = 0;
APPLY_SPECIFIC(previous_kerns_shape)[i] = 0;
APPLY_SPECIFIC(previous_output_shape)[i] = 0;
}
APPLY_SPECIFIC(previous_algo_set) = false;
// Select default implementations for the case where the convolution
// implementations should be selected based on the size of the data.
APPLY_SPECIFIC(previous_algo) = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 3000
APPLY_SPECIFIC(previous_bwd_f_algo) = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
APPLY_SPECIFIC(previous_bwd_d_algo) = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
#endif
#section cleanup_code_struct
if (APPLY_SPECIFIC(input) != NULL)
......
......@@ -4,42 +4,235 @@ int
APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
CudaNdarray *om, cudnnConvolutionDescriptor_t desc,
float alpha, float beta, CudaNdarray **output) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (CudaNdarray_HOST_DIMS(input)[1] != CudaNdarray_HOST_DIMS(kerns)[1]) {
PyErr_SetString(PyExc_ValueError,
"GpuDnnConv images and kernel must have the same stack size\n");
"GpuDnnConv images and kernel must have the same stack size\n");
return 1;
}
if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
return 1;
if (c_set_filterNd(kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;
int nb_dim = CudaNdarray_NDIM(input);
#ifdef CONV_INPLACE
Py_XDECREF(*output);
*output = om;
Py_INCREF(*output);
#else
if (CudaNdarray_prep_output(output, nb_dim, CudaNdarray_HOST_DIMS(om)) != 0)
return 1;
if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*output, om))
return 1;
#endif
if (c_set_tensorNd(*output, APPLY_SPECIFIC(output)) == -1)
return 1;
{
size_t worksize;
void *workspace;
cudnnConvolutionFwdAlgo_t chosen_algo;
if (CHOOSE_ALGO)
{
// A new convolution implementation should be selected, based either on
// timing or on heuristics, in one of the two following cases:
// - The implementation should only be chosen during the first execution
// of an apply node and this is the first execution of the apply node.
// - The implementation should be chosen as often as necessary and the
// shapes of the inputs differ from the last time an implementation
// was chosen.
bool reuse_previous_algo;
if (CHOOSE_ALGO_ONCE)
{
// Only choose a new implementation if none has been chosen before.
reuse_previous_algo = APPLY_SPECIFIC(previous_algo_set);
}
else
{
// Reuse the previous implementation if the inputs and the kernels
// have the same shapes as they had when the previous implementation
// was selected
bool same_shapes = true;
for (int i = 0; (i < nb_dim) && same_shapes; i++)
{
same_shapes &= (CudaNdarray_HOST_DIMS(input)[i] ==
APPLY_SPECIFIC(previous_input_shape)[i]);
same_shapes &= (CudaNdarray_HOST_DIMS(kerns)[i] ==
APPLY_SPECIFIC(previous_kerns_shape)[i]);
}
reuse_previous_algo = same_shapes;
}
// If the previously chosen implementation can't be reused, select a
// new one based on the shapes of the current inputs
if (!reuse_previous_algo)
{
// Obtain a convolution algorithm appropriate for the input and kernel
// shapes. Either by choosing one according to heuristics or by making
// CuDNN time every implementation and choose the best one.
if (CHOOSE_ALGO_TIME)
{
#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 3000
// Time the different implementations to choose the best one
int requestedCount = 1;
int count;
cudnnConvolutionFwdAlgoPerf_t chosen_algo_perf;
err = cudnnFindConvolutionForwardAlgorithm(_handle,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns),
desc,
APPLY_SPECIFIC(output),
requestedCount,
&count,
&chosen_algo_perf);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConv: error selecting convolution algo: %s",
cudnnGetErrorString(err));
return 1;
}
chosen_algo = chosen_algo_perf.algo;
#endif
}
else
{
// The implementation should be chosen using heuristics based on the
// input shapes and the amount of memory available.
// Get the amount of available memory
size_t free = 0, total = 0;
cudaError_t err2 = cudaMemGetInfo(&free, &total);
if (err2 != cudaSuccess){
cudaGetLastError();
fprintf(stderr,
"Error when trying to find the memory information"
" on the GPU: %s\n", cudaGetErrorString(err2));
return 1;
}
// Use heuristics to choose the implementation
err = cudnnGetConvolutionForwardAlgorithm(_handle,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns),
desc,
APPLY_SPECIFIC(output),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
free,
&chosen_algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConv: error selecting convolution algo: %s",
cudnnGetErrorString(err));
return 1;
}
}
// Store the shapes of the inputs and kernels as well as the chosen
// algorithm for future use.
APPLY_SPECIFIC(previous_algo) = chosen_algo;
for (int i = 0; i < nb_dim; i++)
{
APPLY_SPECIFIC(previous_input_shape)[i] =
CudaNdarray_HOST_DIMS(input)[i];
APPLY_SPECIFIC(previous_kerns_shape)[i] =
CudaNdarray_HOST_DIMS(kerns)[i];
}
}
else
{
// Reuse the previously chosen convolution implementation
chosen_algo = APPLY_SPECIFIC(previous_algo);
}
}
else
{
chosen_algo = CONV_ALGO;
}
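The selection policy implemented above (choose once vs. re-choose whenever the shapes change) can be modeled in a few lines of Python. This is an illustrative sketch of the control flow only — `pick_algo` and `AlgoCache` are invented names, not the Theano or cuDNN API:

```python
def pick_algo(input_shape, kern_shape):
    # Stand-in for cudnnGetConvolutionForwardAlgorithm heuristics:
    # just tag the algorithm with the shapes it was chosen for.
    return ("algo_for", input_shape, kern_shape)

class AlgoCache:
    def __init__(self, choose_once):
        # choose_once=True models guess_once/time_once;
        # choose_once=False models guess_on_shape_change/time_on_shape_change.
        self.choose_once = choose_once
        self.prev_shapes = None
        self.prev_algo = None

    def get(self, input_shape, kern_shape):
        if self.choose_once:
            # Reuse as soon as any algorithm has been chosen.
            reuse = self.prev_algo is not None
        else:
            # Reuse only if the shapes match the previous execution.
            reuse = self.prev_shapes == (input_shape, kern_shape)
        if not reuse:
            self.prev_algo = pick_algo(input_shape, kern_shape)
            self.prev_shapes = (input_shape, kern_shape)
        return self.prev_algo
```

With `choose_once=True` the first algorithm is kept even after a shape change, which is exactly why the C code also stores the input and kernel shapes in the shape-change variants.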
#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 3000
// The FFT implementation (only in V3 and onward) does not support strides,
// 1x1 filters or inputs with a spatial dimension larger than 1024.
// If the chosen implementation is FFT, validate that it can be used
// on the current data and fall back to a safe implementation if it
// can't.
// The following code is 2d-specific, but that is fine since the FFT
// implementation is only defined for 2d filters.
if (chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT && nb_dim == 4)
{
// Extract the properties of the convolution descriptor
int pad_h, pad_w, stride_v, stride_h, upscale_x, upscale_y;
cudnnConvolutionMode_t mode;
err = cudnnGetConvolution2dDescriptor(desc, &pad_h, &pad_w,
&stride_v, &stride_h,
&upscale_x, &upscale_y,
&mode);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConv: error getting convolution properties: %s",
cudnnGetErrorString(err));
return 1;
}
// Extract the spatial size of the filters
int filter_h = CudaNdarray_HOST_DIMS(kerns)[2];
int filter_w = CudaNdarray_HOST_DIMS(kerns)[3];
// Extract the spatial size of the input
int input_h = CudaNdarray_HOST_DIMS(input)[2];
int input_w = CudaNdarray_HOST_DIMS(input)[3];
// Ensure that the selected implementation supports the requested
// convolution. Fall back to a safe implementation otherwise.
if (stride_v != 1 || stride_h != 1 || input_h > 1024 ||
input_w > 1024 || (filter_h == 1 && filter_w == 1))
{
chosen_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
}
}
#endif
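The FFT-applicability test above reduces to a small predicate. A sketch in Python, mirroring the conditions checked in the C code (the function name is invented for illustration):

```python
def fft_fwd_usable(stride_v, stride_h, input_h, input_w, filter_h, filter_w):
    # cuDNN v3's FFT convolution supports neither strided convolutions,
    # nor 1x1 filters, nor spatial dimensions larger than 1024.
    if stride_v != 1 or stride_h != 1:
        return False
    if input_h > 1024 or input_w > 1024:
        return False
    if filter_h == 1 and filter_w == 1:
        return False
    return True
```

When the predicate fails, the code above falls back to `CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM`, which has no such restrictions.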
#if defined(CUDNN_VERSION) && CUDNN_VERSION < 3000
// In versions before V3, CuDNN did not support kernels larger than the
// inputs in any spatial dimension, even if padding was used such that the
// padded inputs were larger than the kernels. If the kernels are larger
// than the inputs, raise an error.
bool shape_mismatch = false;
for (int i=2; i < nb_dim; i++){
shape_mismatch = shape_mismatch || (CudaNdarray_HOST_DIMS(kerns)[i] >
CudaNdarray_HOST_DIMS(input)[i]);
}
if (shape_mismatch){
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConv: the current version of CuDNN does not support "
"kernels larger than the inputs in any spatial dimension, "
"even if the inputs are padded such that the padded inputs "
"are larger than the kernels. Update your installation of "
"CuDNN to V3 or more recent to solve the issue.");
return 1;
}
#endif
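The pre-v3 limitation checked above amounts to comparing every spatial dimension of the kernels against the unpadded inputs. A hypothetical Python equivalent (name invented):

```python
def kernels_fit_pre_v3(input_dims, kern_dims):
    # Before cuDNN v3, kernels could not exceed the unpadded inputs in
    # any spatial dimension (dimensions 2 and onward for b, c, ... layouts).
    return all(k <= i for i, k in zip(input_dims[2:], kern_dims[2:]))
```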
err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns),
desc,
APPLY_SPECIFIC(output),
CONV_ALGO,
chosen_algo,
&worksize);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
@@ -47,7 +240,6 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
cudnnGetErrorString(err));
return 1;
}
workspace = get_work_mem(worksize);
if (workspace == NULL && worksize != 0)
return 1;
@@ -58,7 +250,7 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(kerns),
desc,
CONV_ALGO,
chosen_algo,
workspace, worksize,
(void *)&beta,
APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(*output));
@@ -12,25 +12,225 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
return 1;
}
if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1)
if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
if (c_set_filterNd(kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;
int nb_dim = CudaNdarray_NDIM(output);
#ifdef CONV_INPLACE
Py_XDECREF(*input);
*input = im;
Py_INCREF(*input);
#else
if (CudaNdarray_prep_output(input, 4, CudaNdarray_HOST_DIMS(im)) != 0)
if (CudaNdarray_prep_output(input, nb_dim, CudaNdarray_HOST_DIMS(im)) != 0)
return 1;
if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*input, im))
return 1;
#endif
if (c_set_tensor4d(*input, APPLY_SPECIFIC(input)) == -1)
if (c_set_tensorNd(*input, APPLY_SPECIFIC(input)) == -1)
return 1;
#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 3000
{
size_t worksize;
void *workspace;
cudnnConvolutionBwdDataAlgo_t chosen_algo;
if (CHOOSE_ALGO)
{
// A new convolution implementation should be selected, based either on
// timing or heuristics, in one of the two following cases:
// - The implementation should only be chosen during the first execution
// of an apply node and this is the first execution of the apply node.
// - The implementation should be chosen as often as necessary and the
// shapes of the inputs differ from the last time an implementation
// was chosen.
bool reuse_previous_algo;
if (CHOOSE_ALGO_ONCE)
{
// Only choose a new implementation if none has been chosen before.
reuse_previous_algo = APPLY_SPECIFIC(previous_algo_set);
}
else
{
// Reuse the previous implementation if the kernels and the outputs
// have the same shapes as they had when the previous implementation
// was selected
bool same_shapes = true;
for (int i = 0; (i < nb_dim) && same_shapes; i++)
{
same_shapes &= (CudaNdarray_HOST_DIMS(kerns)[i] ==
APPLY_SPECIFIC(previous_kerns_shape)[i]);
same_shapes &= (CudaNdarray_HOST_DIMS(output)[i] ==
APPLY_SPECIFIC(previous_output_shape)[i]);
}
reuse_previous_algo = same_shapes;
}
// If the previously chosen implementation can't be reused, select a
// new one based on the shapes of the current inputs
if (!reuse_previous_algo)
{
// Obtain a convolution algorithm appropriate for the kernel and output
// shapes. Either by choosing one according to heuristics or by making
// CuDNN time every implementation and choose the best one.
if (CHOOSE_ALGO_TIME)
{
// Time the different implementations to choose the best one
int requestedCount = 1;
int count;
cudnnConvolutionBwdDataAlgoPerf_t chosen_algo_perf;
err = cudnnFindConvolutionBackwardDataAlgorithm(_handle,
APPLY_SPECIFIC(kerns),
APPLY_SPECIFIC(output),
desc,
APPLY_SPECIFIC(input),
requestedCount,
&count,
&chosen_algo_perf);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConvGradI: error selecting convolution algo: "
"%s", cudnnGetErrorString(err));
return 1;
}
chosen_algo = chosen_algo_perf.algo;
}
else
{
// Choose the convolution implementation using heuristics based on the
// shapes of the inputs and the amount of memory available.
// Get the amount of available memory
size_t free = 0, total = 0;
cudaError_t err2 = cudaMemGetInfo(&free, &total);
if (err2 != cudaSuccess){
cudaGetLastError();
fprintf(stderr,
"Error when trying to find the memory information"
" on the GPU: %s\n", cudaGetErrorString(err2));
return 1;
}
// Use heuristics to choose the implementation
err = cudnnGetConvolutionBackwardDataAlgorithm(_handle,
APPLY_SPECIFIC(kerns),
APPLY_SPECIFIC(output),
desc,
APPLY_SPECIFIC(input),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
free,
&chosen_algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConvGradI: error selecting convolution algo: %s",
cudnnGetErrorString(err));
return 1;
}
}
// Store the shapes of the kernels and output as well as the chosen
// algorithm for future use.
APPLY_SPECIFIC(previous_bwd_d_algo) = chosen_algo;
for (int i = 0; i < nb_dim; i++)
{
APPLY_SPECIFIC(previous_kerns_shape)[i] =
CudaNdarray_HOST_DIMS(kerns)[i];
APPLY_SPECIFIC(previous_output_shape)[i] =
CudaNdarray_HOST_DIMS(output)[i];
}
}
else
{
// Reuse the previously chosen convolution implementation
chosen_algo = APPLY_SPECIFIC(previous_bwd_d_algo);
}
}
else
{
chosen_algo = CONV_ALGO;
}
// The FFT implementation (only in v3 and onward) does not support strides,
// 1x1 filters or inputs with a spatial dimension larger than 1024.
// If the chosen implementation is FFT, validate that it can be used
// on the current data and fall back to a safe implementation if it
// can't.
if (chosen_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT && nb_dim == 4)
{
// Extract the properties of the convolution descriptor
int pad_h, pad_w, stride_v, stride_h, upscale_x, upscale_y;
cudnnConvolutionMode_t mode;
err = cudnnGetConvolution2dDescriptor(desc, &pad_h, &pad_w,
&stride_v, &stride_h,
&upscale_x, &upscale_y,
&mode);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConvGradI: error getting convolution properties: %s",
cudnnGetErrorString(err));
return 1;
}
// Extract the spatial size of the filters
int filter_h = CudaNdarray_HOST_DIMS(kerns)[2];
int filter_w = CudaNdarray_HOST_DIMS(kerns)[3];
// Extract the spatial size of the input
int input_h = CudaNdarray_HOST_DIMS(*input)[2];
int input_w = CudaNdarray_HOST_DIMS(*input)[3];
// Ensure that the selected implementation supports the requested
// convolution. Fall back to a safe implementation otherwise.
if (stride_v != 1 || stride_h != 1 || input_h > 1024 ||
input_w > 1024 || (filter_h == 1 && filter_w == 1))
{
chosen_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
}
}
// Infer required workspace size from the chosen implementation
err = cudnnGetConvolutionBackwardDataWorkspaceSize(_handle,
APPLY_SPECIFIC(kerns),
APPLY_SPECIFIC(output),
desc,
APPLY_SPECIFIC(input),
chosen_algo,
&worksize);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConvGradI: error getting worksize: %s",
cudnnGetErrorString(err));
return 1;
}
// Allocate workspace for the convolution
workspace = get_work_mem(worksize);
if (workspace == NULL && worksize != 0)
return 1;
// Perform the convolution
err = cudnnConvolutionBackwardData_v3(
_handle,
(void *)&alpha,
APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(output),
desc,
chosen_algo,
workspace, worksize,
(void *)&beta,
APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(*input));
}
#else
err = cudnnConvolutionBackwardData(
_handle,
(void *)&alpha,
@@ -39,6 +239,8 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
desc,
(void *)&beta,
APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(*input));
#endif
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradI: error doing operation: %s",
cudnnGetErrorString(err));
#section support_code_struct
int
int
APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
CudaNdarray *km, cudnnConvolutionDescriptor_t desc,
float alpha, float beta, CudaNdarray **kerns) {
@@ -8,29 +8,230 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
if (CudaNdarray_HOST_DIMS(input)[1] != CudaNdarray_HOST_DIMS(km)[1]) {
PyErr_SetString(PyExc_ValueError,
"GpuDnnConv images and kernel must have the same stack size\n");
"GpuDnnConv images and kernel must have the same stack size\n");
return 1;
}
if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1)
if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
return 1;
if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1)
if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
return 1;
int nb_dim = CudaNdarray_NDIM(output);
#ifdef CONV_INPLACE
Py_XDECREF(*kerns);
*kerns = km;
Py_INCREF(*kerns);
#else
if (CudaNdarray_prep_output(kerns, 4, CudaNdarray_HOST_DIMS(km)) != 0)
if (CudaNdarray_prep_output(kerns, nb_dim, CudaNdarray_HOST_DIMS(km)) != 0)
return 1;
if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*kerns, km))
return 1;
#endif
if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1)
if (c_set_filterNd(*kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;
#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 3000
{
size_t worksize;
void *workspace;
cudnnConvolutionBwdFilterAlgo_t chosen_algo;
if (CHOOSE_ALGO)
{
// A new convolution implementation should be selected, based either on
// timing or heuristics, in one of the two following cases:
// - The implementation should only be chosen during the first execution
// of an apply node and this is the first execution of the apply node.
// - The implementation should be chosen as often as necessary and the
// shapes of the inputs differ from the last time an implementation
// was chosen.
bool reuse_previous_algo;
if (CHOOSE_ALGO_ONCE)
{
// Only choose a new implementation if none has been chosen before.
reuse_previous_algo = APPLY_SPECIFIC(previous_algo_set);
}
else
{
// Reuse the previous implementation if the inputs and the outputs
// have the same shapes as they had when the previous implementation
// was selected
bool same_shapes = true;
for (int i = 0; (i < nb_dim) && same_shapes; i++)
{
same_shapes &= (CudaNdarray_HOST_DIMS(input)[i] ==
APPLY_SPECIFIC(previous_input_shape)[i]);
same_shapes &= (CudaNdarray_HOST_DIMS(output)[i] ==
APPLY_SPECIFIC(previous_output_shape)[i]);
}
reuse_previous_algo = same_shapes;
}
// If the previously chosen implementation can't be reused, select a
// new one based on the shapes of the current inputs
if (!reuse_previous_algo)
{
// Obtain a convolution algorithm appropriate for the input and output
// shapes. Either by choosing one according to heuristics or by making
// CuDNN time every implementation and choose the best one.
if (CHOOSE_ALGO_TIME)
{
// Time the different implementations to choose the best one
int requestedCount = 1;
int count;
cudnnConvolutionBwdFilterAlgoPerf_t chosen_algo_perf;
err = cudnnFindConvolutionBackwardFilterAlgorithm(_handle,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(output),
desc,
APPLY_SPECIFIC(kerns),
requestedCount,
&count,
&chosen_algo_perf);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConvGradW: error selecting convolution algo: "
"%s", cudnnGetErrorString(err));
return 1;
}
chosen_algo = chosen_algo_perf.algo;
}
else
{
// Choose the convolution implementation using heuristics based on the
// shapes of the inputs and the amount of memory available.
// Get the amount of available memory
size_t free = 0, total = 0;
cudaError_t err2 = cudaMemGetInfo(&free, &total);
if (err2 != cudaSuccess){
cudaGetLastError();
fprintf(stderr,
"Error when trying to find the memory information"
" on the GPU: %s\n", cudaGetErrorString(err2));
return 1;
}
// Use heuristics to choose the implementation
err = cudnnGetConvolutionBackwardFilterAlgorithm(_handle,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(output),
desc,
APPLY_SPECIFIC(kerns),
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
free,
&chosen_algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConvGradW: error selecting convolution algo: %s",
cudnnGetErrorString(err));
return 1;
}
}
// Store the shapes of the inputs and kernels as well as the chosen
// algorithm for future use.
APPLY_SPECIFIC(previous_bwd_f_algo) = chosen_algo;
for (int i = 0; i < nb_dim; i++)
{
APPLY_SPECIFIC(previous_input_shape)[i] =
CudaNdarray_HOST_DIMS(input)[i];
APPLY_SPECIFIC(previous_output_shape)[i] =
CudaNdarray_HOST_DIMS(output)[i];
}
}
else
{
// Reuse the previously chosen convolution implementation
chosen_algo = APPLY_SPECIFIC(previous_bwd_f_algo);
}
}
else
{
chosen_algo = CONV_ALGO;
}
// The FFT implementation (only in v3 and onward) does not support strides,
// 1x1 filters or inputs with a spatial dimension larger than 1024.
// If the chosen implementation is FFT, validate that it can be used
// on the current data and fall back to a safe implementation if it
// can't.
if (chosen_algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT && nb_dim == 4)
{
// Extract the properties of the convolution descriptor
int pad_h, pad_w, stride_v, stride_h, upscale_x, upscale_y;
cudnnConvolutionMode_t mode;
err = cudnnGetConvolution2dDescriptor(desc, &pad_h, &pad_w,
&stride_v, &stride_h,
&upscale_x, &upscale_y,
&mode);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConvGradW: error getting convolution properties: %s",
cudnnGetErrorString(err));
return 1;
}
// Extract the spatial size of the filters
int filter_h = CudaNdarray_HOST_DIMS(*kerns)[2];
int filter_w = CudaNdarray_HOST_DIMS(*kerns)[3];
// Extract the spatial size of the input
int input_h = CudaNdarray_HOST_DIMS(input)[2];
int input_w = CudaNdarray_HOST_DIMS(input)[3];
// Ensure that the selected implementation supports the requested
// convolution. Fall back to a safe implementation otherwise.
if (stride_v != 1 || stride_h != 1 || input_h > 1024 ||
input_w > 1024 || (filter_h == 1 && filter_w == 1))
{
chosen_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
}
}
// Infer required workspace size from the chosen implementation
err = cudnnGetConvolutionBackwardFilterWorkspaceSize(_handle,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(output),
desc,
APPLY_SPECIFIC(kerns),
chosen_algo,
&worksize);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConvGradW: error getting worksize: %s",
cudnnGetErrorString(err));
return 1;
}
// Allocate workspace for the convolution
workspace = get_work_mem(worksize);
if (workspace == NULL && worksize != 0)
return 1;
// Perform the convolution
err = cudnnConvolutionBackwardFilter_v3(
_handle,
(void *)&alpha,
APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(input),
APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(output),
desc,
chosen_algo,
workspace, worksize,
(void *)&beta,
APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(*kerns));
}
#else
err = cudnnConvolutionBackwardFilter(
_handle,
(void *)&alpha,
@@ -39,6 +240,8 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
desc,
(void *)&beta,
APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(*kerns));
#endif
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradW: error doing operation: %s",
cudnnGetErrorString(err));
@@ -452,7 +452,8 @@ def test_default_conv():
for a in f.maker.fgraph.apply_nodes])
def _test_full(cls, mode=None, version=[-1], extra_shapes=[]):
def _test_full(cls, mode=None, version=[-1], extra_shapes=[],
test_bigger_kernels=True):
seed_rng()
shapes = get_basic_shapes()
shapes += get_shapes2()
@@ -481,14 +482,18 @@ def _test_full(cls, mode=None, version=[-1], extra_shapes=[]):
, ((16, 5, 64, 64), (8, 5, 8, 8), (1, 1), (1, 1), (1, 1)) # a big one
, ((16, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)) # MNIST LeNET layer 1
, ((20, 16, 32, 32), (1, 16, 28, 28), (1, 1), (1, 1), (1, 1)) # layer 1 backprop to weights
]
# other test
, ((3, 1, 1, 1), (2, 1, 5, 3), (1, 1), (1, 1), (1, 1)) # kernel bigger then image
if test_bigger_kernels:
# Shapes where the kernel is larger than the image in some dimension
shapes += [
((3, 1, 1, 1), (2, 1, 5, 3), (1, 1), (1, 1), (1, 1))
, ((3, 2, 1, 1), (4, 2, 1, 1), (1, 1), (1, 1), (1, 1))
, ((3, 2, 4, 4), (4, 2, 2, 6), (1, 1), (1, 1), (1, 1))
, ((3, 2, 4, 4), (4, 2, 8, 6), (1, 1), (1, 1), (1, 1)) # kernel bigger then image
, ((3, 2, 4, 4), (4, 2, 8, 6), (1, 1), (1, 1), (1, 1))
, ((4, 2, 10, 10), (3, 2, 2, 12), (1, 1), (1, 1), (1, 1))
]
shapes += [
# ((60,1,28,28),(20,1,5,5), (1, 1), (1, 1), (1, 1))#test_lenet_28 1 layers
# , ((60,20,12,12),(30,20,5,5), (1, 1), (1, 1), (1, 1))#test_lenet_28 2 layers
......@@ -516,9 +521,16 @@ def _test_full(cls, mode=None, version=[-1], extra_shapes=[]):
def test_full():
for t in _test_full(None,
mode=theano_mode,
version=[-1]):
# If using CuDNN version before v3, only run the tests where the
# kernels are not larger than the input in any spatial dimension.
if cuda.dnn.dnn_available() and cuda.dnn.version() < (3000, 3000):
test_bigger_kernels = False
else:
test_bigger_kernels = True
for t in _test_full(None, mode=theano_mode, version=[-1],
test_bigger_kernels=test_bigger_kernels):
yield t
@@ -531,7 +543,16 @@ def test_gemm_full():
def test_dnn_full():
if not cuda.dnn.dnn_available():
raise SkipTest(cuda.dnn.dnn_available.msg)
for t in _test_full(DnnBase, mode=theano_mode.including("cudnn")):
# If using CuDNN version before v3, only run the tests where the
# kernels are not larger than the input in any spatial dimension.
if cuda.dnn.version() < (3000, 3000):
test_bigger_kernels = False
else:
test_bigger_kernels = True
for t in _test_full(DnnBase, mode=theano_mode.including("cudnn"),
test_bigger_kernels=test_bigger_kernels):
yield t
@@ -2,7 +2,7 @@ import logging
from nose.plugins.skip import SkipTest
import numpy
from itertools import product
from itertools import chain, product
import theano
from six import StringIO
@@ -13,6 +13,7 @@ from theano.tensor.signal.downsample import max_pool_2d
from theano.tensor.signal.downsample import DownsampleFactorMaxGrad
import theano.sandbox.cuda.dnn as dnn
from theano.sandbox.cuda.basic_ops import GpuAllocEmpty, gpu_alloc_empty
from theano.sandbox.cuda import float32_shared_constructor as shared
# Skip test if cuda_ndarray is not available.
import theano.sandbox.cuda as cuda
@@ -160,8 +161,28 @@ def test_dnn_conv_inplace():
assert len([n for n in topo if isinstance(n.op, GpuAllocEmpty)]) == 2
def pool_2d_i2n(input, ds=(2, 2), strides=None,
pad=(0, 0),
def pool3d2d(input, ds=(2, 2, 2), strides=None, pad=(0, 0, 0),
pool_func=T.max, mode='ignore_borders'):
if strides is None:
strides = ds
shape = input.shape
# reshape to B, C*0, 1, 2 and do the pooling on 1, 2
first = input.reshape((shape[0], shape[1] * shape[2], shape[3], shape[4]))
pooled1 = pool_2d_i2n(first, ds=ds[1:], strides=strides[1:], pad=pad[1:],
pool_function=pool_func, mode=mode)
shp1 = pooled1.shape
# reshape to B, C, 0, 1*2 and do the pooling on 0
second = pooled1.reshape((shape[0], shape[1], shape[2], shp1[2] * shp1[3]))
pooled2 = pool_2d_i2n(second, ds=(ds[0], 1), strides=(strides[0], 1),
pad=(pad[0], 0), pool_function=pool_func, mode=mode)
shp2 = pooled2.shape
return pooled2.reshape((shape[0], shape[1], shp2[2], shp1[2], shp1[3]))
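Because a maximum over a 3d window factors into successive maxima over each axis, the decomposition used by `pool3d2d` can be cross-checked against a direct numpy implementation. A sketch assuming ignore-borders semantics and no padding (the helper names are invented):

```python
import numpy as np

def max_pool_1axis(x, ws, stride, axis):
    # Pool a single axis with window ws and the given stride,
    # discarding any incomplete trailing window (ignore borders).
    n = (x.shape[axis] - ws) // stride + 1
    slices = [np.take(x, range(i * stride, i * stride + ws),
                      axis=axis).max(axis=axis)
              for i in range(n)]
    return np.stack(slices, axis=axis)

def max_pool_3d(x, ds, strides):
    # Direct 3d max pooling of a (b, c, d0, d1, d2) array: because max is
    # separable, pooling each of the three trailing axes in turn equals
    # pooling over the full 3d window.
    for k, axis in enumerate((2, 3, 4)):
        x = max_pool_1axis(x, ds[k], strides[k], axis)
    return x
```

This separability only holds for max (and for mean without padding); it is the same property `pool3d2d` exploits by reusing `pool_2d_i2n` twice.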
def pool_2d_i2n(input, ds=(2, 2), strides=None, pad=(0, 0),
pool_function=T.max, mode='ignore_borders'):
if strides is None:
strides = ds
@@ -301,6 +322,111 @@ def test_pooling():
assert numpy.allclose(c_out, g_out)
def test_pooling3d():
# CuDNN 3d pooling requires CuDNN v3. Don't test if the CuDNN version is
# too old.
if not cuda.dnn.dnn_available() or cuda.dnn.version() < (3000, 3000):
raise SkipTest(cuda.dnn.dnn_available.msg)
x = T.TensorType(broadcastable=(False, False, False, False, False),
dtype='float32')()
for mode, pad in product(('max', 'average_inc_pad', 'average_exc_pad'),
((0, 0, 0), (1, 0, 0), (0, 1, 0), (0, 0, 1),
(2, 3, 2), (3, 2, 2), (2, 2, 3))):
if mode == 'max':
func = T.max
else:
func = T.mean
if pad != (0, 0, 0) and cuda.dnn.version() == -1:
continue
if pad != (0, 0, 0) and func is T.mean:
continue
for ws in (4, 2, 5):
for stride in (2, 3):
if stride > ws:
continue
if pad[0] > stride or pad[1] > stride or pad[2] > stride:
# Not implemented
continue
out1 = cuda.dnn.dnn_pool(x, (ws, ws, ws),
stride=(stride, stride, stride),
pad=pad, mode=mode)
out2 = pool3d2d(x, ds=(ws, ws, ws),
strides=(stride, stride, stride),
pad=pad, pool_func=func)
# For max pooling pool3d2d explicitly pads the input with
# -inf. Because of this, the compilation mode for the function
# that uses pool3d2d should not check for infinite values or
# it will falsely believe there is an error in the graph.
mode_without_gpu2 = mode_without_gpu.including()
mode_without_gpu2.check_isfinite = False
f1 = theano.function([x], out1, mode=mode_with_gpu)
assert any([isinstance(node.op, cuda.dnn.GpuDnnPool)
for node in f1.maker.fgraph.apply_nodes])
f2 = theano.function([x], out2, mode=mode_without_gpu2)
assert not any([isinstance(node.op, cuda.dnn.GpuDnnPool)
for node in f2.maker.fgraph.apply_nodes])
for shp in [(1, 10, 100, 100, 100),
(1, 3, 99, 99, 99),
(32, 1, 147, 197, 37),
]:
data = numpy.random.normal(0, 1, shp).astype("float32")
a = f1(data).__array__()
b = f2(data).__array__()
utt.assert_allclose(a, b,
atol=numpy.finfo(numpy.float32).eps)
# Test the grad
for shp in [(1, 1, 2, 2, 2),
(1, 1, 3, 3, 3),
(1, 1, 3, 3, 4),
(1, 1, 3, 4, 3),
(1, 1, 4, 3, 3),
(1, 1, 4, 4, 4),
(1, 1, 5, 5, 5)]:
data = numpy.random.normal(0, 1, shp).astype("float32") * 10
ws = 2
stride = 2
if pad[0] > stride or pad[1] > stride or pad[2] > stride:
# Not implemented
continue
# Test the GPU grad + GPU implementation
def fn(x):
dnn_op = cuda.dnn.dnn_pool(
x, ws=(ws, ws, ws),
stride=(stride, stride, stride),
pad=pad,
mode=mode)
return dnn_op
theano.tests.unittest_tools.verify_grad(
fn, [data],
cast_to_output_type=False,
mode=mode_with_gpu)
# Confirm that we get the good op.
fg = theano.function([x], theano.grad(fn(x).sum(), x),
mode=mode_with_gpu)
assert any([isinstance(node.op, cuda.dnn.GpuDnnPoolGrad)
for node in fg.maker.fgraph.toposort()])
g_out = fg(data)
# Compare again the CPU result
out = pool3d2d(x, (ws, ws, ws),
strides=(stride, stride, stride),
pad=pad, pool_func=func)
fc = theano.function([x], theano.grad(out.sum(), x),
mode=mode_without_gpu)
c_out = fc(data)
assert numpy.allclose(c_out, g_out)
def test_pooling_opt():
if not cuda.dnn.dnn_available():
raise SkipTest(cuda.dnn.dnn_available.msg)
@@ -324,6 +450,45 @@ def test_pooling_opt():
for n in f.maker.fgraph.toposort()])
def test_log_softmax():
# This is a test for an optimization that depends on CuDNN v3 or
# more recent. Don't test if the CuDNN version is too old.
if not cuda.dnn.dnn_available() or cuda.dnn.version() < (3000, 3000):
raise SkipTest(cuda.dnn.dnn_available.msg)
x = T.ftensor4()
softmax_out = dnn.GpuDnnSoftmax('bc01', 'accurate', 'channel')(x)
log_out = T.log(T.as_tensor_variable(softmax_out))
f = theano.function([x], log_out, mode=mode_with_gpu)
# Ensure that the optimization has been applied
dnn_softmax_nodes = [n for n in f.maker.fgraph.toposort() if
isinstance(n.op, cuda.dnn.GpuDnnSoftmax)]
assert len(dnn_softmax_nodes) == 1
assert dnn_softmax_nodes[0].op.algo == "log"
# Ensure that the output of the function is valid
input_shapes = [(3, 4, 5, 6),
(1025, 2, 3, 4),
(2, 1025, 3, 4),
(2, 3, 1025, 4),
(2, 3, 4, 1025),
(66000, 2, 3, 4),
(2, 66000, 3, 4),
(2, 3, 66000, 4),
(2, 3, 4, 66000)]
for inp_shape in input_shapes:
input_val = numpy.random.normal(0, 1, inp_shape).astype("float32")
out = f(input_val)
expected_out = numpy.log(numpy.exp(input_val) /
numpy.exp(input_val).sum(1)[:, None, :, :])
utt.assert_allclose(out, expected_out)
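The reference computation in the test above exponentiates directly, which can overflow in float32 for large activations. A numerically stable channel-wise log-softmax (a sketch for comparison, not the code under test) subtracts the per-position maximum before exponentiating:

```python
import numpy as np

def log_softmax_channel(x):
    # Log-softmax over axis 1 (the channel axis), computed stably:
    # subtracting the max leaves the result unchanged but keeps
    # exp() arguments <= 0, so it cannot overflow.
    m = x.max(axis=1, keepdims=True)
    z = x - m
    return z - np.log(np.exp(z).sum(axis=1, keepdims=True))
```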
def test_dnn_tag():
"""
Test that if cudnn isn't available we raise an error, and that if it is available, we use it.
@@ -432,6 +597,45 @@ class TestDnnInferShapes(utt.InferShapeTester):
dnn.GpuDnnConv
)
def test_conv3d(self):
if not (cuda.dnn.dnn_available() and dnn.version() >= (2000, 2000)):
raise SkipTest('CuDNN 3D convolution requires CuDNN v2')
ftensor5 = T.TensorType(dtype="float32", broadcastable=(False,) * 5)
img = ftensor5('img')
kerns = ftensor5('kerns')
out = ftensor5('out')
img_val = numpy.asarray(
numpy.random.rand(7, 2, 6, 4, 11),
dtype='float32'
)
kern_vals = numpy.asarray(
numpy.random.rand(8, 2, 4, 3, 1),
dtype='float32'
)
for params in product(
['valid', 'full'],
[(1, 1, 1), (2, 2, 2)],
['conv', 'cross']
):
out_vals = numpy.zeros(
dnn.GpuDnnConv3d.get_out_shape(img_val.shape, kern_vals.shape,
border_mode=params[0],
subsample=params[1]),
dtype='float32')
desc = dnn.GpuDnnConvDesc(
border_mode=params[0],
subsample=params[1],
conv_mode=params[2]
)(img.shape, kerns.shape)
conv = dnn.GpuDnnConv3d()(img, kerns, out, desc)
self._compile_and_check(
[img, kerns, out],
[conv],
[img_val, kern_vals, out_vals],
dnn.GpuDnnConv3d
)
def test_conv_gradw(self):
if not dnn.dnn_available():
raise SkipTest(dnn.dnn_available.msg)
@@ -481,6 +685,51 @@ class TestDnnInferShapes(utt.InferShapeTester):
dnn.GpuDnnConvGradW
)
def test_conv3d_gradw(self):
if not (cuda.dnn.dnn_available() and dnn.version() >= (2000, 2000)):
raise SkipTest('CuDNN 3D convolution requires CuDNN v2')
ftensor5 = T.TensorType(dtype="float32", broadcastable=(False,) * 5)
img = ftensor5('img')
kerns = ftensor5('kerns')
out = ftensor5('out')
img_val = numpy.asarray(
numpy.random.rand(9, 2, 4, 8, 7),
dtype='float32'
)
kern_vals = numpy.asarray(
numpy.random.rand(11, 2, 3, 1, 4),
dtype='float32'
)
for params in product(
['valid', 'full'],
[(1, 1, 1), (2, 2, 2)],
['conv', 'cross']
):
out_vals = numpy.zeros(
dnn.GpuDnnConv3d.get_out_shape(img_val.shape, kern_vals.shape,
border_mode=params[0],
subsample=params[1]),
dtype='float32')
desc = dnn.GpuDnnConvDesc(
border_mode=params[0],
subsample=params[1],
conv_mode=params[2]
)(img.shape, out.shape)
conv_grad_w = dnn.GpuDnnConv3dGradW()(
img,
out,
kerns,
desc,
)
self._compile_and_check(
[img, out, kerns],
[conv_grad_w],
[img_val, out_vals, kern_vals],
dnn.GpuDnnConv3dGradW
)
def test_conv_gradi(self):
if not dnn.dnn_available():
raise SkipTest(dnn.dnn_available.msg)
@@ -492,7 +741,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
dtype='float32'
)
kern_vals = numpy.asarray(
numpy.random.rand(13, 14, 15, 16),
numpy.random.rand(4, 14, 15, 16),
dtype='float32'
)
@@ -526,6 +775,51 @@ class TestDnnInferShapes(utt.InferShapeTester):
dnn.GpuDnnConvGradI
)
def test_conv3d_gradi(self):
if not (cuda.dnn.dnn_available() and dnn.version() >= (2000, 2000)):
raise SkipTest('CuDNN 3D convolution requires CuDNN v2')
ftensor5 = T.TensorType(dtype="float32", broadcastable=(False,) * 5)
img = ftensor5('img')
kerns = ftensor5('kerns')
out = ftensor5('out')
img_val = numpy.asarray(
numpy.random.rand(8, 4, 6, 7, 5),
dtype='float32'
)
kern_vals = numpy.asarray(
numpy.random.rand(9, 4, 5, 1, 2),
dtype='float32'
)
for params in product(
['valid', 'full'],
[(1, 1, 1), (2, 2, 2)],
['conv', 'cross']
):
out_vals = numpy.zeros(
dnn.GpuDnnConv3d.get_out_shape(img_val.shape, kern_vals.shape,
border_mode=params[0],
subsample=params[1]),
dtype='float32')
desc = dnn.GpuDnnConvDesc(
border_mode=params[0],
subsample=params[1],
conv_mode=params[2]
)(img.shape, kerns.shape)
conv_grad_i = dnn.GpuDnnConv3dGradI()(
kerns,
out,
img,
desc,
)
self._compile_and_check(
[kerns, out, img],
[conv_grad_i],
[kern_vals, out_vals, img_val],
dnn.GpuDnnConv3dGradI
)
def test_pool(self):
if not dnn.dnn_available():
raise SkipTest(dnn.dnn_available.msg)
@@ -763,6 +1057,202 @@ def test_dnn_conv_grad():
utt.verify_grad(dconvw, [img_val, kern_val, out_val])
def get_conv3d_test_cases():
# Every element of test_shapes follows the format
# [input_shape, filter_shape, subsample]
test_shapes = [[(128, 3, 5, 5, 5), (64, 3, 1, 2, 4), (1, 1, 1)],
[(8, 4, 20, 12, 15), (5, 4, 6, 12, 4), (2, 2, 2)],
[(8, 1, 20, 12, 15), (5, 1, 6, 12, 4), (3, 3, 3)],
[(8, 1, 20, 12, 15), (5, 1, 6, 12, 4), (3, 2, 1)],
# Test with 1x1x1 filters
[(8, 1, 10, 10, 10), (10, 1, 1, 1, 1), (1, 1, 1)],
# Test with dimensions larger than 1024 (thread block dim)
[(1025, 1, 2, 3, 4), (5, 1, 1, 2, 3), (1, 1, 1)],
[(8, 1, 2, 3, 4), (1025, 1, 1, 2, 3), (1, 1, 1)],
[(8, 1025, 2, 3, 4), (5, 1025, 1, 1, 2), (1, 1, 1)],
[(8, 1, 1030, 3, 4), (5, 1, 1025, 1, 1), (1, 1, 1)],
[(8, 1, 2, 1030, 4), (5, 1, 2, 1025, 1), (1, 1, 1)],
[(8, 1, 2, 3, 1030), (5, 1, 1, 2, 1025), (1, 1, 1)],
# The equivalent of this caused a crash with conv2d
[(1, 1, 1, 44800, 1), (6, 1, 1, 1, 1), (1, 1, 1)]]
# With border mode 'full', test with kernel bigger than image in some/all
# dimensions
test_shapes_full = [[(6, 2, 2, 2, 2), (4, 2, 3, 1, 1), (1, 1, 1)],
[(6, 2, 2, 2, 2), (4, 2, 1, 3, 1), (1, 1, 1)],
[(6, 2, 2, 2, 2), (4, 2, 1, 1, 3), (1, 1, 1)],
[(6, 2, 2, 2, 2), (4, 2, 5, 5, 5), (1, 1, 1)]]
border_modes = ['valid', 'full', (1, 2, 3), (3, 2, 1), 1, 2]
conv_modes = ['conv', 'cross']
if cuda.dnn.dnn_available() and dnn.version() >= (3000, 3000):
itt = chain(product(test_shapes, border_modes, conv_modes),
product(test_shapes_full, ['full'], conv_modes))
else:
# CuDNN, before V3, did not support kernels larger than the inputs,
# even if the original inputs were padded so they would be larger than
# the kernels. If using a version older than V3 don't run the tests
# with kernels larger than the unpadded inputs.
itt = product(test_shapes, border_modes, conv_modes)
return itt
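The shapes in `test_shapes` above are chosen so that every 'valid' case with subsampling still yields a positive output size. As a minimal sketch (the helper name is hypothetical, not part of the test suite), the expected 'valid' output extent per spatial dimension is `(i - k) // s + 1`:

```python
def expected_valid_out_shape(input_shape, filter_shape, subsample):
    # (batch, out_channels) followed by (i - k) // s + 1 per spatial dim
    return (input_shape[0], filter_shape[0]) + tuple(
        (i - k) // s + 1
        for i, k, s in zip(input_shape[2:], filter_shape[2:], subsample))

# Second entry of test_shapes: (8, 4, 20, 12, 15) filtered by
# (5, 4, 6, 12, 4) with subsample (2, 2, 2)
out_shape = expected_valid_out_shape((8, 4, 20, 12, 15),
                                     (5, 4, 6, 12, 4),
                                     (2, 2, 2))
```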
def test_conv3d_fwd():
if not (cuda.dnn.dnn_available() and dnn.version() >= (2000, 2000)):
raise SkipTest('CuDNN 3D convolution requires CuDNN v2')
def run_conv3d_fwd(inputs_shape, filters_shape, subsample,
border_mode, conv_mode):
inputs_val = numpy.random.random(inputs_shape).astype('float32')
filters_val = numpy.random.random(filters_shape).astype('float32')
# Scale down the input values to prevent very large absolute errors
# due to float rounding
inputs_val /= 10
filters_val /= 10
inputs = shared(inputs_val)
filters = shared(filters_val)
bias = shared(numpy.zeros(filters_shape[0]).astype('float32'))
# Compile a theano function for the CuDNN implementation
conv = dnn.dnn_conv3d(img=inputs, kerns=filters,
border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)
f = theano.function([], conv, mode=mode_with_gpu)
# If conv_mode is 'conv' the reference implementation should use
# filters flipped along the width, height and time axes
if conv_mode == 'conv':
flipped_filters = filters[:, :, ::-1, ::-1, ::-1]
else:
flipped_filters = filters
# If border mode is anything but 'valid', the reference implementation
# should operate on padded inputs
if border_mode == 'valid':
padded_inputs = inputs
else:
if border_mode == 'full':
pad_per_dim = [filters_shape[i] - 1 for i in range(2, 5)]
else:
if isinstance(border_mode, int):
pad_per_dim = [border_mode] * 3
else:
pad_per_dim = border_mode
pad_before_after = ([(0, 0), (0, 0)] +
[(p, p) for p in pad_per_dim])
padded_inputs_val = numpy.pad(inputs_val, pad_before_after,
'constant')
padded_inputs = shared(padded_inputs_val)
# Compile a theano function for the reference implementation
conv_ref = theano.tensor.nnet.conv3D(
V=padded_inputs.dimshuffle(0, 2, 3, 4, 1),
W=flipped_filters.dimshuffle(0, 2, 3, 4, 1),
b=bias, d=subsample)
f_ref = theano.function([], conv_ref.dimshuffle(0, 4, 1, 2, 3))
# Compare the results of the two implementations
res_ref = f_ref()
res = f()
utt.assert_allclose(res_ref, res)
test_cases = get_conv3d_test_cases()
for (i_shape, f_shape, subsample), border_mode, conv_mode in test_cases:
yield (run_conv3d_fwd, i_shape, f_shape, subsample, border_mode,
conv_mode)
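The flipping done in the reference path above rests on the standard identity that true convolution equals cross-correlation with a kernel reversed along every spatial axis. A numpy-only 1D sketch (illustrative, not part of the test suite):

```python
import numpy

x = numpy.arange(6, dtype='float32')
w = numpy.array([1., 2., 3.], dtype='float32')

# Cross-correlation in 'valid' mode: slide w over x without flipping it
corr = numpy.array([numpy.dot(x[i:i + 3], w) for i in range(len(x) - 2)])

# numpy.convolve implements true convolution, which flips its kernel;
# convolving with the pre-flipped kernel therefore recovers the correlation
conv = numpy.convolve(x, w[::-1], mode='valid')
```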
def test_conv3d_bwd():
if not (cuda.dnn.dnn_available() and dnn.version() >= (2000, 2000)):
raise SkipTest('CuDNN 3D convolution requires CuDNN v2')
def run_conv3d_bwd(inputs_shape, filters_shape, subsample,
border_mode, conv_mode):
inputs_val = numpy.random.random(inputs_shape).astype('float32')
filters_val = numpy.random.random(filters_shape).astype('float32')
inputs = shared(inputs_val)
filters = shared(filters_val)
bias = shared(numpy.zeros(filters_shape[0]).astype('float32'))
# Compile a theano function for the CuDNN implementation
conv = dnn.dnn_conv3d(img=inputs, kerns=filters,
border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)
grad_i, grad_w = theano.tensor.grad(conv.sum(), [inputs, filters])
f = theano.function([], [grad_i, grad_w], mode=mode_with_gpu)
# If conv_mode is 'conv' the reference implementation should use
# filters flipped along the width, height and time axes
if conv_mode == 'conv':
flipped_filters = filters[:, :, ::-1, ::-1, ::-1]
else:
flipped_filters = filters
# If border mode is anything but 'valid', the reference implementation
# should operate on padded inputs
if border_mode == 'valid':
padded_inputs = inputs
else:
if border_mode == 'full':
pad_per_dim = [filters_shape[i] - 1 for i in range(2, 5)]
else:
if isinstance(border_mode, int):
pad_per_dim = [border_mode] * 3
else:
pad_per_dim = border_mode
pad_before_after = ([(0, 0), (0, 0)] +
[(p, p) for p in pad_per_dim])
padded_inputs_val = numpy.pad(inputs_val, pad_before_after,
'constant')
padded_inputs = shared(padded_inputs_val)
# Compile a theano function for the reference implementation
conv_ref = theano.tensor.nnet.conv3D(
V=padded_inputs.dimshuffle(0, 2, 3, 4, 1),
W=flipped_filters.dimshuffle(0, 2, 3, 4, 1),
b=bias, d=subsample)
(grad_padded_i_ref,
grad_w_ref) = theano.tensor.grad(conv_ref.sum(),
[padded_inputs, filters])
# Recover grad_i_ref from grad_padded_i_ref
if border_mode == 'valid':
grad_i_ref = grad_padded_i_ref
else:
shp = grad_padded_i_ref.shape
grad_i_ref = grad_padded_i_ref[
:, :,
pad_per_dim[0]:shp[2] - pad_per_dim[0],
pad_per_dim[1]:shp[3] - pad_per_dim[1],
pad_per_dim[2]:shp[4] - pad_per_dim[2]]
f_ref = theano.function([], [grad_i_ref, grad_w_ref])
# Compare the results of the two implementations
res_ref = f_ref()
res = f()
utt.assert_allclose(res_ref[0], res[0])
utt.assert_allclose(res_ref[1], res[1])
test_cases = get_conv3d_test_cases()
for (i_shape, f_shape, subsample), border_mode, conv_mode in test_cases:
yield (run_conv3d_bwd, i_shape, f_shape, subsample, border_mode,
conv_mode)
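Both reference paths above emulate border_mode='full' by zero-padding each spatial dimension with (kernel_size - 1) on each side before a 'valid' convolution. A numpy-only 1D sketch of that equivalence (illustrative, not part of the test suite):

```python
import numpy

numpy.random.seed(0)
x = numpy.random.rand(7).astype('float32')
w = numpy.random.rand(3).astype('float32')

# Zero-pad each side by (kernel_size - 1), then run a 'valid' convolution
padded = numpy.pad(x, (len(w) - 1, len(w) - 1), 'constant')
full = numpy.convolve(x, w, mode='full')
valid_on_padded = numpy.convolve(padded, w, mode='valid')
```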
def test_version():
if not cuda.dnn.dnn_available():
raise SkipTest(cuda.dnn.dnn_available.msg)