Merge pull request #3356 from abergeron/gpuarray_cudnnv3

cuDNN v3 support for gpuarray

Merge pull request #3356 from abergeron/gpuarray_cudnnv3
1ef9be9d · Pascal Lamblin · e617dc50 · 642446c5 · 1ef9be9d · 1ef9be9d
--- a/theano/sandbox/cuda/dnn.py
+++ b/theano/sandbox/cuda/dnn.py
@@ -10,7 +10,6 @@ from theano.gof import Optimizer, local_optimizer, COp
 from theano.gof.type import CDataType, Generic
 from theano.compile import optdb
 from theano.compile.ops import shape_i
-from theano.configparser import AddConfigVar, EnumStr
 from theano.tensor.nnet import SoftmaxGrad
 from theano.tensor.signal.downsample import (
    DownsampleFactorMax, MaxPoolGrad, AveragePoolGrad)
@@ -28,6 +27,8 @@ from theano.sandbox.cuda import gpu_seqopt, register_opt

 from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler

+import theano.sandbox.dnn_flags
+

 def dnn_available():
    if dnn_available.avail is None:
@@ -62,8 +63,8 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
            # exclusive mode, this cause bad detection.
            comp, out, err = NVCC_compiler.try_flags(
                ["-l", "cudnn", "-I" + os.path.dirname(__file__),
-                 "-I" + os.path.join(theano.config.cuda.root, 'include'),
-                 "-L" + os.path.join(theano.config.cuda.root, 'lib64')],
+                 "-I" + config.dnn.include_path,
+                 "-L" + config.dnn.library_path],
                preambule=preambule, body=body,
                try_run=False, output=True)

@@ -141,7 +142,6 @@ if (%(err)s != CUDNN_STATUS_SUCCESS) {
    %(fail)s
 }
 }
-
        """ % dict(var=var, err=err, desc=desc, fail=fail)


@@ -359,37 +359,9 @@ class GpuDnnConvDesc(GpuOp):
    def c_code_cache_version(self):
        return (2, version())

-
-AddConfigVar('dnn.conv.workmem',
-             "This flag is deprecated; use dnn.conv.algo_fwd.",
-             EnumStr(''),
-             in_c_key=False)
-
-AddConfigVar('dnn.conv.workmem_bwd',
-             "This flag is deprecated; use dnn.conv.algo_bwd.",
-             EnumStr(''),
-             in_c_key=False)
-
-AddConfigVar('dnn.conv.algo_fwd',
-             "Default implementation to use for CuDNN forward convolution.",
-             EnumStr('small', 'none', 'large', 'fft', 'guess_once',
-                     'guess_on_shape_change', 'time_once',
-                     'time_on_shape_change'),
-             in_c_key=False)
-
-AddConfigVar('dnn.conv.algo_bwd',
-             "Default implementation to use for CuDNN backward convolution.",
-             EnumStr('none', 'deterministic', 'fft', 'guess_once',
-                     'guess_on_shape_change', 'time_once',
-                     'time_on_shape_change'),
-             in_c_key=False)
-
-
 # scalar constants
 _zero = constant(numpy.asarray(0.0, dtype='float32'))
 _one = constant(numpy.asarray(1.0, dtype='float32'))
-_ifour = constant(numpy.asarray(4, dtype='int32'))
-_ifive = constant(numpy.asarray(5, dtype='int32'))


 def ensure_float(val, default, name):
@@ -406,20 +378,6 @@ def ensure_float(val, default, name):
    return val


-def ensure_int(val, default, name):
-    if val is None:
-        return default.clone()
-    if not isinstance(val, Variable):
-        val = constant(val)
-    if hasattr(val, 'ndim') and val.ndim == 0:
-        val = as_scalar(val)
-    if not isinstance(val.type, theano.scalar.Scalar):
-        raise TypeError("%s: expected a scalar value" % (name,))
-    if not val.type.dtype == 'int32':
-        raise TypeError("%s: type is not int32" % (name,))
-    return val
-
-
 class GpuDnnConv(DnnBase, COp):
    """
    The forward convolution.
@@ -1448,11 +1406,12 @@ class GpuDnnPool(DnnBase):
                or desc.type.ctype != 'cudnnPoolingDescriptor_t':
            raise TypeError('desc must be cudnnPoolingDescriptor_t')

-        dop = desc.owner.op
-        e_ndim = dop.get_ndim() + 2  # 4 or 5
+        if desc.owner is not None:
+            dop = desc.owner.op
+            e_ndim = dop.get_ndim() + 2  # 4 or 5

-        if img.type.ndim != e_ndim:
-            raise TypeError('img must be %dD tensor' % e_ndim)
+            if img.type.ndim != e_ndim:
+                raise TypeError('img must be %dD tensor' % e_ndim)

        return Apply(self, [img, desc], [img.type()])

@@ -1616,19 +1575,21 @@ class GpuDnnPoolGrad(DnnBase):
                or desc.type.ctype != 'cudnnPoolingDescriptor_t':
            raise TypeError('desc must be cudnnPoolingDescriptor_t')

-        nd = desc.owner.op.get_ndim() + 2  # 4 or 5
-
        inp = as_cuda_ndarray_variable(inp)
-        if inp.type.ndim != nd:
-            raise TypeError('inp must be %dD tensor' % (nd,))
-
        inp_grad = as_cuda_ndarray_variable(inp_grad)
-        if inp_grad.type.ndim != nd:
-            raise TypeError('inp_grad must be %dD tensor' % (nd,))
-
        out = as_cuda_ndarray_variable(out)
-        if out.type.ndim != nd:
-            raise TypeError('out must be %dD tensor' % (nd,))
+
+        if desc.owner is not None:
+            nd = desc.owner.op.get_ndim() + 2  # 4 or 5
+
+            if inp.type.ndim != nd:
+                raise TypeError('inp must be %dD tensor' % (nd,))
+
+            if inp_grad.type.ndim != nd:
+                raise TypeError('inp_grad must be %dD tensor' % (nd,))
+
+            if out.type.ndim != nd:
+                raise TypeError('out must be %dD tensor' % (nd,))

        return Apply(self, [inp, out, inp_grad, desc],
                     [inp.type()])
@@ -1819,7 +1780,7 @@ class GpuDnnSoftmaxBase(DnnBase):
    Parameters
    ----------
    tensor_format
-        Whether the data format is 'bc01' or 'b01c'.
+        Always set this to 'bc01'.
    algo
        'fast', 'accurate' or 'log' indicating whether, respectively, computations
 	should be optimized for speed, for accuracy, or if CuDNN should rather
@@ -1834,7 +1795,13 @@ class GpuDnnSoftmaxBase(DnnBase):
    __props__ = ('tensor_format', 'mode', 'algo')

    def __init__(self, tensor_format, algo, mode):
-        assert(tensor_format in ('bc01', 'b01c'))
+        if tensor_format != 'bc01':
+            raise ValueError(
+                "It was discovered that since December 2014, the "
+                "tensor_format parameter was ignored and the equivalent of "
+                "'bc01' is always used.  Since your code seems to be using "
+                "another value, this might have affected previous results "
+                "ran with this code.")
        DnnBase.__init__(self)
        self.tensor_format = tensor_format

@@ -1976,7 +1943,7 @@ class GpuDnnSoftmax(GpuDnnSoftmaxBase):
    Parameters
    ----------
    tensor_format
-        Whether the data format is 'bc01' or 'b01c'.
+        Always set to 'bc01'.
    algo
        'fast' or 'accurate' indicating whether computations should be
        optimized for speed or accuracy respectively.
@@ -2044,7 +2011,7 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
    Parameters
    ----------
    tensor_format
-        Whether the data format is 'bc01' or 'b01c'.
+        Always set to 'bc01'.
    algo
        'fast' or 'accurate' indicating whether computations should be
        optimized for speed or accuracy respectively.

--- a/theano/sandbox/dnn_flags.py
+++ b/theano/sandbox/dnn_flags.py
+"""
+This module contains the configuration flags for cudnn support.
+
+Those are shared between the cuda and gpuarray backend which is why
+they are in this file.
+"""
+import os.path
+
+from theano.configparser import AddConfigVar, EnumStr, StrParam
+from theano import config
+
+AddConfigVar('dnn.conv.workmem',
+             "This flag is deprecated; use dnn.conv.algo_fwd.",
+             EnumStr(''),
+             in_c_key=False)
+
+AddConfigVar('dnn.conv.workmem_bwd',
+             "This flag is deprecated; use dnn.conv.algo_bwd.",
+             EnumStr(''),
+             in_c_key=False)
+
+AddConfigVar('dnn.conv.algo_fwd',
+             "Default implementation to use for CuDNN forward convolution.",
+             EnumStr('small', 'none', 'large', 'fft', 'guess_once',
+                     'guess_on_shape_change', 'time_once',
+                     'time_on_shape_change'),
+             in_c_key=False)
+
+AddConfigVar('dnn.conv.algo_bwd',
+             "Default implementation to use for CuDNN backward convolution.",
+             EnumStr('none', 'deterministic', 'fft', 'guess_once',
+                     'guess_on_shape_change', 'time_once',
+                     'time_on_shape_change'),
+             in_c_key=False)
+
+AddConfigVar('dnn.include_path',
+             "Location of the cudnn header (defaults to the cuda root)",
+             StrParam(lambda: os.path.join(config.cuda.root, 'include')))
+
+# Directory handed to the compiler/linker as the cuDNN library search path
+# (used for "-L" and c_lib_dirs()); the default is built from config.cuda.root.
+AddConfigVar('dnn.library_path',
+             "Location of the cudnn library (defaults to the cuda root)",
+             StrParam(lambda: os.path.join(config.cuda.root, 'lib64')))
--- a/theano/sandbox/gpuarray/conv_desc.c
+++ b/theano/sandbox/gpuarray/conv_desc.c
+#section support_code_apply
+
+/*
+ * Create and configure a cuDNN N-d convolution descriptor.
+ *
+ * filt_shp: 1-d int64 array holding the filter shape; it must have
+ *           NB_DIMS + 2 entries (the spatial sizes start at index 2).
+ * desc:     output, receives the newly created descriptor.
+ *
+ * PAD_*, SUB_*, NB_DIMS, BORDER_MODE and CONV_MODE are macros supplied by
+ * the op (see GpuDnnConvDesc.get_op_params).  BORDER_MODE == 0 is the
+ * "full" mode, where the padding is derived from the filter shape.
+ *
+ * Returns 0 on success, -1 with a Python exception set on failure.
+ */
+int APPLY_SPECIFIC(conv_desc)(PyArrayObject *filt_shp,
+                              cudnnConvolutionDescriptor_t *desc) {
+  cudnnStatus_t err;
+  int pad[3] = {PAD_0, PAD_1, PAD_2};
+  int strides[3] = {SUB_0, SUB_1, SUB_2};
+  int upscale[3] = {1, 1, 1};
+
+  /* Validate the length first: the "full" mode code below indexes
+     filt_shp up to element NB_DIMS + 1. */
+  if (PyArray_DIM(filt_shp, 0) - 2 != NB_DIMS) {
+    PyErr_Format(PyExc_ValueError, "Filter shape has the wrong number of "
+                 "dimensions: expected %d, got %lld.", NB_DIMS + 2,
+                 (long long)PyArray_DIM(filt_shp, 0));
+    return -1;
+  }
+
+#if BORDER_MODE == 0
+  pad[0] = *(npy_int64 *)PyArray_GETPTR1(filt_shp, 2) - 1;
+  pad[1] = *(npy_int64 *)PyArray_GETPTR1(filt_shp, 3) - 1;
+#if NB_DIMS > 2
+  pad[2] = *(npy_int64 *)PyArray_GETPTR1(filt_shp, 4) - 1;
+#endif
+#endif
+
+  err = cudnnCreateConvolutionDescriptor(desc);
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_MemoryError, "could not allocate convolution "
+                 "descriptor: %s", cudnnGetErrorString(err));
+    return -1;
+  }
+
+  err = cudnnSetConvolutionNdDescriptor(*desc, NB_DIMS, pad, strides, upscale,
+                                        CONV_MODE);
+  if (err != CUDNN_STATUS_SUCCESS) {
+    /* NOTE(review): the descriptor stays allocated on this path; presumably
+       the output type's freefunc releases it -- confirm. */
+    PyErr_Format(PyExc_RuntimeError, "could not set convolution "
+                 "descriptor: %s", cudnnGetErrorString(err));
+    return -1;
+  }
+  return 0;
+}
--- a/theano/sandbox/gpuarray/cudnn_helper.h
+++ b/theano/sandbox/gpuarray/cudnn_helper.h
@@ -4,193 +4,109 @@
 #include <cudnn.h>

 #ifndef CUDNN_VERSION
-#include <assert.h>
-
-// Here we define the R2 API in terms of functions in the R1 interface
-// This is only for what we use

-static inline const char *cudnnGetErrorString(cudnnStatus_t err) {
-  switch (err) {
-  case CUDNN_STATUS_SUCCESS:
-    return "The operation completed successfully.";
-  case CUDNN_STATUS_NOT_INITIALIZED:
-    return "The handle was not initialized(Is your driver recent enought?).";
-  case CUDNN_STATUS_ALLOC_FAILED:
-    return "Ressource allocation failed inside the library.";
-  case CUDNN_STATUS_BAD_PARAM:
-    return "An incorrect value was passed in.";
-  case CUDNN_STATUS_ARCH_MISMATCH:
-    return "The current GPU does not support the required features (only cc 3.0+ are supported).";
-  case CUDNN_STATUS_MAPPING_ERROR:
-    return "An access to GPU memory space failed (probably due to a failure to bind texture).";
-  case CUDNN_STATUS_EXECUTION_FAILED:
-    return "A kernel failed to execute.";
-  case CUDNN_STATUS_INTERNAL_ERROR:
-    return "An internal cuDNN operation failed.";
-  case CUDNN_STATUS_NOT_SUPPORTED:
-    return "The combination of parameters is not currently supported.";
-  default:
-    return "Unknown error code.";
-  }
+#define CUDNN_VERSION -1
+static inline int cudnnGetVersion() {
+  return -1;
 }
+#endif

-// some macros to help support cudnn R1 while using R2 code.
-#define cudnnCreateTensorDescriptor cudnnCreateTensor4dDescriptor
-#define cudnnDestroyTensorDescriptor cudnnDestroyTensor4dDescriptor
-#define cudnnSetFilter4dDescriptor cudnnSetFilterDescriptor
-
-typedef cudnnTensor4dDescriptor_t cudnnTensorDescriptor_t;
+#include <assert.h>

-static inline cudnnStatus_t
-cudnnGetConvolution2dForwardOutputDim(
-  const cudnnConvolutionDescriptor_t convDesc,
-  const cudnnTensorDescriptor_t inputTensorDesc,
-  const cudnnFilterDescriptor_t filterDesc,
-  int *n,
-  int *c,
-  int *h,
-  int *w) {
-  return cudnnGetOutputTensor4dDim(convDesc, CUDNN_CONVOLUTION_FWD,
-				   n, c, h, w);
-}
+#if CUDNN_VERSION < 3000
+// Here we define the R3 API in terms of functions in the R2 interface
+// This is only for what we use

-typedef int cudnnConvolutionFwdAlgo_t;
-typedef int cudnnConvolutionFwdPreference_t;
+typedef int cudnnConvolutionBwdDataAlgo_t;

-#define CUDNN_CONVOLUTION_FWD_NO_WORKSPACE 0
+#define CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 0
+#define CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 1
+#define CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT 2

-static inline cudnnStatus_t
-cudnnGetConvolutionForwardAlgorithm(
+static cudnnStatus_t cudnnGetConvolutionBackwardDataWorkspaceSize(
  cudnnHandle_t handle,
-  const cudnnTensorDescriptor_t srcDesc,
  const cudnnFilterDescriptor_t filterDesc,
+  const cudnnTensorDescriptor_t diffDesc,
  const cudnnConvolutionDescriptor_t convDesc,
-  const cudnnTensorDescriptor_t destDesc,
-  cudnnConvolutionFwdPreference_t preference,
-  size_t memoryLimitInbytes,
-  cudnnConvolutionFwdAlgo_t *algo) {
-  *algo = 0;
-  return CUDNN_STATUS_SUCCESS;
-}
-
-static inline cudnnStatus_t
-cudnnGetConvolutionForwardWorkspaceSize(
- cudnnHandle_t handle,
- const cudnnTensorDescriptor_t srcDesc,
- const cudnnFilterDescriptor_t filterDesc,
- const cudnnConvolutionDescriptor_t convDesc,
- const cudnnTensor4dDescriptor_t destDesc,
- cudnnConvolutionFwdAlgo_t algo,
- size_t *sizeInBytes) {
+  const cudnnTensorDescriptor_t gradDesc,
+  cudnnConvolutionBwdDataAlgo_t algo,
+  size_t *sizeInBytes) {
  *sizeInBytes = 0;
  return CUDNN_STATUS_SUCCESS;
 }

-
-static inline cudnnStatus_t
-cudnnConvolutionForward_v2(
+static cudnnStatus_t cudnnConvolutionBackwardData_v3(
  cudnnHandle_t handle,
  const void *alpha,
-  const cudnnTensorDescriptor_t srcDesc,
-  const void *srcData,
  const cudnnFilterDescriptor_t filterDesc,
  const void *filterData,
+  const cudnnTensorDescriptor_t diffDesc,
+  const void *diffData,
  const cudnnConvolutionDescriptor_t convDesc,
-  cudnnConvolutionFwdAlgo_t algo,
-  void *workSpace,
-  size_t workSpaceSizeInBytes,
+  cudnnConvolutionBwdDataAlgo_t algo,
+  void *workspace,
+  size_t workspaceSizeInBytes,
  const void *beta,
-  const cudnnTensorDescriptor_t destDesc,
-  void *destData) {
-  assert(*(float *)alpha == 1.0);
-  cudnnAccumulateResult_t r;
-  if (*(float *)beta == 0.0) {
-    r = CUDNN_RESULT_NO_ACCUMULATE;
-  } else if (*(float *)beta == 1.0) {
-    r = CUDNN_RESULT_ACCUMULATE;
-  } else {
-    assert(0 && "beta must be 0.0 or 1.0");
-  }
-  return cudnnConvolutionForward(handle, srcDesc, srcData,
-				 filterDesc, filterData,
-				 convDesc, destDesc, destData,
-				 r);
+  const cudnnTensorDescriptor_t gradDesc,
+  void *gradData) {
+  return cudnnConvolutionBackwardData(
+    handle,
+    alpha,
+    filterDesc,
+    filterData,
+    diffDesc,
+    diffData,
+    convDesc,
+    beta,
+    gradDesc,
+    gradData);
 }
-#define cudnnConvolutionForward cudnnConvolutionForward_v2

-static inline cudnnStatus_t
-cudnnConvolutionBackwardFilter_v2(
-  cudnnHandle_t	handle,
-  const void *alpha,
-  const cudnnTensorDescriptor_t srcDesc,
-  const void *srcData,
+typedef int cudnnConvolutionBwdFilterAlgo_t;
+
+#define CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 0
+#define CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 1
+#define CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT 2
+
+static cudnnStatus_t cudnnGetConvolutionBackwardFilterWorkspaceSize(
+  cudnnHandle_t handle,
+  const cudnnTensorDescriptor_t srcDesc,
  const cudnnTensorDescriptor_t diffDesc,
-  const void *diffData,
  const cudnnConvolutionDescriptor_t convDesc,
-  const void *beta,
  const cudnnFilterDescriptor_t gradDesc,
-  void *gradData) {
-  assert(*(float *)alpha == 1.0);
-  cudnnAccumulateResult_t r;
-  if (*(float *)beta == 0.0) {
-    r = CUDNN_RESULT_NO_ACCUMULATE;
-  } else if (*(float *)beta == 1.0) {
-    r = CUDNN_RESULT_ACCUMULATE;
-  } else {
-    assert(0 && "beta must be 0.0 or 1.0");
-  }
-  return cudnnConvolutionBackwardFilter(handle, srcDesc, srcData,
-					diffDesc, diffData,
-					convDesc, gradDesc, gradData,
-					r);
+  cudnnConvolutionBwdFilterAlgo_t algo,
+  size_t *sizeInBytes) {
+  *sizeInBytes = 0;
+  return CUDNN_STATUS_SUCCESS;
 }

-#define cudnnConvolutionBackwardFilter cudnnConvolutionBackwardFilter_v2
-
-static inline cudnnStatus_t
-cudnnConvolutionBackwardData_v2(
-  cudnnHandle_t	handle,
+static cudnnStatus_t cudnnConvolutionBackwardFilter_v3(
+  cudnnHandle_t handle,
  const void *alpha,
-  const cudnnFilterDescriptor_t filterDesc,
-  const void *filterData,
+  const cudnnTensorDescriptor_t srcDesc,
+  const void *srcData,
  const cudnnTensorDescriptor_t diffDesc,
  const void *diffData,
  const cudnnConvolutionDescriptor_t convDesc,
+  cudnnConvolutionBwdFilterAlgo_t algo,
+  void *workspace,
+  size_t workspaceSizeInBytes,
  const void *beta,
-  const cudnnTensorDescriptor_t gradDesc,
+  const cudnnFilterDescriptor_t gradDesc,
  void *gradData) {
-  assert(*(float *)alpha == 1.0);
-  cudnnAccumulateResult_t r;
-  if (*(float *)beta == 0.0) {
-    r = CUDNN_RESULT_NO_ACCUMULATE;
-  } else if (*(float *)beta == 1.0) {
-    r = CUDNN_RESULT_ACCUMULATE;
-  } else {
-    assert(0 && "beta must be 0.0 or 1.0");
-  }
-  /* This function needs the casting because its params are not
-     declared as const */
-  return cudnnConvolutionBackwardData(handle,
-				      (cudnnFilterDescriptor_t)filterDesc,
-				      filterData,
-				      (cudnnTensorDescriptor_t)diffDesc,
-				      diffData,
-				      (cudnnConvolutionDescriptor_t)convDesc,
-				      (cudnnTensorDescriptor_t)gradDesc,
-				      gradData,
-				      r);
+  return cudnnConvolutionBackwardFilter(
+    handle,
+    alpha,
+    srcDesc,
+    srcData,
+    diffDesc,
+    diffData,
+    convDesc,
+    beta,
+    gradDesc,
+    gradData);
 }

-#define cudnnConvolutionBackwardData cudnnConvolutionBackwardData_v2
-
-//Needed for R2 rc2
-# define CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING CUDNN_POOLING_AVERAGE
-#else
-
-// r2 rc1 and rc2 do not have the same macro defined
-// I didn't checked if this the right combination, but as we do not wrap the padding interface, it is fine for now.
-# define CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING ((cudnnPoolingMode_t)1)
-
 #endif

 #endif
--- a/theano/sandbox/gpuarray/dnn.py
+++ b/theano/sandbox/gpuarray/dnn.py
 import os
 import numpy
+import warnings

 import theano
 from theano import Op, Apply, tensor, config, Variable
-from theano.scalar import as_scalar, constant
+from theano.scalar import as_scalar, constant, Log
 from theano.gradient import DisconnectedType, grad_not_implemented
 from theano.gof import Optimizer, local_optimizer, COp
 from theano.gof.cmodule import GCC_compiler
 from theano.gof.type import CDataType, Generic
 from theano.compile import optdb
 from theano.compile.ops import shape_i
-from theano.configparser import AddConfigVar, EnumStr, StrParam
 from theano.tensor.nnet import SoftmaxGrad
 from theano.tensor.signal.downsample import (
    DownsampleFactorMax, MaxPoolGrad, AveragePoolGrad)
@@ -19,6 +19,7 @@ from . import pygpu, init_dev
 from .basic_ops import (as_gpuarray_variable,
                        gpu_contiguous, HostFromGpu,
                        GpuAllocEmpty, empty_like)
+from .elemwise import GpuElemwise
 from .conv import GpuConv

 # These don't exist in gpuarray
@@ -27,21 +28,8 @@ from .nnet import GpuSoftmax
 from .opt import gpu_seqopt, register_opt, conv_groupopt, op_lifter
 from .opt_util import alpha_merge, output_merge

-# This is to avoid conflict with the one in cuda/dnn.py
-if not hasattr(config, 'dnn'):
-    AddConfigVar('dnn.conv.workmem',
-                 "Default value for the workmem attribute of cudnn "
-                 "convolutions.",
-                 EnumStr('small', 'none', 'large'),
-                 in_c_key=False)
-
-AddConfigVar('dnn.include_path',
-             "Location of the cudnn header (defaults to the cuda root)",
-             StrParam(lambda: os.path.join(config.cuda.root, 'include')))
-
-AddConfigVar('dnn.library_path',
-             "Location of the cudnn header (defaults to the cuda root)",
-             StrParam(lambda: os.path.join(config.cuda.root, 'lib64')))
+# We need to import this to define the flags.
+from theano.sandbox import dnn_flags  # noqa


 def dnn_available():
@@ -57,7 +45,7 @@ def dnn_available():
        return False
    # This is a hack because bin_id is in the from of
    # "sm_<major><minor>" for cuda devices.
-    if pygpu.get_default_context().bin_id < 'sm_30':
+    if pygpu.get_default_context().bin_id[-2:] < '30':
        dnn_available.msg = "Device not supported by cuDNN"
        dnn_available.avail = False
    preambule = """
@@ -95,68 +83,26 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
    else:
        # If we can compile, check that we can import and run.
        v = version()
-        if isinstance(v, tuple) and v[0] != v[1]:
+        if v < 2000:
            dnn_available.avail = False
-            dnn_available.msg = ("Mixed dnn version. The header is"
-                                 " from one version, but we link with"
-                                 " a different version %s" % str(v))
+            dnn_available.msg = (
+                "You have an old release of CuDNN (or a release candidate) "
+                "that isn't supported.  Please update to at least v2 final "
+                "version.")
            raise RuntimeError(dnn_available.msg)
-        if version() == (20, 20):
+        if v >= 3000 and v < 3007:
            dnn_available.avail = False
            dnn_available.msg = (
-                "You have installed a release candidate of CuDNN v2."
-                " This isn't supported anymore."
-                " Update to CuDNN v2 final version.")
+                "You have installed a release candidate of CuDNN v3. This "
+                "isn't supported. Please update to v3 final version.")
            raise RuntimeError(dnn_available.msg)
-    return dnn_available.avail

+    return dnn_available.avail

 dnn_available.avail = None
 dnn_available.msg = None


-def c_set_tensor4d(var, desc, err, fail):
-    return """
-{
-  cudnnDataType_t dt;
-  size_t ds;
-  switch (%(var)s->ga.typecode) {
-  case GA_FLOAT:
-    dt = CUDNN_DATA_FLOAT;
-    break;
-  case GA_DOUBLE:
-    dt = CUDNN_DATA_DOUBLE;
-    break;
-  default:
-    PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_tensor4d");
-    return -1;
-  }
-  ds = gpuarray_get_elsize(%(var)s->ga.typecode);
-
-  int str0, str1, str2, str3;
-  // cudnn do not like 0s in strides
-  str3 = PyGpuArray_STRIDES(%(var)s)[3]?PyGpuArray_STRIDES(%(var)s)[3]/ds:1;
-  str2 = PyGpuArray_STRIDES(%(var)s)[2]?PyGpuArray_STRIDES(%(var)s)[2]/ds:PyGpuArray_DIMS(%(var)s)[3];
-  str1 = PyGpuArray_STRIDES(%(var)s)[1]?PyGpuArray_STRIDES(%(var)s)[1]/ds:PyGpuArray_DIMS(%(var)s)[2]*PyGpuArray_DIMS(%(var)s)[3];
-  str0 = PyGpuArray_STRIDES(%(var)s)[0]?PyGpuArray_STRIDES(%(var)s)[0]/ds:PyGpuArray_DIMS(%(var)s)[2]*PyGpuArray_DIMS(%(var)s)[3]*PyGpuArray_DIMS(%(var)s)[1];
-  %(err)s = cudnnSetTensor4dDescriptorEx(
-    %(desc)s, dt,
-    PyGpuArray_DIMS(%(var)s)[0],
-    PyGpuArray_DIMS(%(var)s)[1],
-    PyGpuArray_DIMS(%(var)s)[2],
-    PyGpuArray_DIMS(%(var)s)[3],
-    str0, str1, str2, str3);
-
-  if (%(err)s != CUDNN_STATUS_SUCCESS) {
-    PyErr_Format(PyExc_RuntimeError,
-    "could not set tensor4d descriptor: %%s",
-    cudnnGetErrorString(%(err)s));
-    %(fail)s
-  }
-}
-        """ % dict(var=var, err=err, desc=desc, fail=fail)
-
-
 class DnnBase(COp):
    """
    Creates a handle for cudnn and pulls in the cudnn libraries and headers.
@@ -166,13 +112,15 @@ class DnnBase(COp):
    # the input broadcasting pattern.
    check_broadcast = False

-    def __init__(self):
-        COp.__init__(self, "dnn_base.c")
+    def __init__(self, files=None, c_func=None):
+        if files is None:
+            files = []
+        COp.__init__(self, ["dnn_base.c"] + files, c_func)

    def c_headers(self):
        return ['cudnn.h', 'cudnn_helper.h', 'gpuarray_helper.h',
                'gpuarray/types.h', 'gpuarray/array.h', 'gpuarray/util.h',
-                'gpuarray_api.h', 'numpy_compat.h']
+                'gpuarray/ext_cuda.h', 'gpuarray_api.h', 'numpy_compat.h']

    def c_header_dirs(self):
        return [os.path.dirname(__file__), pygpu.get_include(),
@@ -184,9 +132,11 @@ class DnnBase(COp):
    def c_lib_dirs(self):
        return [config.dnn.library_path]

+    def c_code_cache_version(self):
+        return (super(DnnBase, self).c_code_cache_version(), version())

-class DnnVersion(Op):

+class DnnVersion(Op):
    __props__ = ()

    def c_headers(self):
@@ -214,11 +164,7 @@ class DnnVersion(Op):
    def c_code(self, node, name, inputs, outputs, sub):
        o = outputs[0]
        return """
-        #if defined(CUDNN_VERSION)
        %(o)s = PyTuple_Pack(2, PyInt_FromLong(CUDNN_VERSION), PyInt_FromLong(cudnnGetVersion()));
-        #else
-        %(o)s = PyInt_FromLong(-1);
-        #endif
        """ % locals()

    def do_constant_folding(self, node):
@@ -232,11 +178,9 @@ class DnnVersion(Op):

 def version():
    """
-    Return the current cuDNN version we compile with.
-
-    This return a tuple with the header version and the library version we link
-    with. For older cudnn version without version information, we return -1.
+    Return the current cuDNN version we link with.

+    This also does a check that the header version matches the runtime version.
    """
    if not dnn_available():
        raise Exception(
@@ -247,12 +191,16 @@ def version():
        f = theano.function([], DnnVersion()(),
                            theano.Mode(optimizer=None),
                            profile=False)
-        version.v = f()
+        v = f()
+        if v[0] != v[1]:
+            raise RuntimeError("Mixed dnn version. The header is version %s "
+                               "while the library is version %s." % v)
+        version.v = v[1]
    return version.v
 version.v = None


-class GpuDnnConvDesc(Op):
+class GpuDnnConvDesc(COp):
    """
    This Op builds a convolution descriptor for use in the other convolution
    operations.
@@ -275,12 +223,17 @@ class GpuDnnConvDesc(Op):
    def c_lib_dirs(self):
        return [config.dnn.library_path]

+    def do_constant_folding(self, node):
+        return False
+
    def __init__(self, border_mode, subsample=(1, 1), conv_mode='conv'):
+        COp.__init__(self, ["conv_desc.c"], "APPLY_SPECIFIC(conv_desc)")
+
        if isinstance(border_mode, int):
-            border_mode = (border_mode, border_mode)
+            border_mode = (border_mode,) * len(subsample)
        if isinstance(border_mode, tuple):
-            pad_h, pad_w = map(int, border_mode)
-            border_mode = (pad_h, pad_w)
+            assert len(border_mode) == len(subsample)
+            border_mode = tuple(map(int, border_mode))
        if not ((isinstance(border_mode, tuple) and min(border_mode) >= 0) or
                border_mode in ('valid', 'full')):
            raise ValueError(
@@ -288,105 +241,56 @@ class GpuDnnConvDesc(Op):
                '"valid", "full", an integer or a pair of'
                ' integers'.format(border_mode))
        self.border_mode = border_mode
-        assert len(subsample) == 2
+        assert len(subsample) in (2, 3)
        self.subsample = subsample
        assert conv_mode in ('conv', 'cross')
        self.conv_mode = conv_mode

-    def make_node(self, img_shape, kern_shape):
-        if img_shape.type.ndim != 1 or img_shape.type.dtype != 'int64':
-            raise TypeError('img must be 1D shape tensor')
+    def make_node(self, kern_shape):
        if kern_shape.type.ndim != 1 or kern_shape.type.dtype != 'int64':
            raise TypeError('kern must be 1D shape tensor')

-        return Apply(self, [img_shape, kern_shape],
+        return Apply(self, [kern_shape],
                     [CDataType("cudnnConvolutionDescriptor_t",
                                freefunc="cudnnDestroyConvolutionDescriptor")()])

-    def c_code(self, node, name, inputs, outputs, sub):
-        img_shape, kern_shape = inputs
-        desc, = outputs
-
+    def get_op_params(self):
+        pad0 = '0'
+        pad1 = '0'
+        pad2 = '0'
        if isinstance(self.border_mode, tuple):
-            pad_h_spec, pad_w_spec = map(int, self.border_mode)
-            assert pad_h_spec >= 0 and pad_w_spec >= 0
-            bmode = 2
+            pad0 = str(self.border_mode[0])
+            pad1 = str(self.border_mode[1])
+            if len(self.border_mode) > 2:
+                pad2 = str(self.border_mode[2])
+            bmode = '2'
+        elif self.border_mode == "valid":
+            bmode = '1'
+        elif self.border_mode == "full":
+            bmode = '0'
        else:
-            pad_h_spec = pad_w_spec = 0
-
-            if self.border_mode == "valid":
-                bmode = 1
-            else:
-                assert self.border_mode == "full"
-                bmode = 0
+            raise ValueError("Invalid value for border_mode")

        if self.conv_mode == 'conv':
            conv_flag = 'CUDNN_CONVOLUTION'
        else:
            conv_flag = 'CUDNN_CROSS_CORRELATION'

-        return """
-{
-  cudnnStatus_t err;
-  int pad_h%(name)s;
-  int pad_w%(name)s;
-
-  if ((err = cudnnCreateConvolutionDescriptor(&%(desc)s)) != CUDNN_STATUS_SUCCESS) {
-    PyErr_Format(PyExc_MemoryError, "could not allocate convolution "
-                 "descriptor: %%s", cudnnGetErrorString(err));
-    %(fail)s
-  }
+        sub0 = str(self.subsample[0])
+        sub1 = str(self.subsample[1])
+        if len(self.subsample) > 2:
+            sub2 = str(self.subsample[2])
+        else:
+            sub2 = '0'

-  if (%(bmode)d == 2) {
-    pad_h%(name)s = %(pad_h_spec)d;
-    pad_w%(name)s = %(pad_w_spec)d;
-  } else if (%(bmode)d == 1) {
-    pad_h%(name)s = 0;
-    pad_w%(name)s = 0;
-  } else if (%(bmode)d == 0) {
-    pad_h%(name)s = *(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 2) - 1;
-    pad_w%(name)s = *(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 3) - 1;
-  } else {
-    PyErr_SetString(PyExc_ValueError, "bad border mode");
-    %(fail)s
-  }
-#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 20
-  err = cudnnSetConvolution2dDescriptor(
-  %(desc)s,
-  pad_h%(name)s,
-  pad_w%(name)s,
-  %(subsx)d, %(subsy)d, 1, 1,
-  %(conv_flag)s
-  );
-#else
-  err = cudnnSetConvolutionDescriptorEx(
-  %(desc)s,
-  *(npy_int64 *)PyArray_GETPTR1(%(img_shape)s, 0),
-  *(npy_int64 *)PyArray_GETPTR1(%(img_shape)s, 1),
-  *(npy_int64 *)PyArray_GETPTR1(%(img_shape)s, 2),
-  *(npy_int64 *)PyArray_GETPTR1(%(img_shape)s, 3),
-  *(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 0),
-  *(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 2),
-  *(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 3),
-  pad_h%(name)s,
-  pad_w%(name)s,
-  %(subsx)d, %(subsy)d, 1, 1,
-  %(conv_flag)s
-  );
-#endif
-  if (err != CUDNN_STATUS_SUCCESS) {
-    PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: %%s",
-                 cudnnGetErrorString(err));
-    %(fail)s
-  }
-}
-""" % dict(name=name, img_shape=img_shape, kern_shape=kern_shape, desc=desc,
-           bmode=bmode, conv_flag=conv_flag, fail=sub['fail'],
-           subsx=self.subsample[0], subsy=self.subsample[1],
-           pad_h_spec=pad_h_spec, pad_w_spec=pad_w_spec)
+        return [('NB_DIMS', str(len(self.subsample))),
+                ('BORDER_MODE', bmode),
+                ('PAD_0', pad0), ('PAD_1', pad1), ('PAD_2', pad2),
+                ('CONV_MODE', conv_flag),
+                ('SUB_0', sub0), ('SUB_1', sub1), ('SUB_2', sub2)]

    def c_code_cache_version(self):
-        return (1, version())
+        return (super(GpuDnnConvDesc, self).c_code_cache_version(), version())

 # scalar constants
 _zero = constant(numpy.asarray(0.0, dtype='float64'))
@@ -407,7 +311,7 @@ def ensure_dt(val, default, name, dtype):
    return val


-class GpuDnnConv(DnnBase, COp):
+class GpuDnnConv(DnnBase):
    """
    The forward convolution.

@@ -417,55 +321,97 @@ class GpuDnnConv(DnnBase, COp):
    kernel
    descr
        The convolution descriptor.
-    workmem
-        Either 'none', 'small' or 'large'. Default is the value of
-        :attr:`config.dnn.conv.workmem`.
+    algo : {'small', 'none', 'large', 'fft', 'guess_once', 'guess_on_shape_change', 'time_once', 'time_on_shape_change'}
+        Default is the value of :attr:`config.dnn.conv.algo_fwd`.

    """

-    __props__ = ('workmem', 'inplace')
+    __props__ = ('algo', 'inplace')
+
+    def __init__(self, algo=None, inplace=False):
+        DnnBase.__init__(self, ["dnn_conv_base.c", "dnn_fwd.c"],
+                         "APPLY_SPECIFIC(conv_fwd)")
+
+        if algo is None:
+            algo = config.dnn.conv.algo_fwd
+        self.algo = algo

-    def __init__(self, workmem=None, inplace=False):
-        COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_fwd.c"],
-                     "APPLY_SPECIFIC(conv_fwd)")
-        if workmem is None:
-            workmem = config.dnn.conv.workmem
-        self.workmem = workmem
        self.inplace = inplace
        if self.inplace:
            self.destroy_map = {0: [2]}
-        assert self.workmem in ['none', 'small', 'large']
+
+        if version() < 3000:
+            if self.algo == 'fft':
+                raise RuntimeError("CuDNN FFT convolution requires CuDNN v3")
+            elif self.algo in ['guess_once', 'guess_on_shape_change']:
+                raise RuntimeError("CuDNN selection of convolution "
+                                   "implementation based on heuristics "
+                                   "requires CuDNN v3")
+            elif self.algo in ['time_once', 'time_on_shape_change']:
+                raise RuntimeError("CuDNN convolution timing requires CuDNN v3")
+
+        assert self.algo in ['none', 'small', 'large', 'fft', 'guess_once',
+                             'guess_on_shape_change', 'time_once',
+                             'time_on_shape_change']
+
+    def __setstate__(self, d):
+        self.__dict__.update(d)
+        if not hasattr(self, 'algo'):
+            if hasattr(self, 'workmem'):
+                self.algo = self.workmem
+            else:
+                self.algo = config.dnn.conv.algo_fwd
+        if not hasattr(self, 'inplace'):
+            self.inplace = False

    def get_op_params(self):
+        defs = []
        if self.inplace:
-            inpl_def = [('CONV_INPLACE', '1')]
-        else:
-            inpl_def = []
-        if version() == -1:
-            alg_def = ('CONV_ALGO', "0")
-        else:
-            if self.workmem == 'none':
-                alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM'
-            elif self.workmem == 'small':
-                alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM'
-            elif self.workmem == 'large':
-                alg = 'CUDNN_CONVOLUTION_FWD_ALGO_GEMM'
-            alg_def = ('CONV_ALGO', alg)
-        return [alg_def] + inpl_def
+            defs.append(('CONV_INPLACE', '1'))
+
+        alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM'
+        if self.algo == 'none':
+            alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM'
+        elif self.algo == 'small':
+            alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM'
+        elif self.algo == 'large':
+            alg = 'CUDNN_CONVOLUTION_FWD_ALGO_GEMM'
+        elif self.algo == 'fft':
+            alg = 'CUDNN_CONVOLUTION_FWD_ALGO_FFT'
+        defs.append(('CONV_ALGO', alg))
+
+        if self.algo in ['guess_once', 'guess_on_shape_change',
+                         'time_once', 'time_on_shape_change']:
+            defs.append(('CHOOSE_ALGO', ''))
+        if self.algo in ['guess_once', 'time_once']:
+            defs.append(('CHOOSE_ONCE', ''))
+        if self.algo in ['time_once', 'time_on_shape_change']:
+            defs.append(('CHOOSE_TIME', ''))
+
+        return defs

    def make_node(self, img, kern, output, desc, alpha=None, beta=None):
        img = as_gpuarray_variable(img)
        kern = as_gpuarray_variable(kern)
        output = as_gpuarray_variable(output)
-        if img.type.ndim != 4:
-            raise TypeError('img must be 4D tensor')
-        if kern.type.ndim != 4:
-            raise TypeError('kern must be 4D tensor')
-        if output.type.ndim != 4:
-            raise TypeError('output must be a 4D tensor')
-
-        if not isinstance(desc.type, CDataType) \
-                or desc.type.ctype != 'cudnnConvolutionDescriptor_t':
+        if img.type.ndim not in (4, 5):
+            raise TypeError('img must be 4D or 5D tensor')
+        if kern.type.ndim not in (4, 5):
+            raise TypeError('kern must be 4D or 5D tensor')
+        if output.type.ndim not in (4, 5):
+            raise TypeError('output must be a 4D or 5D tensor')
+
+        if (img.type.ndim != kern.type.ndim or
+                img.type.ndim != output.type.ndim):
+            raise TypeError("The number of dimensions of "
+                            "img, kern and output must match")
+
+        if img.type.ndim == 5 and self.algo == 'fft':
+            raise ValueError("convolution algo fft can't be used for "
+                             "3d convolutions")
+
+        if (not isinstance(desc.type, CDataType) or
+                desc.type.ctype != 'cudnnConvolutionDescriptor_t'):
            raise TypeError('desc must be cudnnConvolutionDescriptor_t')

        alpha = ensure_dt(alpha, _one, 'alpha', img.dtype)
@@ -507,28 +453,47 @@ class GpuDnnConv(DnnBase, COp):
        kh = kshape[2]  # Height of each filter
        kw = kshape[3]  # Width of each filter

-        sh, sw = subsample
+        nd = len(subsample)  # Number of spatial dimensions (2 or 3)
+
+        if nd > 2:
+            d = ishape[4]  # Depth of the input (3d convolution only)
+            kd = kshape[4]  # Depth of each filter
+
+        sh = subsample[0]
+        sw = subsample[1]
+        if nd > 2:
+            sd = subsample[2]
+
        if border_mode == 'full':
            padh = kh - 1
            padw = kw - 1
+            if nd > 2:
+                padd = kd - 1
        elif isinstance(border_mode, tuple):
-            padh, padw = border_mode
+            padh = border_mode[0]
+            padw = border_mode[1]
+            if nd > 2:
+                padd = border_mode[2]
        else:
            assert border_mode == 'valid'
            padh = 0
            padw = 0
+            padd = 0

-        return (
-            b, nb,
-            (h + 2 * padh - kh) // sh + 1,
-            (w + 2 * padw - kw) // sw + 1
-        )
+        res = [b, nb,
+               (h + 2 * padh - kh) // sh + 1,
+               (w + 2 * padw - kw) // sw + 1]
+
+        if nd > 2:
+            res.append((d + 2 * padd - kd) // sd + 1)  # same rule as h, w
+
+        return res

    def infer_shape(self, node, shape):
        return [shape[2]]


-class GpuDnnConvGradW(DnnBase, COp):
+class GpuDnnConvGradW(DnnBase):
    """
    The convolution gradient with respect to the weights.

@@ -541,19 +506,27 @@ class GpuDnnConvGradW(DnnBase, COp):

    """

-    __props__ = ('inplace',)
+    __props__ = ('algo', 'inplace')

-    def __init__(self, inplace=False):
-        COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_gw.c"],
-                     "APPLY_SPECIFIC(conv_gw)")
+    def __init__(self, inplace=False, algo=None):
+        DnnBase.__init__(self, ["dnn_conv_base.c", "dnn_gw.c"],
+                         "APPLY_SPECIFIC(conv_gw)")
        self.inplace = inplace
        if self.inplace:
            self.destroy_map = {0: [2]}
+        if algo is None:
+            algo = config.dnn.conv.algo_bwd
+        self.algo = algo
+        assert self.algo in ['none', 'deterministic', 'fft', 'guess_once',
+                             'guess_on_shape_change', 'time_once',
+                             'time_on_shape_change']

    def __setstate__(self, d):
        self.__dict__.update(d)
        if not hasattr(self, 'inplace'):
            self.inplace = False
+        if not hasattr(self, 'algo'):
+            self.algo = config.dnn.conv.algo_bwd

    def grad(self, inp, grads):
        img, top, output, desc, alpha, beta = inp
@@ -574,24 +547,55 @@ class GpuDnnConvGradW(DnnBase, COp):
        return [[1], [1], [1], [0], [1], [1]]

    def get_op_params(self):
+        defs = []
        if self.inplace:
-            return [('CONV_INPLACE', '1')]
+            defs.append(('CONV_INPLACE', '1'))
+
+        if version() < 3000:
+            alg = '0'
        else:
-            return []
+            alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0'
+            if self.algo == 'none':
+                alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0'
+            if self.algo == 'deterministic':
+                alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1'
+            if self.algo == 'fft':
+                alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT'
+
+            if self.algo in ['guess_once', 'guess_on_shape_change',
+                             'time_once', 'time_on_shape_change']:
+                defs.append(('CHOOSE_ALGO', ''))
+            if self.algo in ['guess_once', 'time_once']:
+                defs.append(('CHOOSE_ONCE', ''))
+            if self.algo in ['time_once', 'time_on_shape_change']:
+                defs.append(('CHOOSE_TIME', ''))
+
+        defs.append(('CONV_ALGO', alg))
+
+        return defs

    def make_node(self, img, topgrad, output, desc, alpha=None, beta=None):
        img = as_gpuarray_variable(img)
        topgrad = as_gpuarray_variable(topgrad)
        output = as_gpuarray_variable(output)
-        if img.type.ndim != 4:
-            raise TypeError('img must be 4D tensor')
-        if topgrad.type.ndim != 4:
-            raise TypeError('topgrad must be 4D tensor')
-        if output.type.ndim != 4:
-            raise TypeError('output must be 4D tensor')
-
-        if not isinstance(desc.type, CDataType) \
-                or desc.type.ctype != 'cudnnConvolutionDescriptor_t':
+        if img.type.ndim not in (4, 5):
+            raise TypeError('img must be 4D or 5D tensor')
+        if topgrad.type.ndim not in (4, 5):
+            raise TypeError('topgrad must be 4D or 5D tensor')
+        if output.type.ndim not in (4, 5):
+            raise TypeError('output must be 4D or 5D tensor')
+
+        if (img.type.ndim != topgrad.type.ndim or
+                img.type.ndim != output.type.ndim):
+            raise TypeError("The number of dimensions of "
+                            "img, topgrad and output must match")
+
+        if img.type.ndim == 5 and self.algo in ['fft', 'deterministic']:
+            raise ValueError("convolution algo %s can't be used for "
+                             "3d convolutions" % (self.algo,))
+
+        if (not isinstance(desc.type, CDataType) or
+                desc.type.ctype != 'cudnnConvolutionDescriptor_t'):
            raise TypeError('desc must be cudnnConvolutionDescriptor_t')

        alpha = ensure_dt(alpha, _one, 'alpha', img.dtype)
@@ -617,14 +621,27 @@ class GpuDnnConvGradI(DnnBase):

    """

-    __props__ = ('inplace',)
+    __props__ = ('algo', 'inplace',)

-    def __init__(self, inplace=False):
-        COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_gi.c"],
-                     "APPLY_SPECIFIC(conv_gi)")
+    def __init__(self, inplace=False, algo=None):
+        DnnBase.__init__(self, ["dnn_conv_base.c", "dnn_gi.c"],
+                         "APPLY_SPECIFIC(conv_gi)")
        self.inplace = inplace
        if self.inplace:
            self.destroy_map = {0: [2]}
+        if algo is None:
+            algo = config.dnn.conv.algo_bwd
+        self.algo = algo
+        assert self.algo in ['none', 'deterministic', 'fft', 'guess_once',
+                             'guess_on_shape_change', 'time_once',
+                             'time_on_shape_change']
+
+    def __setstate__(self, d):
+        self.__dict__.update(d)
+        if not hasattr(self, 'algo'):
+            self.algo = config.dnn.conv.algo_bwd
+        if not hasattr(self, 'inplace'):
+            self.inplace = False

    def grad(self, inp, grads):
        kerns, top, output, desc, alpha, beta = inp
@@ -645,24 +662,55 @@ class GpuDnnConvGradI(DnnBase):
        return [[1], [1], [1], [0], [1], [1]]

    def get_op_params(self):
+        defs = []
        if self.inplace:
-            return [('CONV_INPLACE', '1')]
+            defs.append(('CONV_INPLACE', '1'))
+
+        if version() < 3000:
+            alg = '0'
        else:
-            return []
+            alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_0'
+            if self.algo == 'none':
+                alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_0'
+            if self.algo == 'deterministic':
+                alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_1'
+            if self.algo == 'fft':
+                alg = 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT'
+
+            if self.algo in ['guess_once', 'guess_on_shape_change',
+                             'time_once', 'time_on_shape_change']:
+                defs.append(('CHOOSE_ALGO', ''))
+            if self.algo in ['guess_once', 'time_once']:
+                defs.append(('CHOOSE_ONCE', ''))
+            if self.algo in ['time_once', 'time_on_shape_change']:
+                defs.append(('CHOOSE_TIME', ''))
+
+        defs.append(('CONV_ALGO', alg))
+
+        return defs

    def make_node(self, kern, topgrad, output, desc, alpha=None, beta=None):
        kern = as_gpuarray_variable(kern)
        topgrad = as_gpuarray_variable(topgrad)
        output = as_gpuarray_variable(output)
-        if kern.type.ndim != 4:
-            raise TypeError('kern must be 4D tensor')
-        if topgrad.type.ndim != 4:
-            raise TypeError('topgrad must be 4D tensor')
-        if output.type.ndim != 4:
-            raise TypeError('output must be 4D tensor')
-
-        if not isinstance(desc.type, CDataType) \
-                or desc.type.ctype != 'cudnnConvolutionDescriptor_t':
+        if kern.type.ndim not in (4, 5):
+            raise TypeError('kern must be 4D or 5D tensor')
+        if topgrad.type.ndim not in (4, 5):
+            raise TypeError('topgrad must be 4D or 5D tensor')
+        if output.type.ndim not in (4, 5):
+            raise TypeError('output must be 4D or 5D tensor')
+
+        if (kern.type.ndim != topgrad.type.ndim or
+                kern.type.ndim != output.type.ndim):
+            raise TypeError("The number of dimensions of "
+                            "kern, topgrad and output must match")
+
+        if kern.type.ndim == 5 and self.algo in ['fft', 'deterministic']:
+            raise ValueError("convolution algo %s can't be used for "
+                             "3d convolutions" % (self.algo,))
+
+        if (not isinstance(desc.type, CDataType) or
+                desc.type.ctype != 'cudnnConvolutionDescriptor_t'):
            raise TypeError('desc must be cudnnConvolutionDescriptor_t')

        alpha = ensure_dt(alpha, _one, 'alpha', kern.dtype)
@@ -676,7 +724,8 @@ class GpuDnnConvGradI(DnnBase):


 def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
-             conv_mode='conv', direction_hint=None, workmem=None):
+             conv_mode='conv', direction_hint=None, workmem=None,
+             algo=None):
    """
    GPU convolution using cuDNN from NVIDIA.

@@ -700,22 +749,27 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
    direction_hint
        Used by graph optimizers to change algorithm choice.
        By default, GpuDnnConv will be used to carry out the convolution.
-        If border_mode is 'valid', subsample is (1,1) and direction_hint is
+        If border_mode is 'valid', subsample is (1, 1) and direction_hint is
        'bprop weights', it will use GpuDnnConvGradW.
-        If border_mode is 'full', subsample is (1,1) and direction_hint is
+        If border_mode is 'full', subsample is (1, 1) and direction_hint is
        *not* 'forward!', it will use GpuDnnConvGradI.
        This parameter is used internally by graph optimizers and may be
        removed at any time without a deprecation period. You have been warned.
-    workmem
-        Specify the amount of working memory allowed. More memory is usually
-        faster.  One of 'none', 'small' or 'large' (default is None which takes
-        its value from :attr:`config.dnn.conv.workmem`).
+    algo : {'none', 'small', 'large', 'fft', 'guess_once', 'guess_on_shape_change', 'time_once', 'time_on_shape_change'}
+        Convolution implementation to use. Some of its values may
+        require certain versions of CuDNN to be installed. Default is
+        the value of :attr:`config.dnn.conv.algo_fwd`.

-    .. warning:: The cuDNN library only works with GPU that have a compute
-        capability of 3.0 or higer.  This means that older GPU will not
+    .. warning:: The cuDNN library only works with GPUs that have a compute
+        capability of 3.0 or higher. This means that older GPUs will not
        work with this Op.

    """
+    if workmem is not None:
+        if algo is not None:
+            raise ValueError("You can't use both algo and workmem")
+        warnings.warn("workmem is deprecated, use algo instead", stacklevel=2)
+        algo = workmem
    fgraph = getattr(img, 'fgraph', None) or getattr(kerns, 'fgraph', None)
    if (border_mode == 'valid' and subsample == (1, 1) and
            direction_hint == 'bprop weights'):
@@ -732,7 +786,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
        out = GpuAllocEmpty(img.dtype)(shape_i(kerns, 1, fgraph),
                                       shape_i(img, 1, fgraph), shape2, shape3)
        desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
-                              conv_mode='cross')(img.shape, out.shape)
+                              conv_mode='cross')(out.shape)
        conv = GpuDnnConvGradW()(img, kerns, out, desc)
        return as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3))

@@ -741,7 +795,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
        # Special case: We can be faster by using GpuDnnConvGradI to compute
        # the full convolution as the backward pass of a valid convolution.
        # We just need to set up a suitable 'fake' valid convolution.
-        img = gpu_contiguous(img)  # cudnn v1 and v2 rc3 need contiguous data
+        img = gpu_contiguous(img)  # cudnn v2 rc3 needs contiguous data
        kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
        conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
        shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1
@@ -750,7 +804,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
                                       shape_i(kerns, 1, fgraph),
                                       shape2, shape3)
        desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
-                              conv_mode=conv_mode)(out.shape, kerns.shape)
+                              conv_mode=conv_mode)(kerns.shape)
        return GpuDnnConvGradI()(kerns, img, out, desc)

    # Standard case: We use GpuDnnConv with suitable padding.
@@ -759,13 +813,13 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
    img = gpu_contiguous(img)
    kerns = gpu_contiguous(kerns)
    desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
-                          conv_mode=conv_mode)(img.shape, kerns.shape)
+                          conv_mode=conv_mode)(kerns.shape)
    desc_op = desc.owner.op
    out_shp = GpuDnnConv.get_out_shape(img.shape, kerns.shape,
                                       desc_op.border_mode,
                                       desc_op.subsample)
    out = GpuAllocEmpty(img.dtype)(*out_shp)
-    return GpuDnnConv(workmem=workmem)(img, kerns, out, desc)
+    return GpuDnnConv(algo=algo)(img, kerns, out, desc)


 class GpuDnnPoolDesc(Op):
@@ -773,18 +827,18 @@ class GpuDnnPoolDesc(Op):
    This Op builds a pooling descriptor for use in the other
    pooling operations.

+    `ws`, `stride` and `pad` must have the same length.
+
    Parameters
    ----------
-    ws
-        Windows size.
-    stride
-        (dx, dy).
+    ws : tuple
+        Window size.
+    stride : tuple
+        (dx, dy) or (dx, dy, dz).
    mode : {'max', 'average_inc_pad', 'average_exc_pad'}
-        The old deprecated name 'average' correspond to 'average_inc_pad'.
-    pad
-        (padX, padY) padding information.
-        padX is the size of the left and right borders,
-        padY is the size of the top and bottom borders.
+        The old deprecated name 'average' corresponds to 'average_inc_pad'.
+    pad : tuple
+        (padX, padY) or (padX, padY, padZ)

    """

@@ -810,14 +864,18 @@ class GpuDnnPoolDesc(Op):
            mode = 'average_inc_pad'
        assert mode in ('max', 'average_inc_pad', 'average_exc_pad')
        self.mode = mode
-        assert len(ws) == 2
+
+        assert len(ws) == len(stride) and len(stride) == len(pad)
+        assert len(ws) in (2, 3)
        self.ws = ws
-        assert len(stride) == 2
        self.stride = stride
-        assert len(stride) == 2
        self.pad = pad
-        if (pad[0] != 0 or pad[1] != 0) and version() == -1:
-            raise RuntimeError("CuDNN pooling with padding requires CuDNN v2")
+
+        if self.get_ndim() == 3 and version() < 3000:
+            raise RuntimeError("CuDNN 3d pooling requires v3")
+
+    def get_ndim(self):
+        return len(self.ws)

    def __setstate__(self, d):
        self.__dict__.update(d)
@@ -825,9 +883,6 @@ class GpuDnnPoolDesc(Op):
            self.pad = (0, 0)

    def make_node(self):
-        if self.pad != (0, 0) and version() == -1:
-            raise RuntimeError("CuDNN pooling with padding requires CuDNN v2")
-
        return Apply(self, [],
                     [CDataType("cudnnPoolingDescriptor_t",
                                freefunc="cudnnDestroyPoolingDescriptor")()])
@@ -841,8 +896,6 @@ class GpuDnnPoolDesc(Op):
            mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING'
        elif self.mode == "average_exc_pad":
            mode_flag = 'CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING'
-            if version() == -1:
-                raise Exception("cudnn v1 do not support average_exc_pad")
        else:
            raise NotImplementedError("Unsupported pooling model.")

@@ -855,22 +908,13 @@ class GpuDnnPoolDesc(Op):
                 "descriptor: %%s", cudnnGetErrorString(err));
    %(fail)s
  }
-#ifndef CUDNN_VERSION
-  err = cudnnSetPoolingDescriptor(
-  %(desc)s,
-  %(mode_flag)s,
-  %(wsX)d, %(wsY)d,
-  %(stridex)d, %(stridey)d
-  );
-#else
-  err = cudnnSetPooling2dDescriptor(
-  %(desc)s,
-  %(mode_flag)s,
-  %(wsX)d, %(wsY)d,
-  %(padX)d, %(padY)d,
-  %(stridex)d, %(stridey)d
-  );
-#endif
+
+  static const int win[%(nd)d] = {%(win)s};
+  static const int pad[%(nd)d] = {%(pad)s};
+  static const int str[%(nd)d] = {%(str)s};
+  err = cudnnSetPoolingNdDescriptor(
+    %(desc)s, %(mode_flag)s, %(nd)d,
+    win, pad, str);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: %%s",
                 cudnnGetErrorString(err));
@@ -878,12 +922,12 @@ class GpuDnnPoolDesc(Op):
  }
 }
 """ % dict(name=name, desc=desc, mode_flag=mode_flag, fail=sub['fail'],
-           wsX=self.ws[0], wsY=self.ws[1],
-           stridex=self.stride[0], stridey=self.stride[1],
-           padX=self.pad[0], padY=self.pad[1])
+           nd=self.get_ndim(), win=', '.join(map(str, self.ws)),
+           pad=', '.join(map(str, self.pad)),
+           str=', '.join(map(str, self.stride)))

    def c_code_cache_version(self):
-        return (2, version())
+        return (3, version())


 class GpuDnnPool(DnnBase):
@@ -901,146 +945,36 @@ class GpuDnnPool(DnnBase):

    __props__ = ()

+    def __init__(self):
+        DnnBase.__init__(self, ["dnn_pool.c"], "APPLY_SPECIFIC(dnn_pool)")
+
    def make_node(self, img, desc):
        img = as_gpuarray_variable(img)
-        if img.type.ndim != 4:
-            raise TypeError('img must be 4D tensor')

-        if not isinstance(desc.type, CDataType) \
-                or desc.type.ctype != 'cudnnPoolingDescriptor_t':
+        if desc.owner is not None:
+            e_ndim = desc.owner.op.get_ndim() + 2
+
+            if img.type.ndim != e_ndim:
+                raise TypeError('img must be %dD tensor' % (e_ndim,))
+
+        if (not isinstance(desc.type, CDataType) or
+                desc.type.ctype != 'cudnnPoolingDescriptor_t'):
            raise TypeError('desc must be cudnnPoolingDescriptor_t')

-        return Apply(self, [img, desc],
-                     [img.type()])
+        return Apply(self, [img, desc], [img.type()])

    def infer_shape(self, node, shape):
        desc = node.inputs[1].owner.op
-        kh, kw = desc.ws
-        sh, sw = desc.stride
-        padh, padw = desc.pad
-        return [(
-            shape[0][0],
-            shape[0][1],
-            (shape[0][2] + 2 * padh - kh) // sh + 1,
-            (shape[0][3] + 2 * padw - kw) // sw + 1
-        )]
-
-    def c_support_code_struct(self, node, name):
-        return """
-cudnnTensorDescriptor_t input%(name)s;
-cudnnTensorDescriptor_t output%(name)s;
-""" % dict(name=name)
-
-    def c_init_code_struct(self, node, name, sub):
-        return """
-cudnnStatus_t err%(name)s;
-input%(name)s = NULL;
-output%(name)s = NULL;
-if ((err%(name)s = cudnnCreateTensorDescriptor(&input%(name)s)) != CUDNN_STATUS_SUCCESS) {
-  PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
-               "(inp): %%s", cudnnGetErrorString(err%(name)s));
-  %(fail)s
-}
-if ((err%(name)s = cudnnCreateTensorDescriptor(&output%(name)s)) != CUDNN_STATUS_SUCCESS) {
-  PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
-               "(out): %%s", cudnnGetErrorString(err%(name)s));
-  %(fail)s
-}
-""" % dict(name=name, fail=sub['fail'])
-
-    def c_cleanup_code_struct(self, node, name):
-        return """
-if (input%(name)s != NULL) { cudnnDestroyTensorDescriptor(input%(name)s); }
-if (output%(name)s != NULL) { cudnnDestroyTensorDescriptor(output%(name)s); }
-""" % dict(name=name)
-
-    def c_code(self, node, name, inputs, outputs, sub):
-        desc = inputs[1]
-        out, = outputs
-
-        set_in = c_set_tensor4d(inputs[0], "input" + str(name),
-                                'err' + name, sub['fail'])
-
-        set_out = c_set_tensor4d(out, "output" + str(name),
-                                 'err' + name, sub['fail'])
-
-        return """
-cudnnStatus_t err%(name)s;
-
-size_t %(out)s_dims[4];
-
-if (!GpuArray_IS_C_CONTIGUOUS(&%(input)s->ga)) {
-  PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
-  %(fail)s
-}
-
-%(set_in)s
-
-cudnnPoolingMode_t mode;
-int wsX, wsY, vpad, hpad, strideX, strideY;
-#ifndef CUDNN_VERSION
-err%(name)s = cudnnGetPoolingDescriptor(
-        %(desc)s, &mode,
-        &wsX, &wsY,
-        &strideX, &strideY);
-#else
-err%(name)s = cudnnGetPooling2dDescriptor(
-        %(desc)s, &mode,
-        &wsX, &wsY,
-        &vpad, &hpad,
-        &strideX, &strideY);
-#endif
-
-if (err%(name)s != CUDNN_STATUS_SUCCESS) {
-  PyErr_Format(PyExc_RuntimeError,
-               "GpuDnnPool: error doing cudnnGetPoolingDescriptor operation: %%s",
-               cudnnGetErrorString(err%(name)s));
-  %(fail)s
-}
-
-%(out)s_dims[0] = PyGpuArray_DIMS(%(input)s)[0];
-%(out)s_dims[1] = PyGpuArray_DIMS(%(input)s)[1];
-%(out)s_dims[2] = (PyGpuArray_DIMS(%(input)s)[2] + (vpad*2) - wsX) / strideX + 1;
-%(out)s_dims[3] = (PyGpuArray_DIMS(%(input)s)[3] + (hpad*2) - wsY) / strideY + 1;
-
-if (theano_prep_output(&%(out)s, 4, %(out)s_dims, %(input)s->ga.typecode,
-                       GA_C_ORDER, pygpu_default_context()) != 0) {
-  %(fail)s
-}
-
-%(set_out)s
-#ifndef CUDNN_VERSION
-err%(name)s = cudnnPoolingForward(
-_handle,
-%(desc)s,
-%(input_desc)s, PyGpuArray_DEV_DATA(%(input)s),
-%(output_desc)s, PyGpuArray_DEV_DATA(%(out)s)
-);
-#else
-{
-const float alpha = 1;
-const float beta = 0;
-err%(name)s = cudnnPoolingForward(
-_handle,
-%(desc)s,
-&alpha,
-%(input_desc)s, PyGpuArray_DEV_DATA(%(input)s),
-&beta,
-%(output_desc)s, PyGpuArray_DEV_DATA(%(out)s)
-);
-}
-#endif
-if (err%(name)s != CUDNN_STATUS_SUCCESS) {
-  PyErr_Format(PyExc_RuntimeError,
-               "GpuDnnPool: error doing cudnnPoolingForward operation: %%s",
-               cudnnGetErrorString(err%(name)s));
-  %(fail)s
-}
-""" % dict(out=out, desc=desc, fail=sub['fail'],
-           name=name, set_in=set_in,
-           set_out=set_out, input=inputs[0],
-           input_desc="input" + name,
-           output_desc="output" + name)
+        w = desc.ws
+        s = desc.stride
+        p = desc.pad
+        res = [shape[0][0], shape[0][1],
+               (shape[0][2] + 2 * p[0] - w[0]) // s[0] + 1,
+               (shape[0][3] + 2 * p[1] - w[1]) // s[1] + 1
+               ]
+        if len(w) > 2:
+            res.append((shape[0][4] + 2 * p[2] - w[2]) // s[2] + 1)
+        return [res]

    def grad(self, inp, grads):
        img, desc = inp
@@ -1058,9 +992,6 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
        # not connected to desc
        return [[1], [0]]

-    def c_code_cache_version(self):
-        return (7, version())
-

 class GpuDnnPoolGrad(DnnBase):
    """
@@ -1081,167 +1012,32 @@ class GpuDnnPoolGrad(DnnBase):

    __props__ = ()

-    def make_node(self, inp, out, inp_grad, desc):
-        inp = as_gpuarray_variable(inp)
-        if inp.type.ndim != 4:
-            raise TypeError('inp must be 4D tensor')
-
-        inp_grad = as_gpuarray_variable(inp_grad)
-        if inp_grad.type.ndim != 4:
-            raise TypeError('inp_grad must be 4D tensor')
+    def __init__(self):
+        DnnBase.__init__(self, ["dnn_pool_grad.c"],
+                         "APPLY_SPECIFIC(dnn_pool_grad)")

+    def make_node(self, inp, out, out_grad, desc):
+        inp = as_gpuarray_variable(inp)
+        out_grad = as_gpuarray_variable(out_grad)
        out = as_gpuarray_variable(out)
-        if out.type.ndim != 4:
-            raise TypeError('out must be 4D tensor')
-
-        if not isinstance(desc.type, CDataType) \
-                or desc.type.ctype != 'cudnnPoolingDescriptor_t':
-            raise TypeError('desc must be cudnnPoolingDescriptor_t')
-
-        return Apply(self, [inp, out, inp_grad, desc],
-                     [inp.type()])
-
-    def c_support_code_struct(self, node, name):
-        return """
-cudnnTensorDescriptor_t input%(name)s;
-cudnnTensorDescriptor_t input_grad%(name)s;
-cudnnTensorDescriptor_t output%(name)s;
-cudnnTensorDescriptor_t output_grad%(name)s;
-""" % dict(name=name)
-
-    def c_init_code_struct(self, node, name, sub):
-        return """
-cudnnStatus_t err%(name)s;
-input%(name)s = NULL;
-input_grad%(name)s = NULL;
-output%(name)s = NULL;
-output_grad%(name)s = NULL;
-if ((err%(name)s = cudnnCreateTensorDescriptor(&input%(name)s)) != CUDNN_STATUS_SUCCESS) {
-  PyErr_Format(PyExc_MemoryError,
-               "GpuDnnPoolGrad: could not allocate tensor4d descriptor "
-               "(input): %%s", cudnnGetErrorString(err%(name)s));
-  %(fail)s
-}
-if ((err%(name)s = cudnnCreateTensorDescriptor(&input_grad%(name)s)) != CUDNN_STATUS_SUCCESS) {
-  PyErr_Format(PyExc_MemoryError,
-               "GpuDnnPoolGrad: could not allocate tensor4d descriptor "
-               "(input_grad): %%s", cudnnGetErrorString(err%(name)s));
-  %(fail)s
-}
-if ((err%(name)s = cudnnCreateTensorDescriptor(&output%(name)s)) != CUDNN_STATUS_SUCCESS) {
-  PyErr_Format(PyExc_MemoryError,
-               "GpuDnnPoolGrad: could not allocate tensor4d descriptor "
-               "(output): %%s", cudnnGetErrorString(err%(name)s));
-  %(fail)s
-}
-if ((err%(name)s = cudnnCreateTensorDescriptor(&output_grad%(name)s)) != CUDNN_STATUS_SUCCESS) {
-  PyErr_Format(PyExc_MemoryError,
-               "GpuDnnPoolGrad: could not allocate tensor4d descriptor "
-               "(output_grad): %%s", cudnnGetErrorString(err%(name)s));
-  %(fail)s
-}
-""" % dict(name=name, fail=sub['fail'])
-
-    def c_cleanup_code_struct(self, node, name):
-        return """
-if (input%(name)s != NULL) { cudnnDestroyTensorDescriptor(input%(name)s); }
-if (input_grad%(name)s != NULL) { cudnnDestroyTensorDescriptor(input_grad%(name)s); }
-if (output%(name)s != NULL) { cudnnDestroyTensorDescriptor(output%(name)s); }
-if (output_grad%(name)s != NULL) { cudnnDestroyTensorDescriptor(output_grad%(name)s); }
-""" % dict(name=name)

-    def c_code(self, node, name, inputs, outputs, sub):
-        # Here the name out and inp are based on the cudnn definition.
-        # Not the definition of this class.
-        # This make it complicated.
-        out, inp, inp_grad, desc = inputs
-        out_grad, = outputs
-
-        set_in = "\n".join([
-            c_set_tensor4d(inp, "input" + name,
-                           'err' + name, sub['fail']),
-            c_set_tensor4d(inp_grad, "input_grad" + name,
-                           'err' + name, sub['fail']),
-            c_set_tensor4d(out, "output" + name,
-                           'err' + name, sub['fail'])
-        ])
-
-        set_out = c_set_tensor4d(out, "output_grad" + name,
-                                 'err' + name, sub['fail'])
-
-        return """
-cudnnStatus_t err%(name)s;
-
-if (!GpuArray_IS_C_CONTIGUOUS(&%(input)s->ga)) {
-  PyErr_SetString(PyExc_ValueError,
-                  "GpuDnnPoolGrad: Only contiguous inputs are supported.");
-  %(fail)s
-}
-
-if (!GpuArray_IS_C_CONTIGUOUS(&%(input_grad)s->ga)) {
-  PyErr_SetString(PyExc_ValueError,
-                  "GpuDnnPoolGrad: Only contiguous input gradients are supported.");
-  %(fail)s
-}
+        if desc.owner is not None:
+            nd = desc.owner.op.get_ndim() + 2

-if (!GpuArray_IS_C_CONTIGUOUS(&%(output)s->ga)) {
-  PyErr_SetString(PyExc_ValueError,
-                  "GpuDnnPoolGrad: Only contiguous outputs are supported.");
-  %(fail)s
-}
+            if inp.type.ndim != nd:
+                raise TypeError('inp must be %dD tensor' % (nd,))

-%(set_in)s
+            if out_grad.type.ndim != nd:
+                raise TypeError('out_grad must be %dD tensor' % (nd,))

-if (theano_prep_output(&%(output_grad)s, PyGpuArray_NDIM(%(output)s),
-                       PyGpuArray_DIMS(%(output)s), %(output)s->ga.typecode,
-                       GA_C_ORDER, pygpu_default_context()) != 0)
-{
-  %(fail)s
-}
+            if out.type.ndim != nd:
+                raise TypeError('out must be %dD tensor' % (nd,))

-%(set_out)s
-#ifndef CUDNN_VERSION
-err%(name)s = cudnnPoolingBackward(
-_handle,
-%(desc)s,
-%(input_desc)s, PyGpuArray_DEV_DATA(%(input)s),
-%(input_grad_desc)s, PyGpuArray_DEV_DATA(%(input_grad)s),
-%(output_desc)s, PyGpuArray_DEV_DATA(%(output)s),
-%(output_grad_desc)s, PyGpuArray_DEV_DATA(%(output_grad)s)
-);
-#else
-{
-const float alpha = 1;
-const float beta = 0;
-err%(name)s = cudnnPoolingBackward(
-_handle,
-%(desc)s,
-&alpha,
-%(input_desc)s, PyGpuArray_DEV_DATA(%(input)s),
-%(input_grad_desc)s, PyGpuArray_DEV_DATA(%(input_grad)s),
-%(output_desc)s, PyGpuArray_DEV_DATA(%(output)s),
-&beta,
-%(output_grad_desc)s, PyGpuArray_DEV_DATA(%(output_grad)s)
-);
-}
-#endif
-if (err%(name)s != CUDNN_STATUS_SUCCESS) {
-  PyErr_Format(PyExc_RuntimeError,
-               "GpuDnnPoolGrad: error doing operation: %%s.",
-               cudnnGetErrorString(err%(name)s));
-  %(fail)s
-}
-""" % dict(output_grad=out_grad, desc=desc,
-           fail=sub['fail'],
-           name=name, set_in=set_in,
-           set_out=set_out, input=inp, input_grad=inp_grad, output=out,
-           input_desc="input" + name,
-           input_grad_desc="input_grad" + name,
-           output_desc="output" + name,
-           output_grad_desc="output_grad" + name)
+        if (not isinstance(desc.type, CDataType) or
+                desc.type.ctype != 'cudnnPoolingDescriptor_t'):
+            raise TypeError('desc must be cudnnPoolingDescriptor_t')

-    def c_code_cache_version(self):
-        return (5, version())
+        return Apply(self, [inp, out, out_grad, desc], [inp.type()])

    def infer_shape(self, node, shape):
        return [shape[0]]
@@ -1254,19 +1050,20 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
    The memory layout to use is 'bc01', that is 'batch', 'channel',
    'first dim', 'second dim' in that order.

+    `ws`, `stride` and `pad` must have the same length.
+
    Parameters
    ----------
    img
        Images to do the pooling over.
-    ws
+    ws : tuple
        Subsampling window size.
-    stride
+    stride : tuple
        Subsampling stride (default: (1, 1)).
    mode : {'max', 'average_inc_pad', 'average_exc_pad'}
-    pad
-        (padX, padY) padding information.
-        padX is the size of the left and right borders,
-        padY is the size of the top and bottom borders.
+    pad : tuple
+        (padX, padY) or (padX, padY, padZ)
+        default: (0, 0)

    .. warning:: The cuDNN library only works with GPU that have a compute
        capability of 3.0 or higer.  This means that older GPU will not
@@ -1288,8 +1085,6 @@ class GpuDnnSoftmaxBase(DnnBase):

    Parameters
    ----------
-    tensor_format
-        Whether the data format is 'bc01' or 'b01c'.
    algo
        'fast' or 'accurate' indicating whether computations should be
        optimized for speed or accuracy respectively.
@@ -1300,149 +1095,45 @@ class GpuDnnSoftmaxBase(DnnBase):

    """

-    __props__ = ('tensor_format', 'mode', 'algo')
+    __props__ = ('mode', 'algo')

-    def __init__(self, tensor_format, algo, mode):
-        assert(tensor_format in ('bc01', 'b01c'))
-        DnnBase.__init__(self)
-        self.tensor_format = tensor_format
+    def __init__(self, algo, mode):
+        DnnBase.__init__(self, [self.file], self.c_func)

-        assert(algo in ('fast', 'accurate'))
+        assert(algo in ('fast', 'accurate', 'log'))
+        if algo == 'log' and version() < 3000:
+            raise RuntimeError("Need CuDNN v3 for log-softmax")
        self.algo = algo

        assert(mode in ('instance', 'channel'))
        self.mode = mode

-        self.tensor_4d_descs = [softmax_input
-                                for softmax_input in self.softmax_inputs]
-        self.tensor_4d_descs.append('softmax_output')
-
    def infer_shape(self, node, shape):
        if self.direction == 'forward':
            return [shape[0]]
        else:
            return [shape[1]]

-    def _define_tensor4d_desc(self, name, id):
-        return """
-cudnnTensorDescriptor_t %(id)s_%(name)s;
-""" % dict(name=name, id=id)
-
-    def _init_tensor4d_desc(self, name, id, fail):
-        return """
-%(id)s_%(name)s = NULL;
-if ((err%(name)s = cudnnCreateTensorDescriptor(&%(id)s_%(name)s)) != CUDNN_STATUS_SUCCESS) {
-  PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
-               ": %%s", cudnnGetErrorString(err%(name)s));
-  %(fail)s
-}
-""" % dict(name=name, id=id, fail=fail)
-
-    def _clean_tensor4d_desc(self, name, id):
-        return """
-if(%(id)s_%(name)s!= NULL)
-  cudnnDestroyTensorDescriptor(%(id)s_%(name)s);
-""" % dict(name=name, id=id)
-
-    def c_support_code_struct(self, node, name):
-        result = ''
-        for id in self.tensor_4d_descs:
-            result += self._define_tensor4d_desc(name, id)
-        return result
-
-    def c_init_code_struct(self, node, name, sub):
-        result = """
-cudnnStatus_t err%(name)s;
-""" % dict(name=name)
-
-        for id in self.tensor_4d_descs:
-            result += self._init_tensor4d_desc(name, id, sub['fail'])
-        return result
-
-    def c_cleanup_code_struct(self, node, name):
-        result = ''
-        for id in self.tensor_4d_descs:
-            result += self._clean_tensor4d_desc(name, id)
-        return result
-
-    def c_code(self, node, name, inputs, outputs, sub):
-        ins = inputs
-        outs, = outputs
-
-        if self.tensor_format == 'b01c':
-            tensor_format = 1
-        else:
-            tensor_format = 0
-
+    def get_op_params(self):
        if self.mode == 'instance':
-            mode = 1
+            mode = "CUDNN_SOFTMAX_MODE_INSTANCE"
        else:
-            mode = 0
+            mode = "CUDNN_SOFTMAX_MODE_CHANNEL"

        if self.algo == 'fast':
-            algo = 1
+            algo = "CUDNN_SOFTMAX_FAST"
+        elif self.algo == 'log':
+            algo = "CUDNN_SOFTMAX_LOG"
        else:
-            algo = 0
-
-        # Setup configuration variables.
-        result = """
-cudnnStatus_t err%(name)s;
-cudnnTensorFormat_t format%(name)s = CUDNN_TENSOR_NCHW;
-if (%(tensor_format)d == 1)
-  format%(name)s = CUDNN_TENSOR_NHWC;
-
-cudnnSoftmaxAlgorithm_t algo%(name)s = CUDNN_SOFTMAX_ACCURATE;
-if (%(algo)d == 1)
-  algo%(name)s = CUDNN_SOFTMAX_FAST;
-
-cudnnSoftmaxMode_t mode%(name)s = CUDNN_SOFTMAX_MODE_CHANNEL;
-if (%(mode)d == 1)
-  mode%(name)s = CUDNN_SOFTMAX_MODE_INSTANCE;
-""" % dict(name=name, tensor_format=tensor_format, mode=mode, algo=algo)
-
-        # Validate the input and build the input variables.
-        for input_idx, input_name in enumerate(self.softmax_inputs):
-            result += c_set_tensor4d(ins[input_idx], input_name + "_" + name,
-                                     "err" + name, sub['fail'])
-
-        subs = dict(ins=ins[-1], outs=outs, fail=sub['fail'],
-                    name=name)
-
-        for idx, softmax_input in enumerate(self.softmax_inputs):
-            subs['name%d' % idx] = softmax_input
-            subs['ins%d' % idx] = inputs[idx]
-
-        # Build and prepare the output variable.
-        result += """
-if (theano_prep_output(&%(outs)s, PyGpuArray_NDIM(%(ins)s),
-                       PyGpuArray_DIMS(%(ins)s), %(ins)s->ga.typecode,
-                       GA_C_ORDER, pygpu_default_context()) != 0)
-{
-  %(fail)s
-}
-""" % subs
-        result += c_set_tensor4d(outs,
-                                 "softmax_output_" + name,
-                                 "err" + name, sub['fail'])
-
-        # Add on a call to the method that does the actual work.
-        result += self.method() % subs
-
-        return result
-
-    def c_code_cache_version(self):
-        return (0, 7, version())
+            algo = "CUDNN_SOFTMAX_ACCURATE"

-    def method(self):
-        raise NotImplementedError('GpuDnnSoftmaxBase::method')
+        return [("SOFTMAX_MODE", mode), ("SOFTMAX_ALGO", algo)]


 class GpuDnnSoftmax(GpuDnnSoftmaxBase):
    """
    Op for the cuDNN Softmax.

-    tensor_format
-        Whether the data format is 'bc01' or 'b01c'.
    algo
        'fast' or 'accurate' indicating whether computations should be
        optimized for speed or accuracy respectively.
@@ -1452,55 +1143,23 @@ class GpuDnnSoftmax(GpuDnnSoftmaxBase):
        image across 'c'.

    """
-
-    direction = 'forward'
-    softmax_inputs = ['softmax_input']
+    direction = "forward"
+    file = "dnn_softmax.c"
+    c_func = "APPLY_SPECIFIC(softmax)"

    def make_node(self, x):
        x = as_gpuarray_variable(x)
        assert x.ndim == 4
        return Apply(self, [x], [x.type()])

-    def method(self):
-        return """
-#ifndef CUDNN_VERSION
-err%(name)s = cudnnSoftmaxForward(
-  _handle,
-  algo%(name)s,
-  mode%(name)s,
-  softmax_input_%(name)s,
-  PyGpuArray_DEV_DATA(%(ins)s),
-  softmax_output_%(name)s,
-  PyGpuArray_DEV_DATA(%(outs)s)
-);
-#else
-{
-const float alpha = 1.;
-const float beta = 0.;
-err%(name)s = cudnnSoftmaxForward(
-  _handle,
-  algo%(name)s,
-  mode%(name)s,
-  (void*) &alpha,
-  softmax_input_%(name)s,
-  PyGpuArray_DEV_DATA(%(ins)s),
-  (void*) &beta,
-  softmax_output_%(name)s,
-  PyGpuArray_DEV_DATA(%(outs)s)
-);
-}
-#endif
-"""
-
    def grad(self, inp, grads):
        x, = inp
        g_sm, = grads
        sm = self.make_node(x).outputs[0]
        return [GpuDnnSoftmaxGrad(
-            self.tensor_format,
-            self.algo,
-            self.mode
-        )(g_sm, sm)]
+                self.algo,
+                self.mode
+                )(g_sm, sm)]


 class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
@@ -1509,8 +1168,6 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):

    Parameters
    ----------
-    tensor_format
-        Whether the data format is 'bc01' or 'b01c'.
    algo
        'fast' or 'accurate' indicating whether computations should be
        optimized for speed or accuracy respectively.
@@ -1521,7 +1178,8 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):

    """
    direction = 'backward'
-    softmax_inputs = ['softmax_gout', 'softmax_input']
+    file = "dnn_softmax_grad.c"
+    c_func = "APPLY_SPECIFIC(softmax_grad)"

    def make_node(self, dy, sm):
        dy = as_gpuarray_variable(dy)
@@ -1530,41 +1188,6 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
        assert sm.ndim == 4
        return Apply(self, [dy, sm], [sm.type()])

-    def method(self):
-        return """
-#ifndef CUDNN_VERSION
-err%(name)s = cudnnSoftmaxBackward(
-  _handle,
-  algo%(name)s,
-  mode%(name)s,
-  %(name1)s_%(name)s,
-  PyGpuArray_DEV_DATA(%(ins1)s),
-  %(name0)s_%(name)s,
-  PyGpuArray_DEV_DATA(%(ins0)s),
-  softmax_output_%(name)s,
-  PyGpuArray_DEV_DATA(%(outs)s)
-);
-#else
-{
-const float alpha = 1.;
-const float beta = 0.;
-err%(name)s = cudnnSoftmaxBackward(
-  _handle,
-  algo%(name)s,
-  mode%(name)s,
-  (void*) &alpha,
-  %(name1)s_%(name)s,
-  PyGpuArray_DEV_DATA(%(ins1)s),
-  %(name0)s_%(name)s,
-  PyGpuArray_DEV_DATA(%(ins0)s),
-  (void*) &beta,
-  softmax_output_%(name)s,
-  PyGpuArray_DEV_DATA(%(outs)s)
-);
-}
-#endif
-        """
-

 # @register_opt('cudnn')  # this optimizer is registered in opt.py instead.
 @local_optimizer([GpuConv])
@@ -1612,9 +1235,6 @@ def local_conv_dnn_alternative(node):
        rval = dnn_conv(img, kern,
                        border_mode=border_mode, subsample=subsample,
                        direction_hint=direction_hint)
-        if node.outputs[0].broadcastable != rval.broadcastable:
-            rval = tensor.patternbroadcast(
-                rval, node.outputs[0].type.broadcastable)
        return [rval]


@@ -1632,7 +1252,7 @@ def local_dnn_conv_inplace(node):
            isinstance(dest.owner.op, GpuAllocEmpty) and
            len(dest.clients) > 1):
        inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs)
-    return [GpuDnnConv(workmem=node.op.workmem, inplace=True)(*inputs)]
+    return [GpuDnnConv(algo=node.op.algo, inplace=True)(*inputs)]


 @local_optimizer([GpuDnnConvGradW], inplace=True)
@@ -1645,7 +1265,7 @@ def local_dnn_convgw_inplace(node):
            isinstance(dest.owner.op, GpuAllocEmpty) and
            len(dest.clients) > 1):
        inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs)
-    return [GpuDnnConvGradW(inplace=True)(*inputs)]
+    return [GpuDnnConvGradW(algo=node.op.algo, inplace=True)(*inputs)]


 @local_optimizer([GpuDnnConvGradI], inplace=True)
@@ -1658,7 +1278,7 @@ def local_dnn_convgi_inplace(node):
            isinstance(dest.owner.op, GpuAllocEmpty) and
            len(dest.clients) > 1):
        inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs)
-    return [GpuDnnConvGradI(inplace=True)(*inputs)]
+    return [GpuDnnConvGradI(algo=node.op.algo, inplace=True)(*inputs)]

 optdb.register('local_dnna_conv_inplace',
               tensor.opt.in2out(local_dnn_conv_inplace,
@@ -1671,46 +1291,40 @@ optdb.register('local_dnna_conv_inplace',
 @register_opt('cudnn')
 @alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5, nd=4)
 def local_dnn_conv_alpha_merge(node, *inputs):
-    if not dnn_available() or version() == -1:
-        return None
-    return [GpuDnnConv(workmem=node.op.workmem)(*inputs)]
+    return [GpuDnnConv(algo=node.op.algo)(*inputs)]


 @register_opt('cudnn')
 @alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, nd=4)
 def local_dnn_convw_alpha_merge(node, *inputs):
-    if not dnn_available() or version() == -1:
-        return None
-    return [GpuDnnConvGradW()(*inputs)]
+    return [GpuDnnConvGradW(algo=node.op.algo)(*inputs)]


 @register_opt('cudnn')
 @alpha_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, nd=4)
 def local_dnn_convi_alpha_merge(node, *inputs):
-    if not dnn_available() or version() == -1:
-        return None
-    return [GpuDnnConvGradI()(*inputs)]
+    return [GpuDnnConvGradI(algo=node.op.algo)(*inputs)]


 @register_opt('cudnn')
 @output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2, nd=4)
 def local_dnn_conv_output_merge(node, *inputs):
    inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
-    return [GpuDnnConv(workmem=node.op.workmem)(*inputs)]
+    return [GpuDnnConv(algo=node.op.algo)(*inputs)]


 @register_opt('cudnn')
 @output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2, nd=4)
 def local_dnn_convw_output_merge(node, *inputs):
    inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
-    return [GpuDnnConvGradW()(*inputs)]
+    return [GpuDnnConvGradW(algo=node.op.algo)(*inputs)]


 @register_opt('cudnn')
 @output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2, nd=4)
 def local_dnn_convi_output_merge(node, *inputs):
    inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
-    return [GpuDnnConvGradI()(*inputs)]
+    return [GpuDnnConvGradI(algo=node.op.algo)(*inputs)]


 @register_opt('cudnn')
@@ -1736,7 +1350,7 @@ def local_pool_dnn_grad_stride(node):
        return
    if not node.op.ignore_border:
        return
-    inp, out, inp_grad = node.inputs
+    inp, out, out_grad = node.inputs
    ds = node.op.ds
    st = node.op.st
    pad = node.op.padding
@@ -1745,7 +1359,7 @@ def local_pool_dnn_grad_stride(node):
    desc = GpuDnnPoolDesc(ws=ds, stride=st, mode=mode, pad=pad)()
    return GpuDnnPoolGrad()(gpu_contiguous(inp),
                            gpu_contiguous(out),
-                            gpu_contiguous(inp_grad),
+                            gpu_contiguous(out_grad),
                            desc)


@@ -1756,18 +1370,19 @@ def local_avg_pool_dnn_grad_stride(node):
        return
    if not node.op.ignore_border:
        return
-    inp, inp_grad = node.inputs
+    inp, out_grad = node.inputs
    ds = node.op.ds
    st = node.op.st
    pad = node.op.padding
    mode = node.op.mode

+    cg = gpu_contiguous(out_grad)
+
    desc = GpuDnnPoolDesc(ws=ds, stride=st, mode=mode, pad=pad)()
-    contiguous_inp_grad = gpu_contiguous(inp_grad)
-    return GpuDnnPoolGrad()(gpu_contiguous(inp),
-                            contiguous_inp_grad,
-                            contiguous_inp_grad,
-                            desc)
+    # We reuse cg because CuDNN does not use the value of the `out`
+    # argument but still checks its shape for average pooling. This
+    # has been observed in v2 and v3 as far as I know.
+    return GpuDnnPoolGrad()(gpu_contiguous(inp), cg, cg, desc)


 @register_opt('cudnn')
@@ -1778,11 +1393,27 @@ def local_softmax_dnn(node):
    if isinstance(node.op, GpuSoftmax):
        ins = node.inputs[0].dimshuffle(0, 1, 'x', 'x')
        ins = gpu_contiguous(ins)
-        out = GpuDnnSoftmax('bc01', 'accurate', 'channel')(ins)
+        out = GpuDnnSoftmax('accurate', 'channel')(ins)
        out = as_gpuarray_variable(out.dimshuffle(0, 1))
        return [out]


+@register_opt('cudnn')
+@local_optimizer([GpuElemwise])
+def local_log_softmax_dnn(node):
+    if not dnn_available() or version() < 3000:
+        # No log-softmax before cudnn v3
+        return
+    if (isinstance(node.op, GpuElemwise) and
+            isinstance(node.op.scalar_op, Log) and
+            node.inputs[0].owner and
+            isinstance(node.inputs[0].owner.op, GpuDnnSoftmax) and
+            len(node.inputs[0].clients) == 1):
+        softmax_node = node.inputs[0].owner
+        new_softmax = GpuDnnSoftmax('log', softmax_node.op.mode)
+        return [new_softmax(softmax_node.inputs[0])]
+
+
 class NoCuDNNRaise(Optimizer):
    def apply(self, fgraph):
        """
@@ -1813,6 +1444,6 @@ def local_softmax_dnn_grad(node):
            return
        ins.append(n.dimshuffle(0, 1, 'x', 'x'))

-    out = GpuDnnSoftmaxGrad('bc01', 'accurate', 'channel')(
+    out = GpuDnnSoftmaxGrad('accurate', 'channel')(
        gpu_contiguous(ins[0]), gpu_contiguous(ins[1]))
    return [out.dimshuffle(0, 1)]
--- a/theano/sandbox/gpuarray/dnn_base.c
+++ b/theano/sandbox/gpuarray/dnn_base.c
 #section support_code
-static cudnnHandle_t _handle = NULL;

 static int
-c_set_tensor4d(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) {
+c_set_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) {
  cudnnDataType_t dt;
  size_t ds;
  switch (var->ga.typecode) {
@@ -12,26 +11,37 @@ c_set_tensor4d(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) {
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
+#if CUDNN_VERSION > 3000
+  case GA_HALF:
+    dt = CUDNN_DATA_HALF;
+    break;
+#endif
  default:
-    PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_tensor4d");
+    PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_tensorNd");
    return -1;
  }
  ds = gpuarray_get_elsize(var->ga.typecode);

-  int str0, str1, str2, str3;
-  // cudnn do not like 0s in strides
-  str3 = PyGpuArray_STRIDES(var)[3]?PyGpuArray_STRIDES(var)[3]/ds:1;
-  str2 = PyGpuArray_STRIDES(var)[2]?PyGpuArray_STRIDES(var)[2]/ds:PyGpuArray_DIMS(var)[3];
-  str1 = PyGpuArray_STRIDES(var)[1]?PyGpuArray_STRIDES(var)[1]/ds:PyGpuArray_DIMS(var)[2]*PyGpuArray_DIMS(var)[3];
-  str0 = PyGpuArray_STRIDES(var)[0]?PyGpuArray_STRIDES(var)[0]/ds:PyGpuArray_DIMS(var)[2]*PyGpuArray_DIMS(var)[3]*PyGpuArray_DIMS(var)[1];
-  cudnnStatus_t err = cudnnSetTensor4dDescriptorEx(
-    desc, dt,
-    PyGpuArray_DIM(var, 0), PyGpuArray_DIM(var, 1),
-    PyGpuArray_DIM(var, 2), PyGpuArray_DIM(var, 3),
-    str0, str1, str2, str3);
+  int strs[5], dims[5], default_stride = 1;
+  unsigned int nd = PyGpuArray_NDIM(var);
+
+  if (nd > 5) {
+    PyErr_SetString(PyExc_TypeError, "Tensor of more than 5d");
+    return -1;
+  }
+
+  for (unsigned int _i = nd; _i > 0; _i--) {
+    unsigned int i = _i - 1;
+    strs[i] = PyGpuArray_STRIDE(var, i) ?
+      PyGpuArray_STRIDE(var, i)/ds : default_stride;
+    default_stride *= PyGpuArray_DIM(var, i);
+    dims[i] = PyGpuArray_DIM(var, i);
+  }
+
+  cudnnStatus_t err = cudnnSetTensorNdDescriptor(desc, dt, nd, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
-		 "Could not set tensor4d descriptor: %s",
+		 "Could not set tensorNd descriptor: %s",
 		 cudnnGetErrorString(err));
    return -1;
  }
@@ -53,14 +63,30 @@ c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) {
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
+#if CUDNN_VERSION > 3000
+  case GA_HALF:
+    dt = CUDNN_DATA_HALF;
+    break;
+#endif
  default:
    PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_filter");
    return -1;
  }
-  cudnnStatus_t err = cudnnSetFilter4dDescriptor(
-    desc, dt,
-    PyGpuArray_DIMS(var)[0], PyGpuArray_DIMS(var)[1],
-    PyGpuArray_DIMS(var)[2], PyGpuArray_DIMS(var)[3]);
+
+  int dims[5];
+  unsigned int nd = PyGpuArray_NDIM(var);
+
+  if (nd > 5) {
+    PyErr_SetString(PyExc_TypeError, "Tensor of more than 5d");
+    return -1;
+  }
+
+  for (unsigned int _i = nd; _i > 0; _i--) {
+    unsigned int i = _i - 1;
+    dims[i] = PyGpuArray_DIM(var, i);
+  }
+
+  cudnnStatus_t err = cudnnSetFilterNdDescriptor(desc, dt, nd, dims);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
 		 "Could not set filter descriptor: %s.",
@@ -72,15 +98,23 @@ c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) {

 #section init_code

+setup_ext_cuda();
+
+#section support_code_struct
+
+cudnnHandle_t APPLY_SPECIFIC(_handle);
+
+#section init_code_struct
+
 {
+  cuda_enter(pygpu_default_context()->ctx);
  cudnnStatus_t err;
-  if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
+  APPLY_SPECIFIC(_handle) = NULL;
+  if ((err = cudnnCreate(&APPLY_SPECIFIC(_handle))) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "could not create cuDNN handle: %s",
-		 cudnnGetErrorString(err));
-#if PY_MAJOR_VERSION >= 3
-    return NULL;
-#else
-    return;
-#endif
+                 cudnnGetErrorString(err));
+    cuda_exit(pygpu_default_context()->ctx);
+    FAIL;
  }
+  cuda_exit(pygpu_default_context()->ctx);
 }
--- a/theano/sandbox/gpuarray/dnn_conv_base.c
+++ b/theano/sandbox/gpuarray/dnn_conv_base.c
@@ -10,12 +10,12 @@ APPLY_SPECIFIC(input) = NULL;
 APPLY_SPECIFIC(output) = NULL;
 APPLY_SPECIFIC(kerns) = NULL;
 if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
-  PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
+  PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
 	       "(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
 }
 if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
-  PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
+  PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
               "(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
 }

--- a/theano/sandbox/gpuarray/dnn_fwd.c
+++ b/theano/sandbox/gpuarray/dnn_fwd.c
@@ -10,14 +10,15 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
  float af = alpha, bf = beta;
  void *alpha_p;
  void *beta_p;
+  PyGpuContextObject *c = pygpu_default_context();

  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
    PyErr_SetString(PyExc_ValueError,
-		    "GpuDnnConv images and kernel must have the same stack size");
+		    "images and kernel must have the same stack size");
    return 1;
  }

-  if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1)
+  if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
    return 1;
  if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
    return 1;
@@ -28,6 +29,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
    beta_p = (void *)&beta;
    break;
  case GA_FLOAT:
+  case GA_HALF:
    alpha_p = (void *)&af;
    beta_p = (void *)&bf;
    break;
@@ -42,56 +44,179 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
  Py_INCREF(*output);
 #else
  if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om),
-                         om->ga.typecode, GA_C_ORDER,
-                         pygpu_default_context()) != 0)
+                         om->ga.typecode, GA_C_ORDER, c) != 0)
    return 1;
  if (beta != 0.0 && pygpu_move(*output, om))
    return 1;
 #endif

-  if (c_set_tensor4d(*output, APPLY_SPECIFIC(output)) == -1)
+  if (c_set_tensorNd(*output, APPLY_SPECIFIC(output)) == -1)
    return 1;

+  cudnnConvolutionFwdAlgo_t algo = CONV_ALGO;
+
+  cuda_enter(c->ctx);
+#ifdef CHOOSE_ALGO
+  /* Static variables are only initialized once so this will not
+   * reset the previous algo every time */
+  static int reuse_algo = 0;
+  static cudnnConvolutionFwdAlgo_t prev_algo = CONV_ALGO;
+
+#ifndef CHOOSE_ONCE
+  static size_t prev_img_dims[5] = {0};
+  static size_t prev_kern_dims[5] = {0};
+
+  reuse_algo = 1;
+  for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
+    reuse_algo = (reuse_algo &&
+                  PyGpuArray_DIM(input, i) == prev_img_dims[i]);
+    reuse_algo = (reuse_algo &&
+                  PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
+  }
+#endif
+
+  if (!reuse_algo) {
+#ifdef CHOOSE_TIME
+    int count;
+    cudnnConvolutionFwdAlgoPerf_t choice;
+    err = cudnnFindConvolutionForwardAlgorithm(
+      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
+      desc, APPLY_SPECIFIC(output), 1, &count, &choice);
+
+    if (err != CUDNN_STATUS_SUCCESS) {
+      PyErr_Format(PyExc_RuntimeError,
+                   "error selecting convolution algo: %s",
+                   cudnnGetErrorString(err));
+      cuda_exit(c->ctx);
+      return 1;
+    }
+    algo = choice.algo;
+#else
+    size_t free = 0, total = 0;
+    cudaError_t err2 = cudaMemGetInfo(&free, &total);
+    if (err2 != cudaSuccess) {
+      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
+                   "memory information on the GPU: %s\n",
+                   cudaGetErrorString(err2));
+      cuda_exit(c->ctx);
+      return 1;
+    }
+
+    err = cudnnGetConvolutionForwardAlgorithm(
+      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
+      desc, APPLY_SPECIFIC(output),
+      CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo);
+    if (err != CUDNN_STATUS_SUCCESS) {
+      PyErr_Format(PyExc_RuntimeError,
+                   "error selecting convolution algo: %s",
+                   cudnnGetErrorString(err));
+      cuda_exit(c->ctx);
+      return 1;
+    }
+#endif
+    prev_algo = algo;
+  } else {
+    algo = prev_algo;
+  }
+
+#ifdef CHOOSE_ONCE
+  reuse_algo = 1;
+#else
+  for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
+    prev_img_dims[i] = PyGpuArray_DIM(input, i);
+    prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
+  }
+#endif
+
+#endif
+
+  /* These two algos are not supported for 3d conv */
+  if (PyGpuArray_NDIM(input) == 5 &&
+      (algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM ||
+       algo == CUDNN_CONVOLUTION_FWD_ALGO_GEMM))
+    algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
+
+#if CUDNN_VERSION > 3000
+  /* FFT guard: fall back to IMPLICIT_PRECOMP_GEMM when the conv is strided,
+   * a spatial dim of the input (axes 2 and 3; axis 1 is the stack size
+   * checked above) exceeds 1024, or the kernel is 1x1. */
+  if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) {
+    int nd, pad[2], stride[2], upscale[2];
+    cudnnConvolutionMode_t mode;
+    err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
+                                          upscale, &mode);
+    if (err != CUDNN_STATUS_SUCCESS) {
+      PyErr_Format(PyExc_RuntimeError,
+                   "error getting convolution properties: %s",
+                   cudnnGetErrorString(err));
+      cuda_exit(c->ctx);
+      return 1;
+    }
+
+    if (stride[0] != 1 || stride[1] != 1 ||
+        PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
+        (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1)) {
+      algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
+    }
+  }
+#endif
+
+#if CUDNN_VERSION < 3000
+  /* cuDNN before v3 does not support kernels larger than input even
+   * if appropriate padding is selected. */
+  for (unsigned int i = 2; i < PyGpuArray_NDIM(input); i++) {
+    if (PyGpuArray_DIM(kerns, i) > PyGpuArray_DIM(input, i)) {
+      PyErr_SetString(PyExc_RuntimeError, "the current version "
+                      "of CuDNN does not support kernels larger than the "
+                      "inputs in any spatial dimension, even if the inputs "
+                      "are padded such that the padded inputs are larger "
+                      "than the kernels. Update your installation of CuDNN "
+                      "to V3 or more recent to solve the issue.");
+      cuda_exit(c->ctx);
+      return 1;
+    }
+  }
+#endif
+
  {
    size_t worksize;
    gpudata *workspace;
-    PyGpuContextObject *c;
-
-    err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
+    err = cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle),
                                                  APPLY_SPECIFIC(input),
                                                  APPLY_SPECIFIC(kerns),
                                                  desc,
                                                  APPLY_SPECIFIC(output),
-                                                  CONV_ALGO,
+                                                  algo,
                                                  &worksize);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
-                   "GpuDnnConv: error getting worksize: %s",
+                   "error getting worksize: %s",
                   cudnnGetErrorString(err));
+      cuda_exit(c->ctx);
      return 1;
    }

-    /* 
+    /*
     * This is less than ideal since we need to free it after (which
     * introduces a synchronization point. But we don't have a module
     * to place a nice get_work_mem() function in.
     */
    if (worksize != 0) {
-      c = pygpu_default_context();
      workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL);
      if (workspace == NULL) {
        PyErr_SetString(PyExc_RuntimeError,
                        "Could not allocate working memory");
+        cuda_exit(c->ctx);
        return 1;
      }
    }

    err = cudnnConvolutionForward(
-      _handle,
+      APPLY_SPECIFIC(_handle),
      alpha_p,
      APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
      APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
-      desc, CONV_ALGO,
+      desc, algo,
      worksize == 0 ? NULL : *(void **)workspace, worksize,
      beta_p,
      APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output));
@@ -99,9 +224,10 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
    if (worksize != 0)
      c->ops->buffer_release(workspace);
  }
+  cuda_exit(c->ctx);

  if (err != CUDNN_STATUS_SUCCESS) {
-    PyErr_Format(PyExc_RuntimeError, "GpuDnnConv: error doing operation: %s",
+    PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
 		 cudnnGetErrorString(err));
    return 1;
  }

--- a/theano/sandbox/gpuarray/dnn_gi.c
+++ b/theano/sandbox/gpuarray/dnn_gi.c
@@ -9,14 +9,15 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
  float af = alpha, bf = beta;
  void *alpha_p;
  void *beta_p;
+  PyGpuContextObject *c = pygpu_default_context();

  if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1]) {
-    PyErr_SetString(PyExc_ValueError,
-		    "GpuDnnConv images and kernel must have the same stack size");
+    PyErr_SetString(PyExc_ValueError, "images and kernel must have the same "
+                    "stack size");
    return 1;
  }

-  if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1)
+  if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
    return 1;
  if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
    return 1;
@@ -27,6 +28,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
    beta_p = (void *)&beta;
    break;
  case GA_FLOAT:
+  case GA_HALF:
    alpha_p = (void *)&af;
    beta_p = (void *)&bf;
    break;
@@ -41,26 +43,156 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
  Py_INCREF(*input);
 #else
  if (theano_prep_output(input, PyGpuArray_NDIM(im), PyGpuArray_DIMS(im),
-                         im->ga.typecode, GA_C_ORDER,
-                         pygpu_default_context()) != 0)
+                         im->ga.typecode, GA_C_ORDER, c) != 0)
    return 1;
  if (beta != 0.0 && pygpu_move(*input, im))
    return 1;
 #endif

-  if (c_set_tensor4d(*input, APPLY_SPECIFIC(input)) == -1)
+  if (c_set_tensorNd(*input, APPLY_SPECIFIC(input)) == -1)
    return 1;

-  err = cudnnConvolutionBackwardData(
-    _handle,
+  cudnnConvolutionBwdDataAlgo_t algo = CONV_ALGO;
+
+  cuda_enter(c->ctx);
+
+#ifdef CHOOSE_ALGO
+  static int reuse_algo = 0;
+  static cudnnConvolutionBwdDataAlgo_t prev_algo = CONV_ALGO;
+
+#ifndef CHOOSE_ONCE
+  static size_t prev_kern_dims[5] = {0};
+  static size_t prev_top_dims[5] = {0};
+
+  reuse_algo = 1;
+  for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
+    reuse_algo = (reuse_algo &&
+                  PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
+    reuse_algo = (reuse_algo &&
+                  PyGpuArray_DIM(output, i) == prev_top_dims[i]);
+  }
+#endif
+
+  if (!reuse_algo) {
+#ifdef CHOOSE_TIME
+    int count;
+    cudnnConvolutionBwdDataAlgoPerf_t choice;
+    /* Descriptors go in the order (kerns, output, desc, input), matching
+     * the workspace-size query and the convolution call further down. */
+    err = cudnnFindConvolutionBackwardDataAlgorithm(
+      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
+      APPLY_SPECIFIC(input), 1, &count, &choice);
+    if (err != CUDNN_STATUS_SUCCESS) {
+      PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
+                   cudnnGetErrorString(err));
+      cuda_exit(c->ctx);
+      return 1;
+    }
+
+    algo = choice.algo;
+#else
+    size_t free = 0, total = 0;
+    cudaError_t err2 = cudaMemGetInfo(&free, &total);
+    if (err2 != cudaSuccess){
+      cudaGetLastError();
+      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the memory "
+                   "information on the GPU: %s\n", cudaGetErrorString(err2));
+      cuda_exit(c->ctx);
+      return 1;
+    }
+    /* Ask cuDNN for an algo whose workspace fits in the free memory. */
+    err = cudnnGetConvolutionBackwardDataAlgorithm(
+      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
+      desc, APPLY_SPECIFIC(input),
+      CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, free, &algo);
+    if (err != CUDNN_STATUS_SUCCESS) {
+      PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
+                   cudnnGetErrorString(err));
+      cuda_exit(c->ctx);
+      return 1;
+    }
+#endif
+    prev_algo = algo;
+  } else {
+    algo = prev_algo;
+  }
+
+#ifdef CHOOSE_ONCE
+  reuse_algo = 1;
+#else
+  for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
+    prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
+    prev_top_dims[i] = PyGpuArray_DIM(output, i);
+  }
+#endif
+
+#endif
+
+#if CUDNN_VERSION > 3000
+  /* FFT guard: fall back to ALGO_0 when the conv is strided, a spatial
+   * dim of *input (axes 2 and 3; axis 1 is the stack size checked above)
+   * exceeds 1024, or the kernel is 1x1. */
+  if (algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) {
+    int nd, pad[2], stride[2], upscale[2];
+    cudnnConvolutionMode_t mode;
+    err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
+                                          upscale, &mode);
+    if (err != CUDNN_STATUS_SUCCESS) {
+      PyErr_Format(PyExc_RuntimeError,
+                   "error getting convolution properties: %s",
+                   cudnnGetErrorString(err));
+      cuda_exit(c->ctx);
+      return 1;
+    }
+
+    if (stride[0] != 1 || stride[1] != 1 ||
+        PyGpuArray_DIM(*input, 2) > 1024 || PyGpuArray_DIM(*input, 3) > 1024 ||
+        (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1)) {
+      algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
+    }
+  }
+#endif
+
+  size_t worksize;
+  gpudata *workspace;
+
+  err = cudnnGetConvolutionBackwardDataWorkspaceSize(
+    APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
+    APPLY_SPECIFIC(input), algo, &worksize);
+
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
+                 cudnnGetErrorString(err));
+    cuda_exit(c->ctx);
+    return 1;
+  }
+
+  if (worksize != 0) {
+    workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL);
+    if (workspace == NULL) {
+      PyErr_SetString(PyExc_RuntimeError,
+                      "Could not allocate working memory");
+      cuda_exit(c->ctx);
+      return 1;
+    }
+  }
+
+  err = cudnnConvolutionBackwardData_v3(
+    APPLY_SPECIFIC(_handle),
    alpha_p,
    APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
    APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
-    desc,
+    desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize,
    beta_p,
    APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input));
+
+  if (worksize != 0)
+    c->ops->buffer_release(workspace);
+
+  cuda_exit(c->ctx);
+
  if (err != CUDNN_STATUS_SUCCESS) {
-    PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradI: error doing operation: %s",
+    PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
                 cudnnGetErrorString(err));
    return 1;
  }

--- a/theano/sandbox/gpuarray/dnn_gw.c
+++ b/theano/sandbox/gpuarray/dnn_gw.c
 #section support_code_struct

-int 
+int
 APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
                        PyGpuArrayObject *km,
                        cudnnConvolutionDescriptor_t desc,
@@ -9,6 +9,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
  float af = alpha, bf = beta;
  void *alpha_p;
  void *beta_p;
+  PyGpuContextObject *c = pygpu_default_context();

  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1]) {
    PyErr_SetString(PyExc_ValueError,
@@ -16,9 +17,9 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
    return 1;
  }

-  if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1)
+  if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
    return 1;
-  if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1)
+  if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
    return 1;

  switch (input->ga.typecode) {
@@ -27,6 +28,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
    beta_p = (void *)&beta;
    break;
  case GA_FLOAT:
+  case GA_HALF:
    alpha_p = (void *)&af;
    beta_p = (void *)&bf;
    break;
@@ -41,8 +43,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
  Py_INCREF(*kerns);
 #else
  if (theano_prep_output(kerns, PyGpuArray_NDIM(km), PyGpuArray_DIMS(km),
-                         km->ga.typecode, GA_C_ORDER,
-                         pygpu_default_context()) != 0)
+                         km->ga.typecode, GA_C_ORDER, c) != 0)
    return 1;
  if (beta != 0.0 && pygpu_move(*kerns, km))
    return 1;
@@ -51,16 +52,148 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
  if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1)
    return 1;

-  err = cudnnConvolutionBackwardFilter(
-    _handle,
+  cudnnConvolutionBwdFilterAlgo_t algo = CONV_ALGO;
+
+  cuda_enter(c->ctx);
+
+#ifdef CHOOSE_ALGO
+  static int reuse_algo = 0;
+  static cudnnConvolutionBwdFilterAlgo_t prev_algo = CONV_ALGO;
+
+#ifndef CHOOSE_ONCE
+  static size_t prev_img_dims[5] = {0};
+  static size_t prev_top_dims[5] = {0};
+
+  reuse_algo = 1;
+  for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
+    reuse_algo = (reuse_algo &&
+                  PyGpuArray_DIM(input, i) == prev_img_dims[i]);
+    reuse_algo = (reuse_algo &&
+                  PyGpuArray_DIM(output, i) == prev_top_dims[i]);
+  }
+#endif
+
+  if (!reuse_algo) {
+#ifdef CHOOSE_TIME
+    int count;
+    cudnnConvolutionBwdFilterAlgoPerf_t choice;
+
+    err = cudnnFindConvolutionBackwardFilterAlgorithm(
+      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
+      APPLY_SPECIFIC(kerns), 1, &count, &choice);
+
+    if (err != CUDNN_STATUS_SUCCESS) {
+      PyErr_Format(PyExc_RuntimeError,
+                   "error selecting convolution algo: %s",
+                   cudnnGetErrorString(err));
+      cuda_exit(c->ctx);
+      return 1;
+    }
+
+    algo = choice.algo;
+#else
+    size_t free = 0, total = 0;
+    cudaError_t err2 = cudaMemGetInfo(&free, &total);
+    if (err2 != cudaSuccess){
+      cudaGetLastError();
+      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the memory "
+                   "information on the GPU: %s\n", cudaGetErrorString(err2));
+      cuda_exit(c->ctx);
+      return 1;
+    }
+
+    err = cudnnGetConvolutionBackwardFilterAlgorithm(
+      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
+      desc, APPLY_SPECIFIC(kerns),
+      CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo);
+    if (err != CUDNN_STATUS_SUCCESS) {
+      PyErr_Format(PyExc_RuntimeError,
+                   "error selecting convolution algo: %s",
+                   cudnnGetErrorString(err));
+      cuda_exit(c->ctx);
+      return 1;
+    }
+#endif
+    prev_algo = algo;
+  } else {
+    algo = prev_algo;
+  }
+
+#ifdef CHOOSE_ONCE
+  reuse_algo = 1;
+#else
+  for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
+    prev_img_dims[i] = PyGpuArray_DIM(input, i);
+    prev_top_dims[i] = PyGpuArray_DIM(output, i);
+  }
+#endif
+
+#endif
+
+#if CUDNN_VERSION > 3000
+  /* FFT guard: fall back to ALGO_0 when the conv is strided, a spatial
+   * dim of the input (axes 2 and 3; axis 1 is the stack size checked
+   * above) exceeds 1024, or the kernel is 1x1. */
+  if (algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT) {
+    int nd, pad[2], stride[2], upscale[2];
+    cudnnConvolutionMode_t mode;
+    err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
+                                          upscale, &mode);
+    if (err != CUDNN_STATUS_SUCCESS) {
+      PyErr_Format(PyExc_RuntimeError,
+                   "error getting convolution properties: %s",
+                   cudnnGetErrorString(err));
+      cuda_exit(c->ctx);
+      return 1;
+    }
+
+    if (stride[0] != 1 || stride[1] != 1 ||
+        PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
+        (PyGpuArray_DIM(*kerns, 2) == 1 && PyGpuArray_DIM(*kerns, 3) == 1)) {
+      algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
+    }
+  }
+#endif
+
+  size_t worksize;
+  gpudata *workspace;
+
+  err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
+    APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
+    APPLY_SPECIFIC(kerns), algo, &worksize);
+
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
+                 cudnnGetErrorString(err));
+      cuda_exit(c->ctx);
+    return 1;
+  }
+
+  if (worksize != 0) {
+    workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL);
+    if (workspace == NULL) {
+      PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory");
+      cuda_exit(c->ctx);
+      return 1;
+    }
+  }
+
+  err = cudnnConvolutionBackwardFilter_v3(
+    APPLY_SPECIFIC(_handle),
    alpha_p,
    APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
    APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
-    desc,
+    desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize,
    beta_p,
    APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns));
+
+  if (worksize != 0)
+    c->ops->buffer_release(workspace);
+
+  cuda_exit(c->ctx);
+
  if (err != CUDNN_STATUS_SUCCESS) {
-    PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradW: error doing operation: %s",
+    PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
                 cudnnGetErrorString(err));
    return 1;
  }

--- a/theano/sandbox/gpuarray/dnn_pool.c
+++ b/theano/sandbox/gpuarray/dnn_pool.c
+#section support_code_struct
+
+/* Tensor descriptors for the pooling input and output; created in the
+ * init section and destroyed in the cleanup section below. */
+cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
+cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
+
+#section init_code_struct
+
+/* Start from NULL so the cleanup section can tell which descriptors were
+ * actually created if one of the allocations below fails. */
+cudnnStatus_t APPLY_SPECIFIC(err);
+APPLY_SPECIFIC(input) = NULL;
+APPLY_SPECIFIC(output) = NULL;
+
+if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
+  PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
+               "(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
+  FAIL;
+}
+if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
+  PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
+               "(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
+  FAIL;
+}
+
+#section cleanup_code_struct
+
+/* Destroy only the descriptors that were successfully created. */
+if (APPLY_SPECIFIC(input) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input)); }
+if (APPLY_SPECIFIC(output) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output)); }
+
+#section support_code_struct
+
+/*
+ * Run cudnnPoolingForward on img and store the result in *out.
+ *
+ * img:  input array; must be C-contiguous.
+ * desc: pooling descriptor; window, padding and stride are read back from
+ *       it to compute the output shape.
+ * out:  receives an array with the typecode of img and the pooled shape.
+ *
+ * Returns 0 on success and 1 on failure.
+ */
+int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
+                             cudnnPoolingDescriptor_t desc,
+                             PyGpuArrayObject **out) {
+  cudnnStatus_t err;
+  size_t dims[5];
+  PyGpuContextObject *c = pygpu_default_context();
+
+  if (!GpuArray_IS_C_CONTIGUOUS(&img->ga)) {
+    PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
+    return 1;
+  }
+
+  if (c_set_tensorNd(img, APPLY_SPECIFIC(input)) != 0)
+    return 1;
+
+  /* Window (w), padding (p) and stride (s); room for up to 3 pooled dims. */
+  cudnnPoolingMode_t mode;
+  int w[3];
+  int p[3];
+  int s[3];
+  int ndims;
+
+  err = cudnnGetPoolingNdDescriptor(desc, 3, &mode, &ndims, w, p, s);
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_RuntimeError,
+                 "error doing cudnnGetPoolingDescriptor operation: %s",
+                 cudnnGetErrorString(err));
+    return 1;
+  }
+
+  /* The first two axes are copied unchanged; each pooled axis becomes
+   * (dim + 2*pad - window) / stride + 1.
+   * NOTE(review): if PyGpuArray_DIM is unsigned (dims[] is size_t), a
+   * window larger than the padded input would wrap around here -- confirm
+   * that callers rule this out. */
+  dims[0] = PyGpuArray_DIM(img, 0);
+  dims[1] = PyGpuArray_DIM(img, 1);
+  dims[2] = (PyGpuArray_DIM(img, 2) + (p[0]*2) - w[0]) / s[0] + 1;
+  dims[3] = (PyGpuArray_DIM(img, 3) + (p[1]*2) - w[1]) / s[1] + 1;
+  if (ndims == 3)
+    dims[4] = (PyGpuArray_DIM(img, 4) + (p[2]*2) - w[2]) / s[2] + 1;
+
+  if (theano_prep_output(out, ndims+2, dims, img->ga.typecode,
+                         GA_C_ORDER, c) != 0)
+    return 1;
+
+  if (c_set_tensorNd(*out, APPLY_SPECIFIC(output)) != 0)
+    return 1;
+
+  {
+    /* alpha = 1 and beta = 0, in both float and double so the pointer
+     * passed to cuDNN can be picked from the array typecode. */
+    const float alphaf = 1;
+    const float betaf = 0;
+    const double alphad = 1;
+    const double betad = 0;
+    void *alpha, *beta;
+
+    switch (img->ga.typecode) {
+    case GA_DOUBLE:
+      alpha = (void *)&alphad;
+      beta = (void *)&betad;
+      break;
+    case GA_FLOAT:
+    case GA_HALF:
+      /* Half arrays use the float scaling factors. */
+      alpha = (void *)&alphaf;
+      beta = (void *)&betaf;
+      break;
+    default:
+      PyErr_SetString(PyExc_TypeError, "Unsupported type in pooling");
+      return 1;
+    }
+
+    /* The cuDNN call is bracketed by cuda_enter/cuda_exit on the context. */
+    cuda_enter(c->ctx);
+    err = cudnnPoolingForward(
+      APPLY_SPECIFIC(_handle), desc,
+      alpha,
+      APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(img),
+      beta,
+      APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*out));
+    cuda_exit(c->ctx);
+  }
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_RuntimeError,
+                 "GpuDnnPool: error doing cudnnPoolingForward operation: %s",
+                 cudnnGetErrorString(err));
+    return 1;
+  }
+  return 0;
+}
--- a/theano/sandbox/gpuarray/dnn_pool_grad.c
+++ b/theano/sandbox/gpuarray/dnn_pool_grad.c
+#section support_code_struct
+
+/* Tensor descriptors for the four arrays handed to cudnnPoolingBackward;
+ * created in the init section and destroyed in the cleanup section. */
+cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
+cudnnTensorDescriptor_t APPLY_SPECIFIC(input_grad);
+cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
+cudnnTensorDescriptor_t APPLY_SPECIFIC(output_grad);
+
+#section init_code_struct
+
+/* Start from NULL so the cleanup section can tell which descriptors were
+ * actually created if one of the allocations below fails. */
+APPLY_SPECIFIC(input) = NULL;
+APPLY_SPECIFIC(input_grad) = NULL;
+APPLY_SPECIFIC(output) = NULL;
+APPLY_SPECIFIC(output_grad) = NULL;
+
+{
+  cudnnStatus_t err;
+  if ((err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_MemoryError,
+                 "could not allocate tensor descriptor (input): %s",
+                 cudnnGetErrorString(err));
+    FAIL;
+  }
+  if ((err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input_grad))) != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_MemoryError,
+                 "could not allocate tensor descriptor (input_grad): %s",
+                 cudnnGetErrorString(err));
+    FAIL;
+  }
+  if ((err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_MemoryError,
+                 "could not allocate tensor descriptor (output): %s",
+                 cudnnGetErrorString(err));
+    FAIL;
+  }
+  if ((err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output_grad))) != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_MemoryError,
+                 "could not allocate tensor descriptor (output_grad): %s",
+                 cudnnGetErrorString(err));
+    FAIL;
+  }
+}
+
+#section cleanup_code_struct
+
+/* Destroy only the descriptors that were successfully created. */
+if (APPLY_SPECIFIC(input) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input)); }
+if (APPLY_SPECIFIC(input_grad) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input_grad)); }
+if (APPLY_SPECIFIC(output) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output)); }
+if (APPLY_SPECIFIC(output_grad) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output_grad)); }
+
+#section support_code_struct
+
+/*
+ * Run cudnnPoolingBackward and store the result in *inp_grad.
+ *
+ * inp, out, out_grad: arrays passed to cuDNN; each must be C-contiguous.
+ * desc:     pooling descriptor, passed through to cuDNN.
+ * inp_grad: receives an array with the shape and typecode of inp.
+ *
+ * Returns 0 on success and 1 on failure.
+ */
+int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
+                                  PyGpuArrayObject *out,
+                                  PyGpuArrayObject *out_grad,
+                                  cudnnPoolingDescriptor_t desc,
+                                  PyGpuArrayObject **inp_grad) {
+  cudnnStatus_t err;
+  PyGpuContextObject *c = pygpu_default_context();
+
+  if (!GpuArray_IS_C_CONTIGUOUS(&inp->ga)) {
+    PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
+    return 1;
+  }
+
+  if (!GpuArray_IS_C_CONTIGUOUS(&out_grad->ga)) {
+    PyErr_SetString(PyExc_ValueError, "Only contiguous output gradients are supported.");
+    return 1;
+  }
+
+  if (!GpuArray_IS_C_CONTIGUOUS(&out->ga)) {
+    PyErr_SetString(PyExc_ValueError, "Only contiguous outputs are supported.");
+    return 1;
+  }
+
+  if (c_set_tensorNd(inp, APPLY_SPECIFIC(input)) != 0)
+    return 1;
+  if (c_set_tensorNd(out_grad, APPLY_SPECIFIC(output_grad)) != 0)
+    return 1;
+  if (c_set_tensorNd(out, APPLY_SPECIFIC(output)) != 0)
+    return 1;
+
+  /* Allocate the result in the same context that is entered below. */
+  if (theano_prep_output(inp_grad, PyGpuArray_NDIM(inp),
+                         PyGpuArray_DIMS(inp), inp->ga.typecode,
+                         GA_C_ORDER, c) != 0) {
+    return 1;
+  }
+
+  if (c_set_tensorNd(*inp_grad, APPLY_SPECIFIC(input_grad)) != 0)
+    return 1;
+
+  {
+    /* alpha = 1 and beta = 0, in both float and double so the pointer
+     * passed to cuDNN can be picked from the array typecode. */
+    const float alphaf = 1;
+    const float betaf = 0;
+    const double alphad = 1;
+    const double betad = 0;
+    void *alpha, *beta;
+
+    switch (inp->ga.typecode) {
+    case GA_DOUBLE:
+      alpha = (void *)&alphad;
+      beta = (void *)&betad;
+      break;
+    case GA_FLOAT:
+    case GA_HALF:
+      /* Half arrays use the float scaling factors. */
+      alpha = (void *)&alphaf;
+      beta = (void *)&betaf;
+      break;
+    default:
+      PyErr_SetString(PyExc_TypeError, "Unsupported type in pooling gradient");
+      return 1;
+    }
+
+    /* The cuDNN call is bracketed by cuda_enter/cuda_exit on the context. */
+    cuda_enter(c->ctx);
+    err = cudnnPoolingBackward(
+      APPLY_SPECIFIC(_handle), desc,
+      alpha,
+      APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(out),
+      APPLY_SPECIFIC(output_grad), PyGpuArray_DEV_DATA(out_grad),
+      APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(inp),
+      beta,
+      APPLY_SPECIFIC(input_grad), PyGpuArray_DEV_DATA(*inp_grad)
+      );
+    cuda_exit(c->ctx);
+  }
+
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_RuntimeError, "error doing operation: %s.",
+                 cudnnGetErrorString(err));
+    return 1;
+  }
+  return 0;
+}
--- a/theano/sandbox/gpuarray/dnn_softmax.c
+++ b/theano/sandbox/gpuarray/dnn_softmax.c
+#section support_code_struct
+
+/* Tensor descriptors for the softmax input and output; created in the
+ * init section and destroyed in the cleanup section below. */
+cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
+cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
+
+#section init_code_struct
+
+/* Start from NULL so the cleanup section can tell which descriptors were
+ * actually created if one of the allocations below fails. */
+APPLY_SPECIFIC(input) = NULL;
+APPLY_SPECIFIC(output) = NULL;
+
+{
+  cudnnStatus_t err;
+  err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input));
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor: %s",
+                 cudnnGetErrorString(err));
+    FAIL;
+  }
+  err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output));
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor: %s",
+                 cudnnGetErrorString(err));
+    FAIL;
+  }
+}
+
+#section cleanup_code_struct
+
+/* Destroy only the descriptors that were successfully created. */
+if (APPLY_SPECIFIC(input) != NULL)
+  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input));
+if (APPLY_SPECIFIC(output) != NULL)
+  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output));
+
+#section support_code_struct
+
+/*
+ * Run cudnnSoftmaxForward on x and store the result in *out.
+ *
+ * x:   input array.
+ * out: receives an array with the shape and typecode of x.
+ *
+ * SOFTMAX_ALGO and SOFTMAX_MODE are not defined in this file; presumably
+ * they are supplied by the op when the code is compiled -- confirm.
+ *
+ * Returns 0 on success and 1 on failure.
+ */
+int APPLY_SPECIFIC(softmax)(PyGpuArrayObject *x,
+                            PyGpuArrayObject **out) {
+  cudnnStatus_t err;
+  PyGpuContextObject *c = pygpu_default_context();
+
+  if (c_set_tensorNd(x, APPLY_SPECIFIC(input)) != 0)
+    return 1;
+
+  if (theano_prep_output(out, PyGpuArray_NDIM(x),
+                         PyGpuArray_DIMS(x), x->ga.typecode,
+                         GA_C_ORDER, c) != 0)
+    return 1;
+
+  if (c_set_tensorNd(*out, APPLY_SPECIFIC(output)) != 0)
+    return 1;
+
+  {
+    /* alpha = 1 and beta = 0, in both float and double so the pointer
+     * passed to cuDNN can be picked from the array typecode. */
+    const float alphaf = 1;
+    const float betaf = 0;
+    const double alphad = 1;
+    const double betad = 0;
+    void *alpha, *beta;
+
+    switch (x->ga.typecode) {
+    case GA_DOUBLE:
+      alpha = (void *)&alphad;
+      beta = (void *)&betad;
+      break;
+    case GA_FLOAT:
+    case GA_HALF:
+      /* Half arrays use the float scaling factors. */
+      alpha = (void *)&alphaf;
+      beta = (void *)&betaf;
+      break;
+    default:
+      PyErr_SetString(PyExc_TypeError, "Unsupported type in softmax");
+      return 1;
+    }
+
+    /* The cuDNN call is bracketed by cuda_enter/cuda_exit on the context. */
+    cuda_enter(c->ctx);
+    err = cudnnSoftmaxForward(
+      APPLY_SPECIFIC(_handle),
+      SOFTMAX_ALGO,
+      SOFTMAX_MODE,
+      alpha,
+      APPLY_SPECIFIC(input),
+      PyGpuArray_DEV_DATA(x),
+      beta,
+      APPLY_SPECIFIC(output),
+      PyGpuArray_DEV_DATA(*out)
+    );
+    cuda_exit(c->ctx);
+  }
+
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_RuntimeError, "error during operation: %s",
+                 cudnnGetErrorString(err));
+    return 1;
+  }
+  return 0;
+}
--- a/theano/sandbox/gpuarray/dnn_softmax_grad.c
+++ b/theano/sandbox/gpuarray/dnn_softmax_grad.c
+#section support_code_struct
+
+/* Tensor descriptors for the three arrays handed to cudnnSoftmaxBackward;
+ * created in the init section and destroyed in the cleanup section. */
+cudnnTensorDescriptor_t APPLY_SPECIFIC(dy);
+cudnnTensorDescriptor_t APPLY_SPECIFIC(sm);
+cudnnTensorDescriptor_t APPLY_SPECIFIC(dx);
+
+#section init_code_struct
+
+/* Start from NULL so the cleanup section can tell which descriptors were
+ * actually created if one of the allocations below fails. */
+APPLY_SPECIFIC(dy) = NULL;
+APPLY_SPECIFIC(sm) = NULL;
+APPLY_SPECIFIC(dx) = NULL;
+
+{
+  cudnnStatus_t err;
+  err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(dy));
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor: %s",
+                 cudnnGetErrorString(err));
+    FAIL;
+  }
+  err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(sm));
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor: %s",
+                 cudnnGetErrorString(err));
+    FAIL;
+  }
+  err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(dx));
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor: %s",
+                 cudnnGetErrorString(err));
+    FAIL;
+  }
+}
+
+#section cleanup_code_struct
+
+/* Destroy only the descriptors that were successfully created. */
+if (APPLY_SPECIFIC(dy) != NULL)
+  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(dy));
+if (APPLY_SPECIFIC(sm) != NULL)
+  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(sm));
+if (APPLY_SPECIFIC(dx) != NULL)
+  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(dx));
+
+#section support_code_struct
+
+/*
+ * Run cudnnSoftmaxBackward on (sm, dy) and store the result in *dx.
+ *
+ * dy: array passed to cuDNN after sm; its shape and typecode are used to
+ *     allocate *dx.
+ * sm: array passed to cuDNN first; its typecode selects the float or
+ *     double scaling factors.
+ * dx: receives the result.
+ *
+ * SOFTMAX_ALGO and SOFTMAX_MODE are not defined in this file; presumably
+ * they are supplied by the op when the code is compiled -- confirm.
+ *
+ * Returns 0 on success and 1 on failure.
+ */
+int APPLY_SPECIFIC(softmax_grad)(PyGpuArrayObject *dy,
+                                 PyGpuArrayObject *sm,
+                                 PyGpuArrayObject **dx) {
+  cudnnStatus_t err;
+  PyGpuContextObject *c = pygpu_default_context();
+
+  if (c_set_tensorNd(dy, APPLY_SPECIFIC(dy)) != 0)
+    return 1;
+  if (c_set_tensorNd(sm, APPLY_SPECIFIC(sm)) != 0)
+    return 1;
+
+  if (theano_prep_output(dx, PyGpuArray_NDIM(dy),
+                         PyGpuArray_DIMS(dy), dy->ga.typecode,
+                         GA_C_ORDER, c) != 0)
+    return 1;
+
+  if (c_set_tensorNd(*dx, APPLY_SPECIFIC(dx)) != 0)
+    return 1;
+
+  {
+    /* alpha = 1 and beta = 0, in both float and double so the pointer
+     * passed to cuDNN can be picked from the array typecode. */
+    const float alphaf = 1;
+    const float betaf = 0;
+    const double alphad = 1;
+    const double betad = 0;
+    void *alpha, *beta;
+
+    switch (sm->ga.typecode) {
+    case GA_DOUBLE:
+      alpha = (void *)&alphad;
+      beta = (void *)&betad;
+      break;
+    case GA_FLOAT:
+    case GA_HALF:
+      /* Half arrays use the float scaling factors. */
+      alpha = (void *)&alphaf;
+      beta = (void *)&betaf;
+      break;
+    default:
+      PyErr_SetString(PyExc_TypeError, "Unsupported type in softmax gradient");
+      return 1;
+    }
+
+    /* The cuDNN call is bracketed by cuda_enter/cuda_exit on the context. */
+    cuda_enter(c->ctx);
+    err = cudnnSoftmaxBackward(
+      APPLY_SPECIFIC(_handle),
+      SOFTMAX_ALGO,
+      SOFTMAX_MODE,
+      alpha,
+      APPLY_SPECIFIC(sm),
+      PyGpuArray_DEV_DATA(sm),
+      APPLY_SPECIFIC(dy),
+      PyGpuArray_DEV_DATA(dy),
+      beta,
+      APPLY_SPECIFIC(dx),
+      PyGpuArray_DEV_DATA(*dx)
+      );
+    cuda_exit(c->ctx);
+  }
+
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_RuntimeError, "error during operation: %s",
+                 cudnnGetErrorString(err));
+    return 1;
+  }
+  return 0;
+}
--- a/theano/sandbox/gpuarray/tests/test_dnn.py
+++ b/theano/sandbox/gpuarray/tests/test_dnn.py
@@ -22,14 +22,12 @@ from . import test_nnet
 def test_dnn_conv_desc_merge():
    if not dnn.dnn_available():
        raise SkipTest(dnn.dnn_available.msg)
-    img_shp = T.as_tensor_variable(
-        numpy.asarray([2, 1, 8, 8]).astype('int64'))
    kern_shp = T.as_tensor_variable(
        numpy.asarray([3, 1, 2, 2]).astype('int64'))
    desc1 = dnn.GpuDnnConvDesc(border_mode='valid', subsample=(2, 2),
-                               conv_mode='conv')(img_shp, kern_shp)
+                               conv_mode='conv')(kern_shp)
    desc2 = dnn.GpuDnnConvDesc(border_mode='full', subsample=(1, 1),
-                               conv_mode='cross')(img_shp, kern_shp)
+                               conv_mode='cross')(kern_shp)
    # CDataType is not DeepCopyable so this will crash if we don't use
    # borrow=True
    f = theano.function([], [theano.Out(desc1, borrow=True),
@@ -51,7 +49,7 @@ def test_dnn_conv_merge():
    kern = T.ftensor4('kern')
    out = T.ftensor4('out')
    desc = dnn.GpuDnnConvDesc(
-        border_mode='valid')(img.shape, kern.shape)
+        border_mode='valid')(kern.shape)

    # Test forward op
    o1 = dnn.dnn_conv(img, kern)
@@ -90,9 +88,9 @@ def test_dnn_conv_inplace():
    kern = T.ftensor4('kern')
    out = T.ftensor4('out')
    desc1 = dnn.GpuDnnConvDesc(border_mode='valid', conv_mode='conv')(
-        img.shape, kern.shape)
+        kern.shape)
    desc2 = dnn.GpuDnnConvDesc(
-        border_mode='valid', conv_mode='cross')(img.shape, kern.shape)
+        border_mode='valid', conv_mode='cross')(kern.shape)

    # Test forward op
    o1 = dnn.dnn_conv(img, kern, conv_mode='conv')
@@ -175,8 +173,6 @@ def test_pooling():
            func = T.max
        else:
            func = T.mean
-        if pad != (0, 0) and dnn.version() == -1:
-            continue

        if pad != (0, 0) and func is T.mean:
            continue
@@ -209,11 +205,10 @@ def test_pooling():
                            (32, 1, 147, 197),
                            ]:
                    data = numpy.random.normal(0, 1, shp).astype("float32")
-                    a = f1(data).__array__()
+                    a = f1(data)
+                    b = f2(data)

-                    b = f2(data).__array__()
-                    assert numpy.allclose(a, b,
-                                          atol=numpy.finfo(numpy.float32).eps)
+                    utt.assert_allclose(a, b)

        # Test the grad
        for shp in [(1, 1, 2, 2),
@@ -230,9 +225,9 @@ def test_pooling():
            def fn(x):
                return max_pool_2d(x, (ws, ws), ignore_border=True,
                                   padding=pad, mode=mode)
-            theano.tests.unittest_tools.verify_grad(fn, [data],
-                                                    cast_to_output_type=False,
-                                                    mode=mode_with_gpu)
+            utt.verify_grad(fn, [data],
+                            cast_to_output_type=False,
+                            mode=mode_with_gpu)
            # Confirm that the opt would have inserted it.
            fg = theano.function([x], theano.grad(fn(x).sum(), x),
                                 mode=mode_with_gpu)
@@ -247,10 +242,9 @@ def test_pooling():
                    pad=pad,
                    mode=mode)
                return dnn_op
-            theano.tests.unittest_tools.verify_grad(
-                fn, [data],
-                cast_to_output_type=False,
-                mode=mode_with_gpu)
+            utt.verify_grad(fn, [data],
+                            cast_to_output_type=False,
+                            mode=mode_with_gpu)
            # Confirm that we get the good op.
            fg = theano.function([x], theano.grad(fn(x).sum(), x),
                                 mode=mode_with_gpu)
@@ -258,7 +252,7 @@ def test_pooling():
                        for node in fg.maker.fgraph.toposort()])
            g_out = fg(data)

-            # Compare again the CPU result
+            # Compare against the CPU result
            out = max_pool_2d(x, (ws, ws),
                              padding=pad,
                              ignore_border=True, mode=mode)
@@ -271,7 +265,7 @@ def test_pooling():
                assert any([isinstance(node.op, AveragePoolGrad)
                            for node in fc.maker.fgraph.toposort()])
            c_out = fc(data)
-            assert numpy.allclose(c_out, g_out)
+            utt.assert_allclose(c_out, g_out)


 def test_pooling_opt():
@@ -353,7 +347,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
        )
        self._compile_and_check(
            [t],
-            [dnn.GpuDnnSoftmax('bc01', 'accurate', 'channel')(t)],
+            [dnn.GpuDnnSoftmax('accurate', 'channel')(t)],
            [rand_tensor],
            dnn.GpuDnnSoftmax
        )
@@ -363,7 +357,6 @@ class TestDnnInferShapes(utt.InferShapeTester):
            [
                T.grad(
                    dnn.GpuDnnSoftmax(
-                        'bc01',
                        'accurate',
                        'channel'
                    )(t).mean(),
@@ -403,7 +396,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
                border_mode=params[0],
                subsample=params[1],
                conv_mode=params[2]
-            )(img.shape, kerns.shape)
+            )(kerns.shape)
            conv = dnn.GpuDnnConv()(img, kerns, out, desc)
            self._compile_and_check(
                [img, kerns, out],
@@ -447,7 +440,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
                border_mode=params[0],
                subsample=params[1],
                conv_mode=params[2]
-            )(temp_img.shape, out.shape)
+            )(out.shape)
            conv_grad_w = dnn.GpuDnnConvGradW()(
                temp_img,
                temp_kerns,
@@ -467,42 +460,41 @@ class TestDnnInferShapes(utt.InferShapeTester):
        img = T.ftensor4('img')
        kerns = T.ftensor4('kerns')
        out = T.ftensor4('out')
-        img_val = numpy.asarray(
-            numpy.random.rand(3, 4, 5, 6),
-            dtype='float32'
-        )
        kern_vals = numpy.asarray(
            numpy.random.rand(13, 14, 15, 16),
            dtype='float32'
        )
+        out_vals = numpy.asarray(
+            numpy.random.rand(3, 13, 5, 6),
+            dtype='float32'
+        )

        for params in product(
            ['valid'],  # Should this work for 'full'?
            [(1, 1)],
            ['conv', 'cross']
        ):
-            temp_kerns = kerns.dimshuffle(1, 0, 2, 3)
            shape = (
-                img_val.shape[0], kern_vals.shape[1],
-                img_val.shape[2] + kern_vals.shape[2] - 1,
-                img_val.shape[3] + kern_vals.shape[3] - 1
+                out_vals.shape[0], kern_vals.shape[1],
+                out_vals.shape[2] + kern_vals.shape[2] - 1,
+                out_vals.shape[3] + kern_vals.shape[3] - 1
            )
-            out_vals = numpy.zeros(shape, dtype='float32')
+            img_vals = numpy.zeros(shape, dtype='float32')
            desc = dnn.GpuDnnConvDesc(
                border_mode=params[0],
                subsample=params[1],
                conv_mode=params[2]
-            )(out.shape, temp_kerns.shape)
+            )(kerns.shape)
            conv_grad_i = dnn.GpuDnnConvGradI()(
-                temp_kerns,
-                img,
+                kerns,
                out,
+                img,
                desc,
            )
            self._compile_and_check(
-                [temp_kerns, img, out],
+                [kerns, img, out],
                [conv_grad_i],
-                [kern_vals, img_val, out_vals],
+                [kern_vals, img_vals, out_vals],
                dnn.GpuDnnConvGradI
            )

@@ -612,15 +604,9 @@ def test_dnn_conv_alpha_output_merge():

    lr = numpy.asarray(0.05, dtype='float32')

-    if dnn.version() == -1:
-        # Can't merge alpha with cudnn v1
-        fr = conv + out
-        wr = kern + gw
-        ir = img + gi
-    else:
-        fr = lr * (conv + out)
-        wr = kern + lr * gw
-        ir = img + lr * gi
+    fr = lr * (conv + out)
+    wr = kern + lr * gw
+    ir = img + lr * gi

    f1 = theano.function([img, kern, out], [fr, wr, ir], mode=mode_with_gpu)
    assert isinstance(f1.maker.fgraph.outputs[0].owner.inputs[0].owner.op,
@@ -657,9 +643,6 @@ def test_dnn_conv_alpha_output_merge():


 def test_dnn_conv_grad():
-    if not dnn.dnn_available() or dnn.version() == -1:
-        raise SkipTest('alpha != 1.0 not supported in cudnn v1')
-
    b = 1
    c = 4
    f = 3
@@ -674,18 +657,18 @@ def test_dnn_conv_grad():

    def dconv(img, kern, out):
        desc = dnn.GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
-                                  conv_mode='conv')(img.shape, kern.shape)
+                                  conv_mode='conv')(kern.shape)
        return dnn.GpuDnnConv()(img, kern, out, desc, alpha=0.5, beta=0.75)

    def dconvi(img, kern, out):
        desc = dnn.GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
-                                  conv_mode='conv')(img.shape, kern.shape)
+                                  conv_mode='conv')(kern.shape)
        return dnn.GpuDnnConvGradI()(kern, out, img, desc, alpha=-1.0,
                                     beta=0.0)

    def dconvw(img, kern, out):
        desc = dnn.GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
-                                  conv_mode='conv')(img.shape, kern.shape)
+                                  conv_mode='conv')(kern.shape)
        return dnn.GpuDnnConvGradW()(img, out, kern, desc, alpha=0.75,
                                     beta=-1.0)

@@ -697,7 +680,7 @@ def test_dnn_conv_grad():
 def test_version():
    if not dnn.dnn_available():
        raise SkipTest(dnn.dnn_available.msg)
-    assert isinstance(dnn.version(), (int, tuple))
+    assert isinstance(dnn.version(), int)


 class test_SoftMax(test_nnet.test_SoftMax):
@@ -706,7 +689,7 @@ class test_SoftMax(test_nnet.test_SoftMax):
    mode = mode_with_gpu

    def test_softmax_shape_0(self):
-        raise SkipTest("Cudnn do not suport 0 shapes")
+        raise SkipTest("Cudnn doesn't support 0 shapes")

    def test_softmax_grad(self):
        def cmp(n, m, f, f_gpu):
@@ -715,13 +698,12 @@ class test_SoftMax(test_nnet.test_SoftMax):

            out = f(data)
            gout = numpy.asarray(f_gpu(gdata))[:, :, 0, 0]
-            assert numpy.allclose(out, gout), numpy.absolute(out - gout)
+            utt.assert_allclose(out, gout)

        x = T.matrix('x', 'float32')
        x_gpu = T.tensor4('x_gpu', 'float32')
        f_z = T.nnet.softmax_op
        f_gpu = dnn.GpuDnnSoftmax(
-            'bc01',
            'accurate',
            'channel'
        )
@@ -763,14 +745,14 @@ class test_SoftMax(test_nnet.test_SoftMax):
                    for i in sorted_f
                    if isinstance(
                        i.op,
-                        self.gpu_grad_op
-                    )]) == 1)
+                        self.gpu_grad_op)
+                    ]) == 1)
        assert(len([i
                    for i in sorted_f
                    if isinstance(
                        i.op,
-                        theano.tensor.nnet.SoftmaxGrad
-                    )]) == 0)
+                        theano.tensor.nnet.SoftmaxGrad)
+                    ]) == 0)

        # Verify that the SoftmaxGrad -> Gpu[Dnn]SoftmaxGrad
        # optimization is not applied when cudnn is excluded or not
@@ -787,14 +769,14 @@ class test_SoftMax(test_nnet.test_SoftMax):
                    for i in sorted_f
                    if isinstance(
                        i.op,
-                        self.gpu_grad_op
-                    )]) == 0)
+                        self.gpu_grad_op)
+                    ]) == 0)
        assert(len([i
                    for i in sorted_f
                    if isinstance(
                        i.op,
-                        theano.tensor.nnet.SoftmaxGrad
-                    )]) == 1)
+                        theano.tensor.nnet.SoftmaxGrad)
+                    ]) == 1)

        # Verify that the SoftmaxGrad -> GpuDnnSoftmaxGrad do not
        # crash with manual graph
@@ -806,11 +788,49 @@ class test_SoftMax(test_nnet.test_SoftMax):
                    for i in sorted_f
                    if isinstance(
                        i.op,
-                        self.gpu_grad_op
-                    )]) == 1)
+                        self.gpu_grad_op)
+                    ]) == 1)
        assert(len([i
                    for i in sorted_f
                    if isinstance(
                        i.op,
-                        theano.tensor.nnet.SoftmaxGrad
-                    )]) == 0)
+                        theano.tensor.nnet.SoftmaxGrad)
+                    ]) == 0)
+
+    def test_log_softmax(self):
+        # Check that log(GpuDnnSoftmax(x)) is compiled to a single GpuDnnSoftmax
+        # node whose algo is "log". Skipped when dnn.version() is below 3000.
+        if dnn.version() < 3000:
+            raise SkipTest("Log-softmax is only in cudnn v3+")
+        # Build log(softmax(x)) from the 'accurate' algo in 'channel' mode.
+        x = T.ftensor4()
+        softmax_out = dnn.GpuDnnSoftmax('accurate', 'channel')(x)
+        log_out = T.log(T.as_tensor_variable(softmax_out))
+
+        f = theano.function([x], log_out, mode=mode_with_gpu)
+
+        # Ensure that the optimization has been applied
+        dnn_softmax_nodes = [n for n in f.maker.fgraph.toposort() if
+                             isinstance(n.op, dnn.GpuDnnSoftmax)]
+        assert len(dnn_softmax_nodes) == 1
+        assert dnn_softmax_nodes[0].op.algo == "log"
+
+        # Ensure that the output of the function is valid
+        input_shapes = [(3, 4, 5, 6),
+                        (1025, 2, 3, 4),  # NOTE(review): 1025 and 66000 on each
+                        (2, 1025, 3, 4),  # axis presumably probe sizes just past
+                        (2, 3, 1025, 4),  # GPU launch limits - confirm intent.
+                        (2, 3, 4, 1025),
+                        (66000, 2, 3, 4),
+                        (2, 66000, 3, 4),
+                        (2, 3, 66000, 4),
+                        (2, 3, 4, 66000)]
+
+        for inp_shape in input_shapes:
+            input_val = numpy.random.normal(0, 1, inp_shape).astype("float32")
+            # Reference value from numpy: log of the softmax normalized over axis 1.
+            out = f(input_val)
+            expected_out = numpy.log(numpy.exp(input_val) /
+                                     numpy.exp(input_val).sum(1)[:, None, :, :])
+
+            utt.assert_allclose(out, expected_out)
--- a/theano/sandbox/gpuarray/tests/test_nnet.py
+++ b/theano/sandbox/gpuarray/tests/test_nnet.py
@@ -326,7 +326,6 @@ class test_SoftMax(unittest.TestCase):
        return f, f_gpu

    def _cmp(self, n, m, f, f_gpu):
-        # print "test_softmax",n,m
        data = numpy.arange(n * m, dtype='float32').reshape(n, m)
        out = f(data)
        gout = f_gpu(data)
@@ -349,8 +348,6 @@ class test_SoftMax(unittest.TestCase):
            self._cmp
        )

-        # cuDNN R1 cannot handle these test cases but the Theano softmax can so
-        # we test them only for the Theano softmax.
        self._cmp(2 << 15, 5, f, f_gpu)

    def test_softmax_shape_0(self):