Merge pull request #3356 from abergeron/gpuarray_cudnnv3

cuDNN v3 support for gpuarray

Merge pull request #3356 from abergeron/gpuarray_cudnnv3
1ef9be9d · Pascal Lamblin · e617dc50 · 642446c5 · 1ef9be9d · 1ef9be9d
--- a/theano/sandbox/cuda/dnn.py
+++ b/theano/sandbox/cuda/dnn.py
@@ -10,7 +10,6 @@ from theano.gof import Optimizer, local_optimizer, COp
 from theano.gof.type import CDataType, Generic
 from theano.compile import optdb
 from theano.compile.ops import shape_i
-from theano.configparser import AddConfigVar, EnumStr
 from theano.tensor.nnet import SoftmaxGrad
 from theano.tensor.signal.downsample import (
    DownsampleFactorMax, MaxPoolGrad, AveragePoolGrad)
@@ -28,6 +27,8 @@ from theano.sandbox.cuda import gpu_seqopt, register_opt

 from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler

+import theano.sandbox.dnn_flags
+

 def dnn_available():
    if dnn_available.avail is None:
@@ -62,8 +63,8 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
            # exclusive mode, this cause bad detection.
            comp, out, err = NVCC_compiler.try_flags(
                ["-l", "cudnn", "-I" + os.path.dirname(__file__),
-                 "-I" + os.path.join(theano.config.cuda.root, 'include'),
-                 "-L" + os.path.join(theano.config.cuda.root, 'lib64')],
+                 "-I" + config.dnn.include_path,
+                 "-L" + config.dnn.library_path],
                preambule=preambule, body=body,
                try_run=False, output=True)

@@ -141,7 +142,6 @@ if (%(err)s != CUDNN_STATUS_SUCCESS) {
    %(fail)s
 }
 }
-
        """ % dict(var=var, err=err, desc=desc, fail=fail)


@@ -359,37 +359,9 @@ class GpuDnnConvDesc(GpuOp):
    def c_code_cache_version(self):
        return (2, version())

-
-AddConfigVar('dnn.conv.workmem',
-             "This flag is deprecated; use dnn.conv.algo_fwd.",
-             EnumStr(''),
-             in_c_key=False)
-
-AddConfigVar('dnn.conv.workmem_bwd',
-             "This flag is deprecated; use dnn.conv.algo_bwd.",
-             EnumStr(''),
-             in_c_key=False)
-
-AddConfigVar('dnn.conv.algo_fwd',
-             "Default implementation to use for CuDNN forward convolution.",
-             EnumStr('small', 'none', 'large', 'fft', 'guess_once',
-                     'guess_on_shape_change', 'time_once',
-                     'time_on_shape_change'),
-             in_c_key=False)
-
-AddConfigVar('dnn.conv.algo_bwd',
-             "Default implementation to use for CuDNN backward convolution.",
-             EnumStr('none', 'deterministic', 'fft', 'guess_once',
-                     'guess_on_shape_change', 'time_once',
-                     'time_on_shape_change'),
-             in_c_key=False)
-
-
 # scalar constants
 _zero = constant(numpy.asarray(0.0, dtype='float32'))
 _one = constant(numpy.asarray(1.0, dtype='float32'))
-_ifour = constant(numpy.asarray(4, dtype='int32'))
-_ifive = constant(numpy.asarray(5, dtype='int32'))


 def ensure_float(val, default, name):
@@ -406,20 +378,6 @@ def ensure_float(val, default, name):
    return val


-def ensure_int(val, default, name):
-    if val is None:
-        return default.clone()
-    if not isinstance(val, Variable):
-        val = constant(val)
-    if hasattr(val, 'ndim') and val.ndim == 0:
-        val = as_scalar(val)
-    if not isinstance(val.type, theano.scalar.Scalar):
-        raise TypeError("%s: expected a scalar value" % (name,))
-    if not val.type.dtype == 'int32':
-        raise TypeError("%s: type is not int32" % (name,))
-    return val
-
-
 class GpuDnnConv(DnnBase, COp):
    """
    The forward convolution.
@@ -1448,11 +1406,12 @@ class GpuDnnPool(DnnBase):
                or desc.type.ctype != 'cudnnPoolingDescriptor_t':
            raise TypeError('desc must be cudnnPoolingDescriptor_t')

-        dop = desc.owner.op
-        e_ndim = dop.get_ndim() + 2  # 4 or 5
+        if desc.owner is not None:
+            dop = desc.owner.op
+            e_ndim = dop.get_ndim() + 2  # 4 or 5

-        if img.type.ndim != e_ndim:
-            raise TypeError('img must be %dD tensor' % e_ndim)
+            if img.type.ndim != e_ndim:
+                raise TypeError('img must be %dD tensor' % e_ndim)

        return Apply(self, [img, desc], [img.type()])

@@ -1616,19 +1575,21 @@ class GpuDnnPoolGrad(DnnBase):
                or desc.type.ctype != 'cudnnPoolingDescriptor_t':
            raise TypeError('desc must be cudnnPoolingDescriptor_t')

-        nd = desc.owner.op.get_ndim() + 2  # 4 or 5
-
        inp = as_cuda_ndarray_variable(inp)
-        if inp.type.ndim != nd:
-            raise TypeError('inp must be %dD tensor' % (nd,))
-
        inp_grad = as_cuda_ndarray_variable(inp_grad)
-        if inp_grad.type.ndim != nd:
-            raise TypeError('inp_grad must be %dD tensor' % (nd,))
-
        out = as_cuda_ndarray_variable(out)
-        if out.type.ndim != nd:
-            raise TypeError('out must be %dD tensor' % (nd,))
+
+        if desc.owner is not None:
+            nd = desc.owner.op.get_ndim() + 2  # 4 or 5
+
+            if inp.type.ndim != nd:
+                raise TypeError('inp must be %dD tensor' % (nd,))
+
+            if inp_grad.type.ndim != nd:
+                raise TypeError('inp_grad must be %dD tensor' % (nd,))
+
+            if out.type.ndim != nd:
+                raise TypeError('out must be %dD tensor' % (nd,))

        return Apply(self, [inp, out, inp_grad, desc],
                     [inp.type()])
@@ -1819,7 +1780,7 @@ class GpuDnnSoftmaxBase(DnnBase):
    Parameters
    ----------
    tensor_format
-        Whether the data format is 'bc01' or 'b01c'.
+        Always set this to 'bc01'.
    algo
        'fast', 'accurate' or 'log' indicating whether, respectively, computations
 	should be optimized for speed, for accuracy, or if CuDNN should rather
@@ -1834,7 +1795,13 @@ class GpuDnnSoftmaxBase(DnnBase):
    __props__ = ('tensor_format', 'mode', 'algo')

    def __init__(self, tensor_format, algo, mode):
-        assert(tensor_format in ('bc01', 'b01c'))
+        if tensor_format != 'bc01':
+            raise ValueError(
+                "It was discovered that since December 2014, the "
+                "tensor_format parameter was ignored and the equivalent of "
+                "'bc01' is always used.  Since your code seems to be using "
+                "another value, this might have affected previous results "
+                "ran with this code.")
        DnnBase.__init__(self)
        self.tensor_format = tensor_format

@@ -1976,7 +1943,7 @@ class GpuDnnSoftmax(GpuDnnSoftmaxBase):
    Parameters
    ----------
    tensor_format
-        Whether the data format is 'bc01' or 'b01c'.
+        Always set to 'bc01'.
    algo
        'fast' or 'accurate' indicating whether computations should be
        optimized for speed or accuracy respectively.
@@ -2044,7 +2011,7 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
    Parameters
    ----------
    tensor_format
-        Whether the data format is 'bc01' or 'b01c'.
+        Always set to 'bc01'.
    algo
        'fast' or 'accurate' indicating whether computations should be
        optimized for speed or accuracy respectively.

--- a/theano/sandbox/dnn_flags.py
+++ b/theano/sandbox/dnn_flags.py
+"""
+This module contains the configuration flags for cudnn support.
+
+These flags are shared between the cuda and gpuarray backends, which
+is why they are defined in this file.
+"""
+import os.path
+
+from theano.configparser import AddConfigVar, EnumStr, StrParam
+from theano import config
+
+# Deprecated flags; their help text points at the replacement flags
+# registered further down.
+AddConfigVar('dnn.conv.workmem',
+             "This flag is deprecated; use dnn.conv.algo_fwd.",
+             EnumStr(''),
+             in_c_key=False)
+
+AddConfigVar('dnn.conv.workmem_bwd',
+             "This flag is deprecated; use dnn.conv.algo_bwd.",
+             EnumStr(''),
+             in_c_key=False)
+
+# Algorithm selection for the convolution ops.  The first value listed
+# in each EnumStr is 'small' (forward) and 'none' (backward).
+AddConfigVar('dnn.conv.algo_fwd',
+             "Default implementation to use for CuDNN forward convolution.",
+             EnumStr('small', 'none', 'large', 'fft', 'guess_once',
+                     'guess_on_shape_change', 'time_once',
+                     'time_on_shape_change'),
+             in_c_key=False)
+
+AddConfigVar('dnn.conv.algo_bwd',
+             "Default implementation to use for CuDNN backward convolution.",
+             EnumStr('none', 'deterministic', 'fft', 'guess_once',
+                     'guess_on_shape_change', 'time_once',
+                     'time_on_shape_change'),
+             in_c_key=False)
+
+# Search paths for cuDNN.  The defaults are passed as callables, so they
+# are derived from config.cuda.root ('include' and 'lib64'
+# sub-directories) rather than hard-coded here.
+AddConfigVar('dnn.include_path',
+             "Location of the cudnn header (defaults to the cuda root)",
+             StrParam(lambda: os.path.join(config.cuda.root, 'include')))
+
+# The help text used to say "header" here (copied from dnn.include_path);
+# this flag is the directory handed to the linker via -L.
+AddConfigVar('dnn.library_path',
+             "Location of the cudnn library (defaults to the cuda root)",
+             StrParam(lambda: os.path.join(config.cuda.root, 'lib64')))
--- a/theano/sandbox/gpuarray/conv_desc.c
+++ b/theano/sandbox/gpuarray/conv_desc.c
+#section support_code_apply
+
+/*
+ * Create a cuDNN convolution descriptor in *desc and configure it from the
+ * compile-time settings (PAD_*, SUB_*, BORDER_MODE, NB_DIMS, CONV_MODE).
+ *
+ * filt_shp is read as a 1-d array of npy_int64 holding the filter shape;
+ * its length must be NB_DIMS + 2 (the spatial sizes start at index 2).
+ *
+ * Returns 0 on success.  On failure a Python exception is set and -1 is
+ * returned.
+ */
+int APPLY_SPECIFIC(conv_desc)(PyArrayObject *filt_shp,
+                              cudnnConvolutionDescriptor_t *desc) {
+  cudnnStatus_t err;
+  int pad[3] = {PAD_0, PAD_1, PAD_2};
+  int strides[3] = {SUB_0, SUB_1, SUB_2};
+  int upscale[3] = {1, 1, 1};
+
+  /* Validate the length of the shape vector before indexing into it. */
+  if (PyArray_DIM(filt_shp, 0) - 2 != NB_DIMS) {
+    PyErr_Format(PyExc_ValueError, "Filter shape has the wrong number of "
+                 "dimensions: expected %d, got %lld.", NB_DIMS + 2,
+                 (long long)PyArray_DIM(filt_shp, 0));
+    return -1;
+  }
+
+#if BORDER_MODE == 0
+  /* In this mode the padding is derived from the filter's spatial sizes
+   * (size - 1) instead of the PAD_* constants.
+   * NOTE(review): presumably this is the 'full' border mode -- confirm
+   * against the Python op that defines BORDER_MODE. */
+  pad[0] = *(npy_int64 *)PyArray_GETPTR1(filt_shp, 2) - 1;
+  pad[1] = *(npy_int64 *)PyArray_GETPTR1(filt_shp, 3) - 1;
+#if NB_DIMS > 2
+  pad[2] = *(npy_int64 *)PyArray_GETPTR1(filt_shp, 4) - 1;
+#endif
+#endif
+
+  err = cudnnCreateConvolutionDescriptor(desc);
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_MemoryError, "could not allocate convolution "
+                 "descriptor: %s", cudnnGetErrorString(err));
+    return -1;
+  }
+
+  err = cudnnSetConvolutionNdDescriptor(*desc, NB_DIMS, pad, strides, upscale,
+                                        CONV_MODE);
+  /* The status used to be ignored, so a rejected configuration was
+   * reported as success.
+   * NOTE(review): the descriptor is left allocated on this path; confirm
+   * whether the caller releases it when -1 is returned. */
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_RuntimeError, "could not set convolution "
+                 "descriptor: %s", cudnnGetErrorString(err));
+    return -1;
+  }
+  return 0;
+}
--- a/theano/sandbox/gpuarray/cudnn_helper.h
+++ b/theano/sandbox/gpuarray/cudnn_helper.h
@@ -4,193 +4,109 @@
 #include <cudnn.h>

 #ifndef CUDNN_VERSION
-#include <assert.h>
-
-// Here we define the R2 API in terms of functions in the R1 interface
-// This is only for what we use

-static inline const char *cudnnGetErrorString(cudnnStatus_t err) {
-  switch (err) {
-  case CUDNN_STATUS_SUCCESS:
-    return "The operation completed successfully.";
-  case CUDNN_STATUS_NOT_INITIALIZED:
-    return "The handle was not initialized(Is your driver recent enought?).";
-  case CUDNN_STATUS_ALLOC_FAILED:
-    return "Ressource allocation failed inside the library.";
-  case CUDNN_STATUS_BAD_PARAM:
-    return "An incorrect value was passed in.";
-  case CUDNN_STATUS_ARCH_MISMATCH:
-    return "The current GPU does not support the required features (only cc 3.0+ are supported).";
-  case CUDNN_STATUS_MAPPING_ERROR:
-    return "An access to GPU memory space failed (probably due to a failure to bind texture).";
-  case CUDNN_STATUS_EXECUTION_FAILED:
-    return "A kernel failed to execute.";
-  case CUDNN_STATUS_INTERNAL_ERROR:
-    return "An internal cuDNN operation failed.";
-  case CUDNN_STATUS_NOT_SUPPORTED:
-    return "The combination of parameters is not currently supported.";
-  default:
-    return "Unknown error code.";
-  }
+#define CUDNN_VERSION -1
+static inline int cudnnGetVersion() {
+  return -1;
 }
+#endif

-// some macros to help support cudnn R1 while using R2 code.
-#define cudnnCreateTensorDescriptor cudnnCreateTensor4dDescriptor
-#define cudnnDestroyTensorDescriptor cudnnDestroyTensor4dDescriptor
-#define cudnnSetFilter4dDescriptor cudnnSetFilterDescriptor
-
-typedef cudnnTensor4dDescriptor_t cudnnTensorDescriptor_t;
+#include <assert.h>

-static inline cudnnStatus_t
-cudnnGetConvolution2dForwardOutputDim(
-  const cudnnConvolutionDescriptor_t convDesc,
-  const cudnnTensorDescriptor_t inputTensorDesc,
-  const cudnnFilterDescriptor_t filterDesc,
-  int *n,
-  int *c,
-  int *h,
-  int *w) {
-  return cudnnGetOutputTensor4dDim(convDesc, CUDNN_CONVOLUTION_FWD,
-				   n, c, h, w);
-}
+#if CUDNN_VERSION < 3000
+// Here we define the R3 API in terms of functions in the R2 interface
+// This is only for what we use

-typedef int cudnnConvolutionFwdAlgo_t;
-typedef int cudnnConvolutionFwdPreference_t;
+typedef int cudnnConvolutionBwdDataAlgo_t;

-#define CUDNN_CONVOLUTION_FWD_NO_WORKSPACE 0
+#define CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 0
+#define CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 1
+#define CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT 2

-static inline cudnnStatus_t
-cudnnGetConvolutionForwardAlgorithm(
+static cudnnStatus_t cudnnGetConvolutionBackwardDataWorkspaceSize(
  cudnnHandle_t handle,
-  const cudnnTensorDescriptor_t srcDesc,
  const cudnnFilterDescriptor_t filterDesc,
+  const cudnnTensorDescriptor_t diffDesc,
  const cudnnConvolutionDescriptor_t convDesc,
-  const cudnnTensorDescriptor_t destDesc,
-  cudnnConvolutionFwdPreference_t preference,
-  size_t memoryLimitInbytes,
-  cudnnConvolutionFwdAlgo_t *algo) {
-  *algo = 0;
-  return CUDNN_STATUS_SUCCESS;
-}
-
-static inline cudnnStatus_t
-cudnnGetConvolutionForwardWorkspaceSize(
- cudnnHandle_t handle,
- const cudnnTensorDescriptor_t srcDesc,
- const cudnnFilterDescriptor_t filterDesc,
- const cudnnConvolutionDescriptor_t convDesc,
- const cudnnTensor4dDescriptor_t destDesc,
- cudnnConvolutionFwdAlgo_t algo,
- size_t *sizeInBytes) {
+  const cudnnTensorDescriptor_t gradDesc,
+  cudnnConvolutionBwdDataAlgo_t algo,
+  size_t *sizeInBytes) {
  *sizeInBytes = 0;
  return CUDNN_STATUS_SUCCESS;
 }

-
-static inline cudnnStatus_t
-cudnnConvolutionForward_v2(
+static cudnnStatus_t cudnnConvolutionBackwardData_v3(
  cudnnHandle_t handle,
  const void *alpha,
-  const cudnnTensorDescriptor_t srcDesc,
-  const void *srcData,
  const cudnnFilterDescriptor_t filterDesc,
  const void *filterData,
+  const cudnnTensorDescriptor_t diffDesc,
+  const void *diffData,
  const cudnnConvolutionDescriptor_t convDesc,
-  cudnnConvolutionFwdAlgo_t algo,
-  void *workSpace,
-  size_t workSpaceSizeInBytes,
+  cudnnConvolutionBwdDataAlgo_t algo,
+  void *workspace,
+  size_t workspaceSizeInBytes,
  const void *beta,
-  const cudnnTensorDescriptor_t destDesc,
-  void *destData) {
-  assert(*(float *)alpha == 1.0);
-  cudnnAccumulateResult_t r;
-  if (*(float *)beta == 0.0) {
-    r = CUDNN_RESULT_NO_ACCUMULATE;
-  } else if (*(float *)beta == 1.0) {
-    r = CUDNN_RESULT_ACCUMULATE;
-  } else {
-    assert(0 && "beta must be 0.0 or 1.0");
-  }
-  return cudnnConvolutionForward(handle, srcDesc, srcData,
-				 filterDesc, filterData,
-				 convDesc, destDesc, destData,
-				 r);
+  const cudnnTensorDescriptor_t gradDesc,
+  void *gradData) {
+  return cudnnConvolutionBackwardData(
+    handle,
+    alpha,
+    filterDesc,
+    filterData,
+    diffDesc,
+    diffData,
+    convDesc,
+    beta,
+    gradDesc,
+    gradData);
 }
-#define cudnnConvolutionForward cudnnConvolutionForward_v2

-static inline cudnnStatus_t
-cudnnConvolutionBackwardFilter_v2(
-  cudnnHandle_t	handle,
-  const void *alpha,
-  const cudnnTensorDescriptor_t srcDesc,
-  const void *srcData,
+typedef int cudnnConvolutionBwdFilterAlgo_t;
+
+#define CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 0
+#define CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 1
+#define CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT 2
+
+static cudnnStatus_t cudnnGetConvolutionBackwardFilterWorkspaceSize(
+  cudnnHandle_t handle,
+  const cudnnTensorDescriptor_t filterDesc,
  const cudnnTensorDescriptor_t diffDesc,
-  const void *diffData,
  const cudnnConvolutionDescriptor_t convDesc,
-  const void *beta,
  const cudnnFilterDescriptor_t gradDesc,
-  void *gradData) {
-  assert(*(float *)alpha == 1.0);
-  cudnnAccumulateResult_t r;
-  if (*(float *)beta == 0.0) {
-    r = CUDNN_RESULT_NO_ACCUMULATE;
-  } else if (*(float *)beta == 1.0) {
-    r = CUDNN_RESULT_ACCUMULATE;
-  } else {
-    assert(0 && "beta must be 0.0 or 1.0");
-  }
-  return cudnnConvolutionBackwardFilter(handle, srcDesc, srcData,
-					diffDesc, diffData,
-					convDesc, gradDesc, gradData,
-					r);
+  cudnnConvolutionBwdDataAlgo_t algo,
+  size_t *sizeInBytes) {
+  *sizeInBytes = 0;
+  return CUDNN_STATUS_SUCCESS;
 }

-#define cudnnConvolutionBackwardFilter cudnnConvolutionBackwardFilter_v2
-
-static inline cudnnStatus_t
-cudnnConvolutionBackwardData_v2(
-  cudnnHandle_t	handle,
+static cudnnStatus_t cudnnConvolutionBackwardFilter_v3(
+  cudnnHandle_t handle,
  const void *alpha,
-  const cudnnFilterDescriptor_t filterDesc,
-  const void *filterData,
+  const cudnnTensorDescriptor_t srcDesc,
+  const void *srcData,
  const cudnnTensorDescriptor_t diffDesc,
  const void *diffData,
  const cudnnConvolutionDescriptor_t convDesc,
+  cudnnConvolutionBwdFilterAlgo_t algo,
+  void *workspace,
+  size_t workspaceSizeInBytes,
  const void *beta,
-  const cudnnTensorDescriptor_t gradDesc,
+  const cudnnFilterDescriptor_t gradDesc,
  void *gradData) {
-  assert(*(float *)alpha == 1.0);
-  cudnnAccumulateResult_t r;
-  if (*(float *)beta == 0.0) {
-    r = CUDNN_RESULT_NO_ACCUMULATE;
-  } else if (*(float *)beta == 1.0) {
-    r = CUDNN_RESULT_ACCUMULATE;
-  } else {
-    assert(0 && "beta must be 0.0 or 1.0");
-  }
-  /* This function needs the casting because its params are not
-     declared as const */
-  return cudnnConvolutionBackwardData(handle,
-				      (cudnnFilterDescriptor_t)filterDesc,
-				      filterData,
-				      (cudnnTensorDescriptor_t)diffDesc,
-				      diffData,
-				      (cudnnConvolutionDescriptor_t)convDesc,
-				      (cudnnTensorDescriptor_t)gradDesc,
-				      gradData,
-				      r);
+  return cudnnConvolutionBackwardFilter(
+    handle,
+    alpha,
+    srcDesc,
+    srcData,
+    diffDesc,
+    diffData,
+    convDesc,
+    beta,
+    gradDesc,
+    gradData);
 }

-#define cudnnConvolutionBackwardData cudnnConvolutionBackwardData_v2
-
-//Needed for R2 rc2
-# define CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING CUDNN_POOLING_AVERAGE
-#else
-
-// r2 rc1 and rc2 do not have the same macro defined
-// I didn't checked if this the right combination, but as we do not wrap the padding interface, it is fine for now.
-# define CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING ((cudnnPoolingMode_t)1)
-
 #endif

 #endif
--- a/theano/sandbox/gpuarray/dnn.py
+++ b/theano/sandbox/gpuarray/dnn.py
--- a/theano/sandbox/gpuarray/dnn_base.c
+++ b/theano/sandbox/gpuarray/dnn_base.c
 #section support_code
-static cudnnHandle_t _handle = NULL;

 static int
-c_set_tensor4d(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) {
+c_set_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) {
  cudnnDataType_t dt;
  size_t ds;
  switch (var->ga.typecode) {
@@ -12,26 +11,37 @@ c_set_tensor4d(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) {
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
+#if CUDNN_VERSION > 3000
+  case GA_HALF:
+    dt = CUDNN_DATA_HALF;
+    break;
+#endif
  default:
-    PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_tensor4d");
+    PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_tensorNd");
    return -1;
  }
  ds = gpuarray_get_elsize(var->ga.typecode);

-  int str0, str1, str2, str3;
-  // cudnn do not like 0s in strides
-  str3 = PyGpuArray_STRIDES(var)[3]?PyGpuArray_STRIDES(var)[3]/ds:1;
-  str2 = PyGpuArray_STRIDES(var)[2]?PyGpuArray_STRIDES(var)[2]/ds:PyGpuArray_DIMS(var)[3];
-  str1 = PyGpuArray_STRIDES(var)[1]?PyGpuArray_STRIDES(var)[1]/ds:PyGpuArray_DIMS(var)[2]*PyGpuArray_DIMS(var)[3];
-  str0 = PyGpuArray_STRIDES(var)[0]?PyGpuArray_STRIDES(var)[0]/ds:PyGpuArray_DIMS(var)[2]*PyGpuArray_DIMS(var)[3]*PyGpuArray_DIMS(var)[1];
-  cudnnStatus_t err = cudnnSetTensor4dDescriptorEx(
-    desc, dt,
-    PyGpuArray_DIM(var, 0), PyGpuArray_DIM(var, 1),
-    PyGpuArray_DIM(var, 2), PyGpuArray_DIM(var, 3),
-    str0, str1, str2, str3);
+  int strs[5], dims[5], default_stride = 1;
+  unsigned int nd = PyGpuArray_NDIM(var);
+
+  if (nd > 5) {
+    PyErr_SetString(PyExc_TypeError, "Tensor of more than 5d");
+    return -1;
+  }
+
+  for (unsigned int _i = nd; _i > 0; _i--) {
+    unsigned int i = _i - 1;
+    strs[i] = PyGpuArray_STRIDE(var, i) ?
+      PyGpuArray_STRIDE(var, i)/ds : default_stride;
+    default_stride *= PyGpuArray_DIM(var, i);
+    dims[i] = PyGpuArray_DIM(var, i);
+  }
+
+  cudnnStatus_t err = cudnnSetTensorNdDescriptor(desc, dt, nd, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
-		 "Could not set tensor4d descriptor: %s",
+		 "Could not set tensorNd descriptor: %s",
 		 cudnnGetErrorString(err));
    return -1;
  }
@@ -53,14 +63,30 @@ c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) {
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
+#if CUDNN_VERSION > 3000
+  case GA_HALF:
+    dt = CUDNN_DATA_HALF;
+    break;
+#endif
  default:
    PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_filter");
    return -1;
  }
-  cudnnStatus_t err = cudnnSetFilter4dDescriptor(
-    desc, dt,
-    PyGpuArray_DIMS(var)[0], PyGpuArray_DIMS(var)[1],
-    PyGpuArray_DIMS(var)[2], PyGpuArray_DIMS(var)[3]);
+
+  int dims[5];
+  unsigned int nd = PyGpuArray_NDIM(var);
+
+  if (nd > 5) {
+    PyErr_SetString(PyExc_TypeError, "Tensor of more than 5d");
+    return -1;
+  }
+
+  for (unsigned int _i = nd; _i > 0; _i--) {
+    unsigned int i = _i - 1;
+    dims[i] = PyGpuArray_DIM(var, i);
+  }
+
+  cudnnStatus_t err = cudnnSetFilterNdDescriptor(desc, dt, nd, dims);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
 		 "Could not set filter descriptor: %s.",
@@ -72,15 +98,23 @@ c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) {

 #section init_code

+setup_ext_cuda();
+
+#section support_code_struct
+
+cudnnHandle_t APPLY_SPECIFIC(_handle);
+
+#section init_code_struct
+
 {
+  cuda_enter(pygpu_default_context()->ctx);
  cudnnStatus_t err;
-  if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
+  APPLY_SPECIFIC(_handle) = NULL;
+  if ((err = cudnnCreate(&APPLY_SPECIFIC(_handle))) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "could not create cuDNN handle: %s",
-		 cudnnGetErrorString(err));
-#if PY_MAJOR_VERSION >= 3
-    return NULL;
-#else
-    return;
-#endif
+                 cudnnGetErrorString(err));
+    cuda_exit(pygpu_default_context()->ctx);
+    FAIL;
  }
+  cuda_exit(pygpu_default_context()->ctx);
 }
--- a/theano/sandbox/gpuarray/dnn_conv_base.c
+++ b/theano/sandbox/gpuarray/dnn_conv_base.c
@@ -10,12 +10,12 @@ APPLY_SPECIFIC(input) = NULL;
 APPLY_SPECIFIC(output) = NULL;
 APPLY_SPECIFIC(kerns) = NULL;
 if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
-  PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
+  PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
 	       "(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
 }
 if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
-  PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
+  PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
               "(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
 }

--- a/theano/sandbox/gpuarray/dnn_fwd.c
+++ b/theano/sandbox/gpuarray/dnn_fwd.c
@@ -10,14 +10,15 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
  float af = alpha, bf = beta;
  void *alpha_p;
  void *beta_p;
+  PyGpuContextObject *c = pygpu_default_context();

  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
    PyErr_SetString(PyExc_ValueError,
-		    "GpuDnnConv images and kernel must have the same stack size");
+		    "images and kernel must have the same stack size");
    return 1;
  }

-  if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1)
+  if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
    return 1;
  if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
    return 1;
@@ -28,6 +29,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
    beta_p = (void *)&beta;
    break;
  case GA_FLOAT:
+  case GA_HALF:
    alpha_p = (void *)&af;
    beta_p = (void *)&bf;
    break;
@@ -42,56 +44,179 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
  Py_INCREF(*output);
 #else
  if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om),
-                         om->ga.typecode, GA_C_ORDER,
-                         pygpu_default_context()) != 0)
+                         om->ga.typecode, GA_C_ORDER, c) != 0)
    return 1;
  if (beta != 0.0 && pygpu_move(*output, om))
    return 1;
 #endif

-  if (c_set_tensor4d(*output, APPLY_SPECIFIC(output)) == -1)
+  if (c_set_tensorNd(*output, APPLY_SPECIFIC(output)) == -1)
    return 1;

+  cudnnConvolutionFwdAlgo_t algo = CONV_ALGO;
+
+  cuda_enter(c->ctx);
+#ifdef CHOOSE_ALGO
+  /* Static variables are only initialized once so this will not
+   * reset the previous algo every time */
+  static int reuse_algo = 0;
+  static cudnnConvolutionFwdAlgo_t prev_algo = CONV_ALGO;
+
+#ifndef CHOOSE_ONCE
+  static size_t prev_img_dims[5] = {0};
+  static size_t prev_kern_dims[5] = {0};
+
+  reuse_algo = 1;
+  for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
+    reuse_algo = (reuse_algo &&
+                  PyGpuArray_DIM(input, i) == prev_img_dims[i]);
+    reuse_algo = (reuse_algo &&
+                  PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
+  }
+#endif
+
+  if (!reuse_algo) {
+#ifdef CHOOSE_TIME
+    int count;
+    cudnnConvolutionFwdAlgoPerf_t choice;
+    err = cudnnFindConvolutionForwardAlgorithm(
+      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
+      desc, APPLY_SPECIFIC(output), 1, &count, &choice);
+
+    if (err != CUDNN_STATUS_SUCCESS) {
+      PyErr_Format(PyExc_RuntimeError,
+                   "error selecting convolution algo: %s",
+                   cudnnGetErrorString(err));
+      cuda_exit(c->ctx);
+      return 1;
+    }
+    algo = choice.algo;
+#else
+    size_t free = 0, total = 0;
+    cudaError_t err2 = cudaMemGetInfo(&free, &total);
+    if (err2 != cudaSuccess) {
+      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
+                   "memory information on the GPU: %s\n",
+                   cudaGetErrorString(err2));
+      cuda_exit(c->ctx);
+      return 1;
+    }
+
+    err = cudnnGetConvolutionForwardAlgorithm(
+      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
+      desc, APPLY_SPECIFIC(output),
+      CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo);
+    if (err != CUDNN_STATUS_SUCCESS) {
+      PyErr_Format(PyExc_RuntimeError,
+                   "error selecting convolution algo: %s",
+                   cudnnGetErrorString(err));
+      cuda_exit(c->ctx);
+      return 1;
+    }
+#endif
+    prev_algo = algo;
+  } else {
+    algo = prev_algo;
+  }
+
+#ifdef CHOOSE_ONCE
+  reuse_algo = 1;
+#else
+  for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
+    prev_img_dims[i] = PyGpuArray_DIM(input, i);
+    prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
+  }
+#endif
+
+#endif
+
+  /* These two algos are not supported for 3d conv */
+  if (PyGpuArray_NDIM(input) == 5 &&
+      (algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM ||
+       algo == CUDNN_CONVOLUTION_FWD_ALGO_GEMM))
+    algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
+
+#if CUDNN_VERSION > 3000
+  if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) {
+    /* Fall back to IMPLICIT_PRECOMP_GEMM when the FFT algorithm cannot be
+     * used: non-unit strides, a spatial input size above 1024, or 1x1
+     * filters.  Spatial sizes live in dims 2 and 3 (dims 0 and 1 are
+     * compared as the stack size elsewhere in this function). */
+    int nd;
+    int pad[2];
+    int stride[2];
+    int upscale[2];
+    cudnnConvolutionMode_t mode;
+    err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
+                                          upscale, &mode);
+    if (err != CUDNN_STATUS_SUCCESS) {
+      PyErr_Format(PyExc_RuntimeError,
+                   "error getting convolution properties: %s",
+                   cudnnGetErrorString(err));
+      cuda_exit(c->ctx);
+      return 1;
+    }
+
+    if (stride[0] != 1 || stride[1] != 1 ||
+        PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
+        (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1)) {
+      algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
+    }
+  }
+#endif
+
+#if CUDNN_VERSION < 3000
+  /* cuDNN before v3 does not support kernels larger than input even
+   * if appropriate padding is selected. */
+  for (unsigned int i = 2; i < PyGpuArray_NDIM(input); i++) {
+    if (PyGpuArray_DIM(kerns, i) > PyGpuArray_DIM(input, i)) {
+      PyErr_SetString(PyExc_RuntimeError, "the current version "
+                      "of CuDNN does not support kernels larger than the "
+                      "inputs in any spatial dimension, even if the inputs "
+                      "are padded such that the padded inputs are larger "
+                      "than the kernels. Update your installation of CuDNN "
+                      "to V3 or more recent to solve the issue.");
+      cuda_exit(c->ctx);
+      return 1;
+    }
+  }
+#endif
+
  {
    size_t worksize;
    gpudata *workspace;
-    PyGpuContextObject *c;
-
-    err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
+    err = cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle),
                                                  APPLY_SPECIFIC(input),
                                                  APPLY_SPECIFIC(kerns),
                                                  desc,
                                                  APPLY_SPECIFIC(output),
-                                                  CONV_ALGO,
+                                                  algo,
                                                  &worksize);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
-                   "GpuDnnConv: error getting worksize: %s",
+                   "error getting worksize: %s",
                   cudnnGetErrorString(err));
+      cuda_exit(c->ctx);
      return 1;
    }

-    /* 
+    /*
     * This is less than ideal since we need to free it after (which
     * introduces a synchronization point. But we don't have a module
     * to place a nice get_work_mem() function in.
     */
    if (worksize != 0) {
-      c = pygpu_default_context();
      workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL);
      if (workspace == NULL) {
        PyErr_SetString(PyExc_RuntimeError,
                        "Could not allocate working memory");
+        cuda_exit(c->ctx);
        return 1;
      }
    }

    err = cudnnConvolutionForward(
-      _handle,
+      APPLY_SPECIFIC(_handle),
      alpha_p,
      APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
      APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
-      desc, CONV_ALGO,
+      desc, algo,
      worksize == 0 ? NULL : *(void **)workspace, worksize,
      beta_p,
      APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output));
@@ -99,9 +224,10 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
    if (worksize != 0)
      c->ops->buffer_release(workspace);
  }
+  cuda_exit(c->ctx);

  if (err != CUDNN_STATUS_SUCCESS) {
-    PyErr_Format(PyExc_RuntimeError, "GpuDnnConv: error doing operation: %s",
+    PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
 		 cudnnGetErrorString(err));
    return 1;
  }

--- a/theano/sandbox/gpuarray/dnn_gi.c
+++ b/theano/sandbox/gpuarray/dnn_gi.c
@@ -9,14 +9,15 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
  float af = alpha, bf = beta;
  void *alpha_p;
  void *beta_p;
+  PyGpuContextObject *c = pygpu_default_context();

  if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1]) {
-    PyErr_SetString(PyExc_ValueError,
-		    "GpuDnnConv images and kernel must have the same stack size");
+    PyErr_SetString(PyExc_ValueError, "images and kernel must have the same "
+                    "stack size");
    return 1;
  }

-  if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1)
+  if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
    return 1;
  if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
    return 1;
@@ -27,6 +28,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
    beta_p = (void *)&beta;
    break;
  case GA_FLOAT:
+  case GA_HALF:
    alpha_p = (void *)&af;
    beta_p = (void *)&bf;
    break;
@@ -41,26 +43,156 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
  Py_INCREF(*input);
 #else
  if (theano_prep_output(input, PyGpuArray_NDIM(im), PyGpuArray_DIMS(im),
-                         im->ga.typecode, GA_C_ORDER,
-                         pygpu_default_context()) != 0)
+                         im->ga.typecode, GA_C_ORDER, c) != 0)
    return 1;
  if (beta != 0.0 && pygpu_move(*input, im))
    return 1;
 #endif

-  if (c_set_tensor4d(*input, APPLY_SPECIFIC(input)) == -1)
+  if (c_set_tensorNd(*input, APPLY_SPECIFIC(input)) == -1)
    return 1;

-  err = cudnnConvolutionBackwardData(
-    _handle,
+  cudnnConvolutionBwdDataAlgo_t algo = CONV_ALGO;
+
+  cuda_enter(c->ctx);
+
+#ifdef CHOOSE_ALGO
+  static int reuse_algo = 0;
+  static cudnnConvolutionBwdDataAlgo_t prev_algo = CONV_ALGO;
+  /* prev_algo holds the last selection; reuse_algo says whether to keep it. */
+#ifndef CHOOSE_ONCE
+  static size_t prev_kern_dims[5] = {0};
+  static size_t prev_top_dims[5] = {0};
+  /* Reuse only if every kernel and output dimension matches the last call. */
+  reuse_algo = 1;
+  for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
+    reuse_algo = (reuse_algo &&
+                  PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
+    reuse_algo = (reuse_algo &&
+                  PyGpuArray_DIM(output, i) == prev_top_dims[i]);
+  }
+#endif
+
+  if (!reuse_algo) {
+#ifdef CHOOSE_TIME
+    int count;
+    cudnnConvolutionBwdDataAlgoPerf_t choice;
+    /* Backward-data takes (filter, diff, conv, grad): kerns, output, desc, input. */
+    err = cudnnFindConvolutionBackwardDataAlgorithm(
+      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
+      APPLY_SPECIFIC(input), 1, &count, &choice);
+
+    if (err != CUDNN_STATUS_SUCCESS) {
+      PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
+                   cudnnGetErrorString(err));
+      cuda_exit(c->ctx);
+      return 1;
+    }
+
+    algo = choice.algo;
+#else
+    size_t free = 0, total = 0;
+    cudaError_t err2 = cudaMemGetInfo(&free, &total);
+    if (err2 != cudaSuccess){
+      cudaGetLastError();
+      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the memory "
+                   "information on the GPU: %s\n", cudaGetErrorString(err2));
+      cuda_exit(c->ctx);
+      return 1;
+    }
+    /* Descriptor order is (filter, diff, conv, grad); free bytes cap the workspace. */
+    err = cudnnGetConvolutionBackwardDataAlgorithm(
+      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
+      desc, APPLY_SPECIFIC(input),
+      CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, free, &algo);
+    if (err != CUDNN_STATUS_SUCCESS) {
+      PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
+                   cudnnGetErrorString(err));
+      cuda_exit(c->ctx);
+      return 1;
+    }
+#endif
+    prev_algo = algo;
+  } else {
+    algo = prev_algo;
+  }
+  /* Record state for the next call: a sticky flag, or the shapes just used. */
+#ifdef CHOOSE_ONCE
+  reuse_algo = 1;
+#else
+  for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
+    prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
+    prev_top_dims[i] = PyGpuArray_DIM(output, i);
+  }
+#endif
+
+#endif
+
+#if CUDNN_VERSION > 3000
+  if (algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) {
+    int nd;
+    int pad[2];
+    int stride[2];
+    int upscale[2];
+    cudnnConvolutionMode_t mode;
+    err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
+                                          upscale, &mode);
+    if (err != CUDNN_STATUS_SUCCESS) {
+      PyErr_Format(PyExc_RuntimeError,
+                   "error getting convolution properties: %s",
+                   cudnnGetErrorString(err));
+      cuda_exit(c->ctx);
+      return 1;
+    }
+    /* Leave FFT for ALGO_0 on strided convs, 1x1 kernels or spatial axes (2, 3) > 1024. */
+    if (stride[0] != 1 || stride[1] != 1 ||
+        PyGpuArray_DIM(*input, 2) > 1024 || PyGpuArray_DIM(*input, 3) > 1024 ||
+        (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1)) {
+      algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
+    }
+  }
+#endif
+
+  size_t worksize;
+  gpudata *workspace;
+
+  err = cudnnGetConvolutionBackwardDataWorkspaceSize(
+    APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
+    APPLY_SPECIFIC(input), algo, &worksize);
+
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
+                 cudnnGetErrorString(err));
+    cuda_exit(c->ctx);
+    return 1;
+  }
+
+  if (worksize != 0) {
+    workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL);
+    if (workspace == NULL) {
+      PyErr_SetString(PyExc_RuntimeError,
+                      "Could not allocate working memory");
+      cuda_exit(c->ctx);
+      return 1;
+    }
+  }
+
+  err = cudnnConvolutionBackwardData_v3(
+    APPLY_SPECIFIC(_handle),
    alpha_p,
    APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
    APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
-    desc,
+    desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize,
    beta_p,
    APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input));
+
+  if (worksize != 0)
+    c->ops->buffer_release(workspace);
+
+  cuda_exit(c->ctx);
+
  if (err != CUDNN_STATUS_SUCCESS) {
-    PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradI: error doing operation: %s",
+    PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
                 cudnnGetErrorString(err));
    return 1;
  }

--- a/theano/sandbox/gpuarray/dnn_gw.c
+++ b/theano/sandbox/gpuarray/dnn_gw.c
 #section support_code_struct

-int 
+int
 APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
                        PyGpuArrayObject *km,
                        cudnnConvolutionDescriptor_t desc,
@@ -9,6 +9,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
  float af = alpha, bf = beta;
  void *alpha_p;
  void *beta_p;
+  PyGpuContextObject *c = pygpu_default_context();

  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1]) {
    PyErr_SetString(PyExc_ValueError,
@@ -16,9 +17,9 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
    return 1;
  }

-  if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1)
+  if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
    return 1;
-  if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1)
+  if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
    return 1;

  switch (input->ga.typecode) {
@@ -27,6 +28,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
    beta_p = (void *)&beta;
    break;
  case GA_FLOAT:
+  case GA_HALF:
    alpha_p = (void *)&af;
    beta_p = (void *)&bf;
    break;
@@ -41,8 +43,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
  Py_INCREF(*kerns);
 #else
  if (theano_prep_output(kerns, PyGpuArray_NDIM(km), PyGpuArray_DIMS(km),
-                         km->ga.typecode, GA_C_ORDER,
-                         pygpu_default_context()) != 0)
+                         km->ga.typecode, GA_C_ORDER, c) != 0)
    return 1;
  if (beta != 0.0 && pygpu_move(*kerns, km))
    return 1;
@@ -51,16 +52,148 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
  if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1)
    return 1;

-  err = cudnnConvolutionBackwardFilter(
-    _handle,
+  cudnnConvolutionBwdFilterAlgo_t algo = CONV_ALGO;
+
+  cuda_enter(c->ctx);
+
+#ifdef CHOOSE_ALGO
+  static int reuse_algo = 0;
+  static cudnnConvolutionBwdFilterAlgo_t prev_algo = CONV_ALGO;
+
+#ifndef CHOOSE_ONCE
+  static size_t prev_img_dims[5] = {0};
+  static size_t prev_top_dims[5] = {0};
+
+  reuse_algo = 1;
+  for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
+    reuse_algo = (reuse_algo &&
+                  PyGpuArray_DIM(input, i) == prev_img_dims[i]);
+    reuse_algo = (reuse_algo &&
+                  PyGpuArray_DIM(output, i) == prev_top_dims[i]);
+  }
+#endif
+
+  if (!reuse_algo) {
+#ifdef CHOOSE_TIME
+    int count;
+    cudnnConvolutionBwdFilterAlgoPerf_t choice;
+
+    err = cudnnFindConvolutionBackwardFilterAlgorithm(
+      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
+      APPLY_SPECIFIC(kerns), 1, &count, &choice);
+
+    if (err != CUDNN_STATUS_SUCCESS) {
+      PyErr_Format(PyExc_RuntimeError,
+                   "error selecting convolution algo: %s",
+                   cudnnGetErrorString(err));
+      cuda_exit(c->ctx);
+      return 1;
+    }
+
+    algo = choice.algo;
+#else
+    size_t free = 0, total = 0;
+    cudaError_t err2 = cudaMemGetInfo(&free, &total);
+    if (err2 != cudaSuccess){
+      cudaGetLastError();
+      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the memory "
+                   "information on the GPU: %s\n", cudaGetErrorString(err2));
+      cuda_exit(c->ctx);
+      return 1;
+    }
+
+    err = cudnnGetConvolutionBackwardFilterAlgorithm(
+      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
+      desc, APPLY_SPECIFIC(kerns),
+      CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo);
+    if (err != CUDNN_STATUS_SUCCESS) {
+      PyErr_Format(PyExc_RuntimeError,
+                   "error selecting convolution algo: %s",
+                   cudnnGetErrorString(err));
+      cuda_exit(c->ctx);
+      return 1;
+    }
+#endif
+    prev_algo = algo;
+  } else {
+    algo = prev_algo;
+  }
+
+#ifdef CHOOSE_ONCE
+  reuse_algo = 1;
+#else
+  for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
+    prev_img_dims[i] = PyGpuArray_DIM(input, i);
+    prev_top_dims[i] = PyGpuArray_DIM(output, i);
+  }
+#endif
+
+#endif
+
+#if CUDNN_VERSION > 3000
+  if (algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT) {
+    int nd;
+    int pad[2];
+    int stride[2];
+    int upscale[2];
+    cudnnConvolutionMode_t mode;
+    err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
+                                          upscale, &mode);
+    if (err != CUDNN_STATUS_SUCCESS) {
+      PyErr_Format(PyExc_RuntimeError,
+                   "error getting convolution properties: %s",
+                   cudnnGetErrorString(err));
+      cuda_exit(c->ctx);
+      return 1;
+    }
+    /* Leave FFT for ALGO_0 on strided convs, 1x1 kernels or spatial axes (2, 3) > 1024. */
+    if (stride[0] != 1 || stride[1] != 1 ||
+        PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
+        (PyGpuArray_DIM(*kerns, 2) == 1 && PyGpuArray_DIM(*kerns, 3) == 1)) {
+      algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
+    }
+  }
+#endif
+
+  size_t worksize;
+  gpudata *workspace;
+
+  err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
+    APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
+    APPLY_SPECIFIC(kerns), algo, &worksize);
+
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
+                 cudnnGetErrorString(err));
+      cuda_exit(c->ctx);
+    return 1;
+  }
+
+  if (worksize != 0) {
+    workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL);
+    if (workspace == NULL) {
+      PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory");
+      cuda_exit(c->ctx);
+      return 1;
+    }
+  }
+
+  err = cudnnConvolutionBackwardFilter_v3(
+    APPLY_SPECIFIC(_handle),
    alpha_p,
    APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
    APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
-    desc,
+    desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize,
    beta_p,
    APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns));
+
+  if (worksize != 0)
+    c->ops->buffer_release(workspace);
+
+  cuda_exit(c->ctx);
+
  if (err != CUDNN_STATUS_SUCCESS) {
-    PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradW: error doing operation: %s",
+    PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
                 cudnnGetErrorString(err));
    return 1;
  }

--- a/theano/sandbox/gpuarray/dnn_pool.c
+++ b/theano/sandbox/gpuarray/dnn_pool.c
+#section support_code_struct
+
+/* cuDNN tensor descriptors describing the pooling input and output. */
+cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
+cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
+
+#section init_code_struct
+
+/* Both descriptors start as NULL; the cleanup section only destroys non-NULL ones. */
+cudnnStatus_t APPLY_SPECIFIC(err);
+APPLY_SPECIFIC(input) = NULL;
+APPLY_SPECIFIC(output) = NULL;
+
+APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input));
+if (APPLY_SPECIFIC(err) != CUDNN_STATUS_SUCCESS) {
+  PyErr_Format(PyExc_MemoryError,
+               "could not allocate tensor descriptor (inp): %s",
+               cudnnGetErrorString(APPLY_SPECIFIC(err)));
+  FAIL;
+}
+
+APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output));
+if (APPLY_SPECIFIC(err) != CUDNN_STATUS_SUCCESS) {
+  PyErr_Format(PyExc_MemoryError,
+               "could not allocate tensor descriptor (out): %s",
+               cudnnGetErrorString(APPLY_SPECIFIC(err)));
+  FAIL;
+}
+
+#section cleanup_code_struct
+
+/* Destroy whichever descriptors were successfully created. */
+if (APPLY_SPECIFIC(input) != NULL)
+  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input));
+if (APPLY_SPECIFIC(output) != NULL)
+  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output));
+
+
+#section support_code_struct
+
+/*
+ * Run cudnnPoolingForward on `img` using the pooling descriptor `desc`.
+ *
+ * The window, padding and stride are read back from `desc`; the output
+ * shape is derived from them and `*out` is prepared through
+ * theano_prep_output before the cuDNN call.  Only C-contiguous inputs are
+ * accepted.  Returns 0 on success and 1 on failure.
+ */
+int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
+                             cudnnPoolingDescriptor_t desc,
+                             PyGpuArrayObject **out) {
+  PyGpuContextObject *c = pygpu_default_context();
+  cudnnStatus_t err;
+  size_t out_dims[5];
+
+  if (!GpuArray_IS_C_CONTIGUOUS(&img->ga)) {
+    PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
+    return 1;
+  }
+
+  if (c_set_tensorNd(img, APPLY_SPECIFIC(input)) != 0)
+    return 1;
+
+  /* Query up to three pooled axes from the descriptor. */
+  cudnnPoolingMode_t mode;
+  int ndims;
+  int window[3];
+  int padding[3];
+  int stride[3];
+
+  err = cudnnGetPoolingNdDescriptor(desc, 3, &mode, &ndims,
+                                    window, padding, stride);
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_RuntimeError,
+                 "error doing cudnnGetPoolingDescriptor operation: %s",
+                 cudnnGetErrorString(err));
+    return 1;
+  }
+
+  /* The first two axes are copied unchanged; pooled axes are recomputed. */
+  out_dims[0] = PyGpuArray_DIM(img, 0);
+  out_dims[1] = PyGpuArray_DIM(img, 1);
+  out_dims[2] = (PyGpuArray_DIM(img, 2) + (padding[0]*2) - window[0]) / stride[0] + 1;
+  out_dims[3] = (PyGpuArray_DIM(img, 3) + (padding[1]*2) - window[1]) / stride[1] + 1;
+  if (ndims == 3) {
+    out_dims[4] = (PyGpuArray_DIM(img, 4) + (padding[2]*2) - window[2]) / stride[2] + 1;
+  }
+
+  if (theano_prep_output(out, ndims+2, out_dims, img->ga.typecode,
+                         GA_C_ORDER, c) != 0)
+    return 1;
+
+  if (c_set_tensorNd(*out, APPLY_SPECIFIC(output)) != 0)
+    return 1;
+
+  /* Scaling factors: double for GA_DOUBLE data, float for GA_FLOAT/GA_HALF. */
+  const float one_f = 1;
+  const float zero_f = 0;
+  const double one_d = 1;
+  const double zero_d = 0;
+  void *alpha;
+  void *beta;
+
+  if (img->ga.typecode == GA_DOUBLE) {
+    alpha = (void *)&one_d;
+    beta = (void *)&zero_d;
+  } else if (img->ga.typecode == GA_FLOAT || img->ga.typecode == GA_HALF) {
+    alpha = (void *)&one_f;
+    beta = (void *)&zero_f;
+  } else {
+    PyErr_SetString(PyExc_TypeError, "Unsupported type in pooling");
+    return 1;
+  }
+
+  cuda_enter(c->ctx);
+  err = cudnnPoolingForward(APPLY_SPECIFIC(_handle), desc,
+                            alpha,
+                            APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(img),
+                            beta,
+                            APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*out));
+  cuda_exit(c->ctx);
+
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_RuntimeError,
+                 "GpuDnnPool: error doing cudnnPoolingForward operation: %s",
+                 cudnnGetErrorString(err));
+    return 1;
+  }
+  return 0;
+}
--- a/theano/sandbox/gpuarray/dnn_pool_grad.c
+++ b/theano/sandbox/gpuarray/dnn_pool_grad.c
+#section support_code_struct
+
+/* cuDNN tensor descriptors for the four arrays used by the pooling gradient. */
+cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
+cudnnTensorDescriptor_t APPLY_SPECIFIC(input_grad);
+cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
+cudnnTensorDescriptor_t APPLY_SPECIFIC(output_grad);
+
+#section init_code_struct
+
+/* All descriptors start as NULL; the cleanup section only destroys non-NULL ones. */
+APPLY_SPECIFIC(input) = NULL;
+APPLY_SPECIFIC(input_grad) = NULL;
+APPLY_SPECIFIC(output) = NULL;
+APPLY_SPECIFIC(output_grad) = NULL;
+
+{
+  cudnnStatus_t err;
+
+  err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input));
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_MemoryError,
+                 "could not allocate tensor descriptor (input): %s",
+                 cudnnGetErrorString(err));
+    FAIL;
+  }
+
+  err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input_grad));
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_MemoryError,
+                 "could not allocate tensor descriptor (input_grad): %s",
+                 cudnnGetErrorString(err));
+    FAIL;
+  }
+
+  err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output));
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_MemoryError,
+                 "could not allocate tensor descriptor (output): %s",
+                 cudnnGetErrorString(err));
+    FAIL;
+  }
+
+  err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output_grad));
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_MemoryError,
+                 "could not allocate tensor descriptor (output_grad): %s",
+                 cudnnGetErrorString(err));
+    FAIL;
+  }
+}
+
+#section cleanup_code_struct
+
+/* Destroy whichever descriptors were successfully created. */
+if (APPLY_SPECIFIC(input) != NULL)
+  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input));
+if (APPLY_SPECIFIC(input_grad) != NULL)
+  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input_grad));
+if (APPLY_SPECIFIC(output) != NULL)
+  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output));
+if (APPLY_SPECIFIC(output_grad) != NULL)
+  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output_grad));
+
+
+#section support_code_struct
+
+/*
+ * Run cudnnPoolingBackward for the pooling described by `desc`.
+ *
+ * inp:       input of the forward pooling
+ * out:       output of the forward pooling
+ * out_grad:  gradient with respect to `out`
+ * inp_grad:  receives the result; prepared with the dimensions and
+ *            typecode of `inp`
+ *
+ * All three array arguments must be C-contiguous.  Returns 0 on success
+ * and 1 on failure.
+ */
+int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
+                                  PyGpuArrayObject *out,
+                                  PyGpuArrayObject *out_grad,
+                                  cudnnPoolingDescriptor_t desc,
+                                  PyGpuArrayObject **inp_grad) {
+  cudnnStatus_t err;
+  PyGpuContextObject *c = pygpu_default_context();
+
+  if (!GpuArray_IS_C_CONTIGUOUS(&inp->ga)) {
+    PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
+    return 1;
+  }
+
+  if (!GpuArray_IS_C_CONTIGUOUS(&out_grad->ga)) {
+    PyErr_SetString(PyExc_ValueError, "Only contiguous output gradients are supported.");
+    return 1;
+  }
+
+  if (!GpuArray_IS_C_CONTIGUOUS(&out->ga)) {
+    PyErr_SetString(PyExc_ValueError, "Only contiguous outputs are supported.");
+    return 1;
+  }
+
+  if (c_set_tensorNd(inp, APPLY_SPECIFIC(input)) != 0)
+    return 1;
+  if (c_set_tensorNd(out_grad, APPLY_SPECIFIC(output_grad)) != 0)
+    return 1;
+  if (c_set_tensorNd(out, APPLY_SPECIFIC(output)) != 0)
+    return 1;
+
+  /* Reuse the context fetched above, the same one entered around the cuDNN call. */
+  if (theano_prep_output(inp_grad, PyGpuArray_NDIM(inp),
+                         PyGpuArray_DIMS(inp), inp->ga.typecode,
+                         GA_C_ORDER, c) != 0) {
+    return 1;
+  }
+
+  if (c_set_tensorNd(*inp_grad, APPLY_SPECIFIC(input_grad)) != 0)
+    return 1;
+
+  {
+    /* Scaling factors: double for GA_DOUBLE data, float for GA_FLOAT/GA_HALF. */
+    const float alphaf = 1;
+    const float betaf = 0;
+    const double alphad = 1;
+    const double betad = 0;
+    void *alpha, *beta;
+
+    switch (inp->ga.typecode) {
+    case GA_DOUBLE:
+      alpha = (void *)&alphad;
+      beta = (void *)&betad;
+      break;
+    case GA_FLOAT:
+    case GA_HALF:
+      alpha = (void *)&alphaf;
+      beta = (void *)&betaf;
+      break;
+    default:
+      PyErr_SetString(PyExc_TypeError, "Unsupported type in pooling gradient");
+      return 1;
+    }
+
+    cuda_enter(c->ctx);
+    err = cudnnPoolingBackward(
+      APPLY_SPECIFIC(_handle), desc,
+      alpha,
+      APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(out),
+      APPLY_SPECIFIC(output_grad), PyGpuArray_DEV_DATA(out_grad),
+      APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(inp),
+      beta,
+      APPLY_SPECIFIC(input_grad), PyGpuArray_DEV_DATA(*inp_grad)
+      );
+    cuda_exit(c->ctx);
+  }
+
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_RuntimeError, "error doing operation: %s.",
+                 cudnnGetErrorString(err));
+    return 1;
+  }
+  return 0;
+}
--- a/theano/sandbox/gpuarray/dnn_softmax.c
+++ b/theano/sandbox/gpuarray/dnn_softmax.c
+#section support_code_struct
+
+/* cuDNN tensor descriptors describing the softmax input and output. */
+cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
+cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
+
+#section init_code_struct
+
+/* Both descriptors start as NULL; the cleanup section only destroys non-NULL ones. */
+APPLY_SPECIFIC(input) = NULL;
+APPLY_SPECIFIC(output) = NULL;
+
+{
+  cudnnStatus_t err;
+
+  if ((err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor: %s",
+                 cudnnGetErrorString(err));
+    FAIL;
+  }
+  if ((err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor: %s",
+                 cudnnGetErrorString(err));
+    FAIL;
+  }
+}
+
+#section cleanup_code_struct
+
+/* Destroy whichever descriptors were successfully created. */
+if (APPLY_SPECIFIC(input) != NULL) {
+  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input));
+}
+if (APPLY_SPECIFIC(output) != NULL) {
+  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output));
+}
+
+
+#section support_code_struct
+
+/*
+ * Run cudnnSoftmaxForward on `x` and store the result in `*out`.
+ *
+ * `*out` is prepared with the same dimensions and typecode as `x`.
+ * SOFTMAX_ALGO and SOFTMAX_MODE are passed to cuDNN unchanged.
+ * Returns 0 on success and 1 on failure.
+ */
+int APPLY_SPECIFIC(softmax)(PyGpuArrayObject *x,
+                            PyGpuArrayObject **out) {
+  PyGpuContextObject *c = pygpu_default_context();
+  cudnnStatus_t err;
+
+  if (c_set_tensorNd(x, APPLY_SPECIFIC(input)) != 0)
+    return 1;
+
+  if (theano_prep_output(out, PyGpuArray_NDIM(x), PyGpuArray_DIMS(x),
+                         x->ga.typecode, GA_C_ORDER, c) != 0)
+    return 1;
+
+  if (c_set_tensorNd(*out, APPLY_SPECIFIC(output)) != 0)
+    return 1;
+
+  /* Scaling factors: double for GA_DOUBLE data, float for GA_FLOAT/GA_HALF. */
+  const float one_f = 1;
+  const float zero_f = 0;
+  const double one_d = 1;
+  const double zero_d = 0;
+  void *alpha;
+  void *beta;
+
+  if (x->ga.typecode == GA_DOUBLE) {
+    alpha = (void *)&one_d;
+    beta = (void *)&zero_d;
+  } else if (x->ga.typecode == GA_FLOAT || x->ga.typecode == GA_HALF) {
+    alpha = (void *)&one_f;
+    beta = (void *)&zero_f;
+  } else {
+    PyErr_SetString(PyExc_TypeError, "Unsupported type in softmax");
+    return 1;
+  }
+
+  cuda_enter(c->ctx);
+  err = cudnnSoftmaxForward(APPLY_SPECIFIC(_handle),
+                            SOFTMAX_ALGO, SOFTMAX_MODE,
+                            alpha,
+                            APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(x),
+                            beta,
+                            APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*out));
+  cuda_exit(c->ctx);
+
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_RuntimeError, "error during operation: %s",
+                 cudnnGetErrorString(err));
+    return 1;
+  }
+  return 0;
+}
--- a/theano/sandbox/gpuarray/dnn_softmax_grad.c
+++ b/theano/sandbox/gpuarray/dnn_softmax_grad.c
+#section support_code_struct
+
+/* cuDNN tensor descriptors for the three arrays used by the softmax gradient. */
+cudnnTensorDescriptor_t APPLY_SPECIFIC(dy);
+cudnnTensorDescriptor_t APPLY_SPECIFIC(sm);
+cudnnTensorDescriptor_t APPLY_SPECIFIC(dx);
+
+#section init_code_struct
+
+/* All descriptors start as NULL; the cleanup section only destroys non-NULL ones. */
+APPLY_SPECIFIC(dy) = NULL;
+APPLY_SPECIFIC(sm) = NULL;
+APPLY_SPECIFIC(dx) = NULL;
+
+{
+  cudnnStatus_t err;
+
+  if ((err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(dy))) != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor: %s",
+                 cudnnGetErrorString(err));
+    FAIL;
+  }
+  if ((err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(sm))) != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor: %s",
+                 cudnnGetErrorString(err));
+    FAIL;
+  }
+  if ((err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(dx))) != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor: %s",
+                 cudnnGetErrorString(err));
+    FAIL;
+  }
+}
+
+#section cleanup_code_struct
+
+/* Destroy whichever descriptors were successfully created. */
+if (APPLY_SPECIFIC(dy) != NULL) {
+  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(dy));
+}
+if (APPLY_SPECIFIC(sm) != NULL) {
+  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(sm));
+}
+if (APPLY_SPECIFIC(dx) != NULL) {
+  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(dx));
+}
+
+
+#section support_code_struct
+
+/*
+ * Run cudnnSoftmaxBackward from the softmax output `sm` and the incoming
+ * gradient `dy`, storing the result in `*dx`.
+ *
+ * `*dx` is prepared with the dimensions and typecode of `dy`; the scaling
+ * factor type is chosen from the typecode of `sm`.  SOFTMAX_ALGO and
+ * SOFTMAX_MODE are passed to cuDNN unchanged.  Returns 0 on success and 1
+ * on failure.
+ */
+int APPLY_SPECIFIC(softmax_grad)(PyGpuArrayObject *dy,
+                                 PyGpuArrayObject *sm,
+                                 PyGpuArrayObject **dx) {
+  PyGpuContextObject *c = pygpu_default_context();
+  cudnnStatus_t err;
+
+  if (c_set_tensorNd(dy, APPLY_SPECIFIC(dy)) != 0)
+    return 1;
+  if (c_set_tensorNd(sm, APPLY_SPECIFIC(sm)) != 0)
+    return 1;
+
+  if (theano_prep_output(dx, PyGpuArray_NDIM(dy), PyGpuArray_DIMS(dy),
+                         dy->ga.typecode, GA_C_ORDER, c) != 0)
+    return 1;
+
+  if (c_set_tensorNd(*dx, APPLY_SPECIFIC(dx)) != 0)
+    return 1;
+
+  /* Scaling factors: double for GA_DOUBLE data, float for GA_FLOAT/GA_HALF. */
+  const float one_f = 1;
+  const float zero_f = 0;
+  const double one_d = 1;
+  const double zero_d = 0;
+  void *alpha;
+  void *beta;
+
+  if (sm->ga.typecode == GA_DOUBLE) {
+    alpha = (void *)&one_d;
+    beta = (void *)&zero_d;
+  } else if (sm->ga.typecode == GA_FLOAT || sm->ga.typecode == GA_HALF) {
+    alpha = (void *)&one_f;
+    beta = (void *)&zero_f;
+  } else {
+    PyErr_SetString(PyExc_TypeError, "Unsupported type in softmax gradient");
+    return 1;
+  }
+
+  cuda_enter(c->ctx);
+  err = cudnnSoftmaxBackward(APPLY_SPECIFIC(_handle),
+                             SOFTMAX_ALGO, SOFTMAX_MODE,
+                             alpha,
+                             APPLY_SPECIFIC(sm), PyGpuArray_DEV_DATA(sm),
+                             APPLY_SPECIFIC(dy), PyGpuArray_DEV_DATA(dy),
+                             beta,
+                             APPLY_SPECIFIC(dx), PyGpuArray_DEV_DATA(*dx));
+  cuda_exit(c->ctx);
+
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_RuntimeError, "error during operation: %s",
+                 cudnnGetErrorString(err));
+    return 1;
+  }
+  return 0;
+}
--- a/theano/sandbox/gpuarray/tests/test_dnn.py
+++ b/theano/sandbox/gpuarray/tests/test_dnn.py
--- a/theano/sandbox/gpuarray/tests/test_nnet.py
+++ b/theano/sandbox/gpuarray/tests/test_nnet.py
@@ -326,7 +326,6 @@ class test_SoftMax(unittest.TestCase):
        return f, f_gpu

    def _cmp(self, n, m, f, f_gpu):
-        # print "test_softmax",n,m
        data = numpy.arange(n * m, dtype='float32').reshape(n, m)
        out = f(data)
        gout = f_gpu(data)
@@ -349,8 +348,6 @@ class test_SoftMax(unittest.TestCase):
            self._cmp
        )

-        # cuDNN R1 cannot handle these test cases but the Theano softmax can so
-        # we test them only for the Theano softmax.
        self._cmp(2 << 15, 5, f, f_gpu)

    def test_softmax_shape_0(self):