Merge pull request #4915 from abergeron/dnn_rnn2

Cudnn RNN bindings.

Merge pull request #4915 from abergeron/dnn_rnn2
3007bf79 · Pascal Lamblin · GitHub · d72325e4 · c27959ba · 3007bf79
--- a/theano/compile/tests/test_nanguardmode.py
+++ b/theano/compile/tests/test_nanguardmode.py
@@ -14,11 +14,9 @@ import theano.tensor as T
 def test_NanGuardMode():
-    """
+    # Tests if NanGuardMode is working by feeding in numpy.inf and numpy.nans
-    Tests if NanGuardMode is working by feeding in numpy.inf and numpy.nans
+    # intentionally. A working implementation should be able to capture all
-    intentionally. A working implementation should be able to capture all
+    # the abnormalities.
-    the abnormalties.
-    """
    x = T.matrix()
    w = theano.shared(numpy.random.randn(5, 7).astype(theano.config.floatX))
    y = T.dot(x, w)

--- a/theano/configparser.py
+++ b/theano/configparser.py
@@ -10,7 +10,7 @@ import sys
 import warnings
 from functools import wraps
-from six import StringIO, PY3
+from six import StringIO, PY3, iteritems
 import theano
 from theano.compat import configparser as ConfigParser
@@ -91,37 +91,44 @@ theano_raw_cfg = ConfigParser.RawConfigParser()
 theano_raw_cfg.read(config_files)
-def change_flags(**kwargs):
+class change_flags(object):
    """
-    Use this as a decorator to change the value of Theano config variable.
+    Use this as a decorator or context manager to change the value of
+    Theano config variables.
    Useful during tests.
    """
-    def change_flags_exec(f):
+    def __init__(self, **kwargs):
+        confs = dict()
+        for k in kwargs:
+            l = [v for v in theano.configparser._config_var_list
+                 if v.fullname == k]
+            assert len(l) == 1
+            confs[k] = l[0]
+        self.confs = confs
+        self.new_vals = kwargs
+    def __call__(self, f):
        @wraps(f)
-        def inner(*args, **kwargs_):
+        def res(*args, **kwargs):
-            old_val = {}
+            with self:
-            for k in kwargs:
+                return f(*args, **kwargs)
-                l = [v for v in theano.configparser._config_var_list
+        return res
-                     if v.fullname == k]
-                assert len(l) == 1
+    def __enter__(self):
-                old_val[k] = l[0].__get__(True, None)
+        self.old_vals = {}
-            try:
+        for k, v in iteritems(self.confs):
-                for k in kwargs:
+            self.old_vals[k] = v.__get__(True, None)
-                    l = [v for v in theano.configparser._config_var_list
+        try:
-                         if v.fullname == k]
+            for k, v in iteritems(self.confs):
-                    assert len(l) == 1
+                v.__set__(None, self.new_vals[k])
-                    l[0].__set__(None, kwargs[k])
+        except:
-                return f(*args, **kwargs_)
+            self.__exit__()
-            finally:
+            raise
-                for k in kwargs:
-                    l = [v for v in theano.configparser._config_var_list
+    def __exit__(self, *args):
-                         if v.fullname == k]
+        for k, v in iteritems(self.confs):
-                    assert len(l) == 1
+            v.__set__(None, self.old_vals[k])
-                    l[0].__set__(None, old_val[k])
-        return inner
-    return change_flags_exec
 def fetch_val_for_key(key, delete_key=False):

--- a/theano/gof/op.py
+++ b/theano/gof/op.py
@@ -696,6 +696,9 @@ class PureOp(object):
    # Python implementation #
    #########################
+    def L_op(self, inputs, outputs, output_grads):
+        return self.grad(inputs, output_grads)
    def R_op(self, inputs, eval_points):
        """
        This method is primarily used by tensor.Rop

--- a/theano/gof/type.py
+++ b/theano/gof/type.py
@@ -14,6 +14,7 @@ import theano
 from theano.gof import utils
 from theano.gof.utils import MethodNotDefined, object2
 from theano.gof import graph
+from theano.configparser import change_flags
 ########
 # Type #
@@ -638,6 +639,8 @@ class CDataType(Type):
        have a `void` return and take a single pointer argument.
    """
+    __props__ = ('ctype', 'freefunc', 'headers', 'header_dirs',
+                 'libraries', 'lib_dirs', 'extra_support_code')
    def __init__(self, ctype, freefunc=None, headers=None, header_dirs=None,
                 libraries=None, lib_dirs=None, extra_support_code=""):
@@ -647,42 +650,51 @@ class CDataType(Type):
            assert isinstance(freefunc, string_types)
        self.freefunc = freefunc
        if headers is None:
-            headers = []
+            headers = ()
-        self.headers = headers
+        self.headers = tuple(headers)
        if header_dirs is None:
-            header_dirs = []
+            header_dirs = ()
-        self.header_dirs = header_dirs
+        self.header_dirs = tuple(header_dirs)
        if libraries is None:
-            libraries = []
+            libraries = ()
-        self.libraries = libraries
+        self.libraries = tuple(libraries)
        if lib_dirs is None:
-            lib_dirs = []
+            lib_dirs = ()
-        self.lib_dirs = lib_dirs
+        self.lib_dirs = tuple(lib_dirs)
        self.extra_support_code = extra_support_code
        self._fn = None
-    def __eq__(self, other):
-        return (type(self) == type(other) and
-                self.ctype == other.ctype and
-                self.freefunc == other.freefunc)
-    def __hash__(self):
-        return hash((type(self), self.ctype, self.freefunc))
    def filter(self, data, strict=False, allow_downcast=None):
        if data is not None and not isinstance(data, _cdata_type):
            raise TypeError("expected None or a PyCapsule")
        return data
    def _get_func(self):
+        """
+        Return a function that makes a value from an integer.
+        The integer value is assumed to be a valid pointer for the
+        type and no check is done to ensure that.
+        """
        from theano.scalar import get_scalar_type
        if self._fn is None:
-            v = get_scalar_type('int64')()
+            with change_flags(compute_test_value='off'):
-            self._fn = theano.function([v], _make_cdata(self)(v), profile=False)
+                v = get_scalar_type('int64')()
+                self._fn = theano.function([v], _make_cdata(self)(v),
+                                           profile=False)
        return self._fn
    def make_value(self, ptr):
+        """
+        Make a value of this type.
+        Parameters
+        ----------
+        ptr : int
+            Integer representation of a valid pointer value
+        """
        return self._get_func()(ptr)
    def c_declare(self, name, sub, check_input=True):

--- a/theano/gpuarray/__init__.py
+++ b/theano/gpuarray/__init__.py
@@ -26,7 +26,7 @@ except ImportError:
 # This is for documentation not to depend on the availability of pygpu
 from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
                   GpuArraySharedVariable, gpuarray_shared_constructor,
-                   reg_context, get_context, ContextNotDefined)
+                   reg_context, get_context, ContextNotDefined, _get_props)
 from .basic_ops import as_gpuarray_variable
 from . import fft, dnn, opt, nerv, extra_ops, multinomial
@@ -89,19 +89,24 @@ def init_dev(dev, name=None):
              (name, dev, context.devname),
              file=sys.stderr)
    pygpu_activated = True
+    ctx_props = _get_props(name)
+    ctx_props['dev'] = dev
    if dev.startswith('cuda'):
-        try:
+        if 'cudnn_version' not in ctx_props:
-            cudnn_version = dnn.version()
+            try:
-            # 5200 should not print warning with cudnn 5.1 final.
+                ctx_props['cudnn_version'] = dnn.version()
-            if cudnn_version >= 5200:
+                # 5200 should not print warning with cudnn 5.1 final.
-                warnings.warn("Your cuDNN version is more recent than Theano."
+                if ctx_props['cudnn_version'] >= 5200:
-                              " If you see problems, try updating Theano or"
+                    warnings.warn("Your cuDNN version is more recent than "
-                              " downgrading cuDNN to version 5.1.")
+                                  "Theano. If you encounter problems, try "
-            if config.print_active_device:
+                                  "updating Theano or downgrading cuDNN to "
-                print("Using cuDNN version %d on context %s" %
+                                  "version 5.1.")
-                      (cudnn_version, name), file=sys.stderr)
+                if config.print_active_device:
-        except Exception:
+                    print("Using cuDNN version %d on context %s" %
-            pass
+                          (ctx_props['cudnn_version'], name), file=sys.stderr)
+                ctx_props['cudnn_handle'] = dnn._make_handle(context)
+            except Exception:
+                pass
 # This maps things like 'cuda0' to the context object on that device.
 init_dev.devmap = {}

--- a/theano/gpuarray/dnn.py
+++ b/theano/gpuarray/dnn.py
--- a/theano/gpuarray/dnn_base.c
+++ b/theano/gpuarray/dnn_base.c
@@ -149,41 +149,3 @@ static int c_make_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t *desc) {
 #section init_code
 setup_ext_cuda();
-#section support_code_struct
-PyGpuContextObject *ctx;
-cudnnHandle_t APPLY_SPECIFIC(_handle);
-#section init_code_struct
-{
-  // We need to keep a reference here to have it available in the destructor.
-  ctx = PARAMS;
-  Py_INCREF(ctx);
-  cuda_enter(PARAMS->ctx);
-  cudnnStatus_t err;
-  APPLY_SPECIFIC(_handle) = NULL;
-  if ((err = cudnnCreate(&APPLY_SPECIFIC(_handle))) != CUDNN_STATUS_SUCCESS) {
-    PyErr_Format(PyExc_RuntimeError, "could not create cuDNN handle: %s",
-                 cudnnGetErrorString(err));
-    cuda_exit(PARAMS->ctx);
-    FAIL;
-  }
-  if ((err = cudnnSetStream(APPLY_SPECIFIC(_handle),
-                            cuda_get_stream(PARAMS->ctx))) != CUDNN_STATUS_SUCCESS) {
-    PyErr_Format(PyExc_RuntimeError, "Could not set cudnn stream: %s",
-                 cudnnGetErrorString(err));
-    cuda_exit(PARAMS->ctx);
-    FAIL;
-  }
-  cuda_exit(PARAMS->ctx);
-}
-#section cleanup_code_struct
-cuda_enter(ctx->ctx);
-cudnnDestroy(APPLY_SPECIFIC(_handle));
-cuda_exit(ctx->ctx);
-Py_DECREF((PyObject *)ctx);
--- a/theano/gpuarray/dnn_batchnorm.c
+++ b/theano/gpuarray/dnn_batchnorm.c
@@ -3,7 +3,9 @@
 int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
                     PyGpuArrayObject *bias, npy_float64 epsilon,
                     PyGpuArrayObject **outp, PyGpuArrayObject **x_mean,
-                     PyGpuArrayObject **x_invstd, PyGpuContextObject *c) {
+                     PyGpuArrayObject **x_invstd, cudnnHandle_t _handle) {
+  PyGpuContextObject *c = inp->context;
  if (c_set_tensorNd(inp, bn_input) != 0)
    return 1;
  if (c_set_tensorNd(scale, bn_params) != 0)
@@ -37,7 +39,7 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
      beta = (void *)&fbeta;
    }
    cudnnStatus_t err = cudnnBatchNormalizationForwardTraining(
-      APPLY_SPECIFIC(_handle),
+      _handle,
      MODE,
      alpha,
      beta,

--- a/theano/gpuarray/dnn_batchnorm_grad.c
+++ b/theano/gpuarray/dnn_batchnorm_grad.c
@@ -24,7 +24,9 @@ int dnn_batchnorm_grad(PyGpuArrayObject *inp, PyGpuArrayObject *doutp,
                       PyGpuArrayObject *scale, PyGpuArrayObject *x_mean,
                       PyGpuArrayObject *x_invstd, npy_float64 epsilon,
                       PyGpuArrayObject **dinp, PyGpuArrayObject **dscale,
-                       PyGpuArrayObject **dbias, PyGpuContextObject *c) {
+                       PyGpuArrayObject **dbias, cudnnHandle_t _handle) {
+  PyGpuContextObject *c = inp->context;
  if (c_set_tensorNd(inp, bn_input) != 0)
    return 1;
  if (c_set_tensorNd(doutp, bn_doutput) != 0)
@@ -66,7 +68,7 @@ int dnn_batchnorm_grad(PyGpuArrayObject *inp, PyGpuArrayObject *doutp,
      betaParam = (void *)&fbeta;
    }
    cudnnStatus_t err = cudnnBatchNormalizationBackward(
-      APPLY_SPECIFIC(_handle),
+      _handle,
      MODE,
      alphaData,
      betaData,

--- a/theano/gpuarray/dnn_batchnorm_inf.c
+++ b/theano/gpuarray/dnn_batchnorm_inf.c
@@ -3,7 +3,9 @@
 int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
                     PyGpuArrayObject *bias, PyGpuArrayObject *est_mean,
                     PyGpuArrayObject *est_var, npy_float64 epsilon, 
-                     PyGpuArrayObject **outp, PyGpuContextObject *c) {
+                     PyGpuArrayObject **outp, cudnnHandle_t _handle) {
+  PyGpuContextObject *c = inp->context;
  if (c_set_tensorNd(inp, bn_input) != 0)
    return 1;
  if (c_set_tensorNd(scale, bn_params) != 0)
@@ -33,7 +35,7 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
      beta = (void *)&fbeta;
    }
    cudnnStatus_t err = cudnnBatchNormalizationForwardInference(
-      APPLY_SPECIFIC(_handle),
+      _handle,
      MODE,
      alpha,
      beta,

--- a/theano/gpuarray/dnn_dropout_desc.c
+++ b/theano/gpuarray/dnn_dropout_desc.c
+#section support_code
+/*
+ * Build a cuDNN dropout descriptor together with its random-state buffer.
+ *
+ * dropout  value forwarded to cudnnSetDropoutDescriptor.
+ * seed     seed forwarded to cudnnSetDropoutDescriptor.
+ * c        context the state buffer is allocated in.
+ * odesc    receives the new descriptor on success.
+ * ostates  receives the new state array on success; whatever it pointed to
+ *          before is overwritten without being released here.
+ * _handle  cuDNN handle used to size and initialise the state.
+ *
+ * Returns 0 on success and -1 on failure. A RuntimeError is set for the two
+ * cuDNN failures; the pygpu_empty failure path sets nothing itself
+ * (presumably pygpu_empty raises -- TODO confirm).
+ */
+int dnn_dropout_desc(float dropout, unsigned long long seed,
+                     PyGpuContextObject *c,
+                     cudnnDropoutDescriptor_t *odesc,
+                     PyGpuArrayObject **ostates,
+                     cudnnHandle_t _handle) {
+  PyGpuArrayObject *states;
+  cudnnDropoutDescriptor_t desc;
+  size_t states_sz;
+  cudnnStatus_t err;
+  /* Every exit path below pairs this with a cuda_exit(). */
+  cuda_enter(c->ctx);
+  err = cudnnCreateDropoutDescriptor(&desc);
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_SetString(PyExc_RuntimeError, "Can't create dropout descriptor");
+    cuda_exit(c->ctx);
+    return -1;
+  }
+  /* Can't fail according to docs */
+  cudnnDropoutGetStatesSize(_handle, &states_sz);
+  /* One-dimensional unsigned-byte array of states_sz elements. */
+  states = pygpu_empty(1, &states_sz, GA_UBYTE, GA_C_ORDER, c, Py_None);
+  if (states == NULL) {
+    cudnnDestroyDropoutDescriptor(desc);
+    cuda_exit(c->ctx);
+    return -1;
+  }
+  /* Attach the freshly allocated state buffer to the descriptor. */
+  err = cudnnSetDropoutDescriptor(desc, _handle, dropout,
+                                  PyGpuArray_DEV_DATA(states),
+                                  states_sz, seed);
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_SetString(PyExc_RuntimeError, "Can't set dropout descriptor");
+    Py_DECREF((PyObject *)states);
+    cudnnDestroyDropoutDescriptor(desc);
+    cuda_exit(c->ctx);
+    return -1;
+  }
+  cuda_exit(c->ctx);
+  /* Outputs are only written once everything has succeeded. */
+  *odesc = desc;
+  *ostates = states;
+  return 0;
+}
--- a/theano/gpuarray/dnn_dropout_fwd.c
+++ b/theano/gpuarray/dnn_dropout_fwd.c
+#section support_code
+/*
+ * Apply cuDNN dropout to `x`, writing the result into `*y`.
+ *
+ * x        input array; its context is used for every allocation.
+ * desc     pointer to the dropout descriptor to use
+ *          (NOTE(review): assumed to have been set up beforehand, e.g. by
+ *          dnn_dropout_desc -- confirm against the caller).
+ * state    state array, handed back unchanged through `*ostate`.
+ * y        output array, prepared with the shape and dtype of `x`.
+ * ostate   receives a new reference to `state` on success.
+ * reserve  receives the reserve-space buffer on success.
+ * _handle  cuDNN handle to run on.
+ *
+ * Returns 0 on success and -1 on failure. On failure `*ostate` and
+ * `*reserve` are left untouched and nothing allocated here is leaked. A
+ * RuntimeError is set for the failures detected in this function; the
+ * helpers c_make_tensorNd and theano_prep_output are assumed to set their
+ * own exception -- TODO confirm.
+ */
+int dnn_dropout_fwd(PyGpuArrayObject *x,
+                    cudnnDropoutDescriptor_t *desc,
+                    PyGpuArrayObject *state,
+                    PyGpuArrayObject **y,
+                    PyGpuArrayObject **ostate,
+                    gpudata **reserve,
+                    cudnnHandle_t _handle) {
+  PyGpuContextObject *c = x->context;
+  cudnnTensorDescriptor_t xdesc;
+  cudnnTensorDescriptor_t ydesc;
+  gpudata *res;
+  size_t res_sz;
+  cudnnStatus_t err;
+  if (c_make_tensorNd(x, &xdesc))
+    return -1;
+  if (theano_prep_output(y, x->ga.nd, x->ga.dimensions, x->ga.typecode,
+                         GA_C_ORDER, c)) {
+    cudnnDestroyTensorDescriptor(xdesc);
+    return -1;
+  }
+  /* `y` is an output slot; the array itself is `*y`. */
+  if (c_make_tensorNd(*y, &ydesc)) {
+    cudnnDestroyTensorDescriptor(xdesc);
+    return -1;
+  }
+  /* The reserve size is derived from the input tensor descriptor. */
+  err = cudnnDropoutGetReserveSpaceSize(xdesc, &res_sz);
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_RuntimeError,
+                 "Could not get reserve size for dropout: %s",
+                 cudnnGetErrorString(err));
+    cudnnDestroyTensorDescriptor(xdesc);
+    cudnnDestroyTensorDescriptor(ydesc);
+    return -1;
+  }
+  res = gpudata_alloc(c->ctx, res_sz, NULL, 0, NULL);
+  if (res == NULL) {
+    cudnnDestroyTensorDescriptor(xdesc);
+    cudnnDestroyTensorDescriptor(ydesc);
+    PyErr_SetString(PyExc_RuntimeError, "Could not allocate reserve for dropout");
+    return -1;
+  }
+  cuda_enter(c->ctx);
+  err = cudnnDropoutForward(_handle, *desc, xdesc, PyGpuArray_DEV_DATA(x),
+                            ydesc, PyGpuArray_DEV_DATA(*y), *(void **)res,
+                            res_sz);
+  cudnnDestroyTensorDescriptor(xdesc);
+  cudnnDestroyTensorDescriptor(ydesc);
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_RuntimeError,
+                 "Could not run dropout: %s",
+                 cudnnGetErrorString(err));
+    /* The buffer was never handed to the caller, so drop it here. */
+    gpudata_release(res);
+    cuda_exit(c->ctx);
+    return -1;
+  }
+  cuda_exit(c->ctx);
+  /*
+   * Publish the outputs only after every fallible step has succeeded, so
+   * an error path never leaks the extra reference to `state` or the
+   * reserve buffer.
+   * NOTE(review): a previous value in `*ostate` is overwritten without
+   * being released, as in the original -- confirm the caller's ownership
+   * convention for output slots.
+   */
+  Py_INCREF((PyObject *)state);
+  *ostate = state;
+  *reserve = res;
+  return 0;
+}
--- a/theano/gpuarray/dnn_fwd.c
+++ b/theano/gpuarray/dnn_fwd.c
@@ -26,11 +26,12 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
                         cudnnConvolutionDescriptor_t desc,
                         double alpha, double beta,
                         PyGpuArrayObject **output,
-                         PyGpuContextObject *c) {
+                         cudnnHandle_t _handle) {
-  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
+  PyGpuContextObject *c = input->context;
-  float af = alpha, bf = beta;
  void *alpha_p;
  void *beta_p;
+  float af = alpha, bf = beta;
+  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
    PyErr_SetString(PyExc_ValueError,
@@ -92,7 +93,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
    int count;
    cudnnConvolutionFwdAlgoPerf_t choice;
    err = cudnnFindConvolutionForwardAlgorithm(
-      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
+      _handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
      desc, APPLY_SPECIFIC(output), 1, &count, &choice);
    if (err != CUDNN_STATUS_SUCCESS) {
@@ -115,7 +116,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
    }
    err = cudnnGetConvolutionForwardAlgorithm(
-      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
+      _handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
      desc, APPLY_SPECIFIC(output),
      CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo);
    if (err != CUDNN_STATUS_SUCCESS) {
@@ -198,7 +199,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
  {
    size_t worksize;
    gpudata *workspace;
-    err = cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle),
+    err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
                                                  APPLY_SPECIFIC(input),
                                                  APPLY_SPECIFIC(kerns),
                                                  desc,
@@ -211,7 +212,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
      // TODO: Print a warning
      algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
-      err = cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle),
+      err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
                                                    APPLY_SPECIFIC(input),
                                                    APPLY_SPECIFIC(kerns),
                                                    desc,
@@ -248,7 +249,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
    cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
    err = cudnnConvolutionForward(
-      APPLY_SPECIFIC(_handle),
+      _handle,
      alpha_p,
      APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
      APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),

--- a/theano/gpuarray/dnn_gi.c
+++ b/theano/gpuarray/dnn_gi.c
@@ -25,11 +25,12 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
                        PyGpuArrayObject *im,
                        cudnnConvolutionDescriptor_t desc,
                        double alpha, double beta, PyGpuArrayObject **input,
-                        PyGpuContextObject *c) {
+                        cudnnHandle_t _handle) {
-  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
+  PyGpuContextObject *c = kerns->context;
-  float af = alpha, bf = beta;
  void *alpha_p;
  void *beta_p;
+  float af = alpha, bf = beta;
+  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
  if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1]) {
    PyErr_SetString(PyExc_ValueError, "images and kernel must have the same "
@@ -93,7 +94,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
    cudnnConvolutionBwdDataAlgoPerf_t choice;
    err = cudnnFindConvolutionBackwardDataAlgorithm(
-      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
+      _handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
      APPLY_SPECIFIC(input), 1, &count, &choice);
    if (err != CUDNN_STATUS_SUCCESS) {
@@ -116,7 +117,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
    }
    err = cudnnGetConvolutionBackwardDataAlgorithm(
-      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
+      _handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
      desc, APPLY_SPECIFIC(input),
      CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, free, &algo);
    if (err != CUDNN_STATUS_SUCCESS) {
@@ -193,7 +194,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
  gpudata *workspace;
  err = cudnnGetConvolutionBackwardDataWorkspaceSize(
-    APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
+    _handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
    APPLY_SPECIFIC(input), algo, &worksize);
  if (err != CUDNN_STATUS_SUCCESS) {
@@ -218,7 +219,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
  cuda_wait((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
  err = cudnnConvolutionBackwardData(
-    APPLY_SPECIFIC(_handle),
+    _handle,
    alpha_p,
    APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
    APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),

--- a/theano/gpuarray/dnn_gw.c
+++ b/theano/gpuarray/dnn_gw.c
@@ -25,11 +25,12 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
                        PyGpuArrayObject *km,
                        cudnnConvolutionDescriptor_t desc,
                        double alpha, double beta, PyGpuArrayObject **kerns,
-                        PyGpuContextObject *c) {
+                        cudnnHandle_t _handle) {
-  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
+  PyGpuContextObject *c = input->context;
-  float af = alpha, bf = beta;
  void *alpha_p;
  void *beta_p;
+  float af = alpha, bf = beta;
+  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
  if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1]) {
    PyErr_SetString(PyExc_ValueError,
@@ -93,7 +94,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
    cudnnConvolutionBwdFilterAlgoPerf_t choice;
    err = cudnnFindConvolutionBackwardFilterAlgorithm(
-      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
+      _handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
      APPLY_SPECIFIC(kerns), 1, &count, &choice);
    if (err != CUDNN_STATUS_SUCCESS) {
@@ -117,7 +118,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
    }
    err = cudnnGetConvolutionBackwardFilterAlgorithm(
-      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
+      _handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
      desc, APPLY_SPECIFIC(kerns),
      CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo);
    if (err != CUDNN_STATUS_SUCCESS) {
@@ -181,7 +182,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
  gpudata *workspace;
  err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
-    APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
+    _handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
    APPLY_SPECIFIC(kerns), algo, &worksize);
  if (err != CUDNN_STATUS_SUCCESS) {
@@ -205,7 +206,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
  cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
  err = cudnnConvolutionBackwardFilter(
-    APPLY_SPECIFIC(_handle),
+    _handle,
    alpha_p,
    APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
    APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),

--- a/theano/gpuarray/dnn_pool.c
+++ b/theano/gpuarray/dnn_pool.c
@@ -42,9 +42,10 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
                             PyArrayObject *stride,
                             PyArrayObject *pad,
                             PyGpuArrayObject **out,
-                             PyGpuContextObject *c) {
+                             cudnnHandle_t _handle) {
-  cudnnStatus_t err;
+  PyGpuContextObject *c = img->context;
  size_t dims[5];
+  cudnnStatus_t err;
  if (!GpuArray_IS_C_CONTIGUOUS(&img->ga)) {
    PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
@@ -122,7 +123,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
    cuda_wait((*out)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
    err = cudnnPoolingForward(
-      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(pool),
+      _handle, APPLY_SPECIFIC(pool),
      alpha,
      APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(img),
      beta,

--- a/theano/gpuarray/dnn_pool_grad.c
+++ b/theano/gpuarray/dnn_pool_grad.c
@@ -64,7 +64,8 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
                                  PyArrayObject *stride,
                                  PyArrayObject *pad,
                                  PyGpuArrayObject **inp_grad,
-                                  PyGpuContextObject *c) {
+                                  cudnnHandle_t _handle) {
+  PyGpuContextObject *c = inp->context;
  cudnnStatus_t err;
  if (!GpuArray_IS_C_CONTIGUOUS(&inp->ga)) {
@@ -153,7 +154,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
    cuda_wait((*inp_grad)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
    err = cudnnPoolingBackward(
-      APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(pool),
+      _handle, APPLY_SPECIFIC(pool),
      alpha,
      APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(out),
      APPLY_SPECIFIC(output_grad), PyGpuArray_DEV_DATA(out_grad),

--- a/theano/gpuarray/dnn_rnn_desc.c
+++ b/theano/gpuarray/dnn_rnn_desc.c
+#section support_code
+/*
+ * Create and configure a cuDNN RNN descriptor.
+ *
+ * hidden_size, num_layers  forwarded unchanged to cudnnSetRNNDescriptor.
+ * ddesc           dropout descriptor forwarded to cudnnSetRNNDescriptor.
+ * input_mode      cast to cudnnRNNInputMode_t without validation.
+ * direction_mode  cast to cudnnDirectionMode_t without validation.
+ * rnn_mode        cast to cudnnRNNMode_t without validation.
+ * dtype           GA_FLOAT, GA_DOUBLE or GA_HALF; anything else raises
+ *                 ValueError.
+ * odesc           receives the new descriptor on success.
+ * _handle         not used in this function body.
+ *
+ * Returns 0 on success, -1 with a Python exception set on failure.
+ */
+int dnn_rnn_desc(int hidden_size, int num_layers,
+                 cudnnDropoutDescriptor_t ddesc,
+                 int input_mode, int direction_mode, int rnn_mode,
+                 int dtype, cudnnRNNDescriptor_t *odesc,
+                 cudnnHandle_t _handle) {
+  cudnnRNNDescriptor_t desc;
+  cudnnDataType_t data_type;
+  cudnnStatus_t err;
+  /* Translate the gpuarray typecode into the matching cuDNN data type. */
+  switch (dtype) {
+  case GA_FLOAT:
+    data_type = CUDNN_DATA_FLOAT;
+    break;
+  case GA_DOUBLE:
+    data_type = CUDNN_DATA_DOUBLE;
+    break;
+  case GA_HALF:
+    data_type = CUDNN_DATA_HALF;
+    break;
+  default:
+    PyErr_SetString(PyExc_ValueError, "Unsupported data type");
+    return -1;
+  }
+  err = cudnnCreateRNNDescriptor(&desc);
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_SetString(PyExc_RuntimeError, "Can't create RNN descriptor");
+    return -1;
+  }
+  err = cudnnSetRNNDescriptor(desc, hidden_size, num_layers, ddesc,
+                              (cudnnRNNInputMode_t)input_mode,
+                              (cudnnDirectionMode_t)direction_mode,
+                              (cudnnRNNMode_t)rnn_mode, data_type);
+  if (err != CUDNN_STATUS_SUCCESS) {
+    /* Do not leak the descriptor that was just created. */
+    cudnnDestroyRNNDescriptor(desc);
+    PyErr_SetString(PyExc_RuntimeError, "Can't set RNN descriptor");
+    return -1;
+  }
+  /* The output is only written once configuration has succeeded. */
+  *odesc = desc;
+  return 0;
+}
--- a/theano/gpuarray/dnn_rnn_fwd.c
+++ b/theano/gpuarray/dnn_rnn_fwd.c
+#section support_code
+int dnn_rnn_fwd(cudnnRNNDescriptor_t desc,
+                PyGpuArrayObject *w, PyGpuArrayObject *x,
+                PyGpuArrayObject *hx, PyGpuArrayObject *cx,
+                gpudata **reserve, PyGpuArrayObject **y,
+                PyGpuArrayObject **hy, PyGpuArrayObject **cy,
+                cudnnHandle_t _handle) {
+  PyGpuContextObject *c = x->context;
+  cudnnTensorDescriptor_t xdesc = NULL;
+  cudnnTensorDescriptor_t hxdesc = NULL;
+  cudnnTensorDescriptor_t cxdesc = NULL;
+  cudnnTensorDescriptor_t ydesc = NULL;
+  cudnnTensorDescriptor_t hydesc = NULL;
+  cudnnTensorDescriptor_t cydesc = NULL;
+  cudnnFilterDescriptor_t wdesc = NULL;
+  cudnnTensorDescriptor_t *xl = NULL;
+  cudnnTensorDescriptor_t *yl = NULL;
+  gpudata *workspace = NULL;
+  size_t worksize, ressize;
+  size_t seqLength = PyGpuArray_DIM(x, 0);
+  size_t miniBatch = PyGpuArray_DIM(x, 1);
+  size_t inputSize = PyGpuArray_DIM(x, 2);
+  size_t hiddenSizeDir = PyGpuArray_DIM(hx, 2);
+  size_t shape[3];
+  int strs[3], dims[3];
+  cudnnStatus_t err;
+  cudnnDataType_t dt;
+  int res = -1;
+  switch (x->ga.typecode) {
+  case GA_FLOAT:
+    dt = CUDNN_DATA_FLOAT;
+    break;
+  case GA_DOUBLE:
+    dt = CUDNN_DATA_DOUBLE;
+    break;
+  case GA_HALF:
+    dt = CUDNN_DATA_HALF;
+    break;
+  default:
+    PyErr_SetString(PyExc_TypeError, "Unsupported data type for x");
+    return -1;
+  }
+  // This is early to match the exit() in the fail label.
+  cuda_enter(c->ctx);
+  err = cudnnCreateTensorDescriptor(&xdesc);
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_RuntimeError,
+                 "Could not create xdesc: %s",
+                 cudnnGetErrorString(err));
+    goto fail;
+  }
+  dims[0] = PyGpuArray_DIM(x, 1);
+  dims[1] = PyGpuArray_DIM(x, 2);
+  dims[2] = 1;
+  strs[0] = dims[1] * dims[2];
+  strs[1] = dims[2];
+  strs[2] = 1;
+  err = cudnnSetTensorNdDescriptor(xdesc, dt, 3, dims, strs);
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_RuntimeError,
+                 "Could not set xdesc: %s",
+                 cudnnGetErrorString(err));
+    goto fail;
+  }
+  if (c_make_tensorNd(hx, &hxdesc) != 0)
+    goto fail;
+  if (cx != NULL)
+    if (c_make_tensorNd(cx, &cxdesc) != 0)
+      goto fail;
+  if (c_make_filter(w, &wdesc) != 0)
+    goto fail;
+  shape[0] = seqLength;
+  shape[1] = miniBatch;
+  shape[2] = hiddenSizeDir;
+  if (theano_prep_output(y, 3, shape, x->ga.typecode, GA_C_ORDER, c) != 0)
+    goto fail;
+  err = cudnnCreateTensorDescriptor(&ydesc);
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_RuntimeError,
+                 "Could not create ydesc: %s",
+                 cudnnGetErrorString(err));
+    goto fail;
+  }
+  dims[0] = shape[1];
+  dims[1] = shape[2];
+  dims[2] = 1;
+  strs[0] = dims[2] * dims[1];
+  strs[1] = dims[2];
+  strs[2] = 1;
+  err = cudnnSetTensorNdDescriptor(ydesc, dt, 3, dims, strs);
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_RuntimeError,
+                 "Could not set ydesc: %s",
+                 cudnnGetErrorString(err));
+    goto fail;
+  }
+  if (theano_prep_output(hy, 3, PyGpuArray_DIMS(hx),
+                         hx->ga.typecode, GA_C_ORDER, c) != 0)
+    goto fail;
+  if (c_make_tensorNd(*hy, &hydesc) != 0)
+    goto fail;
+  if (cy != NULL) {
+    if (theano_prep_output(cy, 3, PyGpuArray_DIMS(cx),
+                           cx->ga.typecode, GA_C_ORDER, c) != 0)
+      goto fail;
+    if (c_make_tensorNd(*cy, &cydesc) != 0)
+      goto fail;
+  }
+  xl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), seqLength);
+  if (xl == NULL) {
+    PyErr_NoMemory();
+    goto fail;
+  }
+  for (size_t i = 0; i < seqLength; i++)
+    xl[i] = xdesc;
+  yl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), seqLength);
+  if (yl == NULL) {
+    PyErr_NoMemory();
+    goto fail;
+  }
+  for (size_t i = 0; i < seqLength; i++)
+    yl[i] = ydesc;
+  err = cudnnGetRNNWorkspaceSize(_handle, desc, (int)seqLength,
+                                 xl, &worksize);
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_RuntimeError,
+                 "Could not get worksize: %s",
+                 cudnnGetErrorString(err));
+    goto fail;
+  }
+  workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
+  if (workspace == NULL) {
+    PyErr_Format(PyExc_RuntimeError, "Could not allocate workspace");
+    goto fail;
+  }
+  err = cudnnGetRNNTrainingReserveSize(_handle, desc, (int)seqLength,
+                                       xl, &ressize);
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_RuntimeError,
+                 "Could not get reserve size: %s",
+                 cudnnGetErrorString(err));
+    goto fail;
+  }
+  *reserve = gpudata_alloc(c->ctx, ressize, NULL, 0, NULL);
+  if (*reserve == NULL) {
+    PyErr_Format(PyExc_RuntimeError, "Could not allocate reserve");
+    goto fail;
+  }
+  err = cudnnRNNForwardTraining(_handle, desc, (int)seqLength,
+                                xl, PyGpuArray_DEV_DATA(x),
+                                hxdesc, PyGpuArray_DEV_DATA(hx),
+                                cxdesc, cx ? PyGpuArray_DEV_DATA(cx) : NULL,
+                                wdesc, PyGpuArray_DEV_DATA(w),
+                                yl, PyGpuArray_DEV_DATA(*y),
+                                hydesc, PyGpuArray_DEV_DATA(*hy),
+                                cydesc, cy ? PyGpuArray_DEV_DATA(*cy) : NULL,
+                                *(void **)workspace, worksize,
+                                *(void **)(*reserve), ressize);
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_RuntimeError,
+                 "Could run RNN: %s",
+                 cudnnGetErrorString(err));
+    goto fail;
+  }
+  res = 0;
+ fail:
+  if (xdesc != NULL)
+    cudnnDestroyTensorDescriptor(xdesc);
+  if (hxdesc != NULL)
+    cudnnDestroyTensorDescriptor(hxdesc);
+  if (cxdesc != NULL)
+    cudnnDestroyTensorDescriptor(cxdesc);
+  if (wdesc != NULL)
+    cudnnDestroyFilterDescriptor(wdesc);
+  if (ydesc != NULL)
+    cudnnDestroyTensorDescriptor(ydesc);
+  if (hydesc != NULL)
+    cudnnDestroyTensorDescriptor(hydesc);
+  if (cydesc != NULL)
+    cudnnDestroyTensorDescriptor(cydesc);
+  free(xl);
+  free(yl);
+  if (workspace != NULL)
+    gpudata_release(workspace);
+  cuda_exit(c->ctx);
+  return res;
+}
--- a/theano/gpuarray/dnn_rnn_gi.c
+++ b/theano/gpuarray/dnn_rnn_gi.c
+#section support_code
+/*
+ * Gradient of a cuDNN RNN with respect to its input sequence and its
+ * initial state(s), computed with cudnnRNNBackwardData().
+ *
+ *   desc      RNN descriptor, handed unchanged to every cuDNN RNN call.
+ *   xshp      Size of the last dimension of the input; becomes dx.shape[2].
+ *   y         Forward output.  Its typecode selects the cuDNN data type and
+ *             its first two dimensions are used as sequence length and
+ *             minibatch size.
+ *   dy        Gradient on y; described with the same descriptors as y.
+ *   w         Weights, described through c_make_filter().
+ *   hx        Initial hidden state; *dhx is allocated with its shape.
+ *   reserve   Reserve buffer (presumably the one produced by the forward
+ *             training call -- confirm against the caller).  It is copied
+ *             into a freshly allocated *oreserve and that copy is what
+ *             cuDNN receives.
+ *   cx        Initial cell state, or NULL.  *dcx is only produced when cx
+ *             is given.
+ *   dhy, dcy  Gradients on the final hidden / cell state; each may be NULL.
+ *   oreserve  (out) copy of `reserve` that was passed to cuDNN.
+ *   dx        (out) gradient on the input, shape (y.shape[0], y.shape[1], xshp).
+ *   dhx       (out) gradient on hx.
+ *   dcx       (out) gradient on cx; only written when cx != NULL.
+ *   _handle   cuDNN handle used for all cuDNN RNN calls.
+ *
+ * Returns 0 on success and -1 on failure.  Every failure detected directly
+ * in this function sets a Python exception; the c_make_* and
+ * theano_prep_output helpers are expected to set their own.
+ */
+int dnn_rnn_gi(cudnnRNNDescriptor_t desc, npy_uint64 xshp,
+               PyGpuArrayObject *y, PyGpuArrayObject *dy,
+               PyGpuArrayObject *w, PyGpuArrayObject *hx,
+               gpudata *reserve, PyGpuArrayObject *cx,
+               PyGpuArrayObject *dhy, PyGpuArrayObject *dcy,
+               gpudata **oreserve, PyGpuArrayObject **dx,
+               PyGpuArrayObject **dhx, PyGpuArrayObject **dcx,
+               cudnnHandle_t _handle) {
+  PyGpuContextObject *c = y->context;
+  /* Everything released under `fail` starts out NULL so that the cleanup
+     code can tell what was actually created. */
+  cudnnTensorDescriptor_t ydesc = NULL;
+  cudnnTensorDescriptor_t dhydesc = NULL;
+  cudnnTensorDescriptor_t dcydesc = NULL;
+  cudnnFilterDescriptor_t wdesc = NULL;
+  cudnnTensorDescriptor_t hxdesc = NULL;
+  cudnnTensorDescriptor_t cxdesc = NULL;
+  cudnnTensorDescriptor_t dxdesc = NULL;
+  cudnnTensorDescriptor_t dhxdesc = NULL;
+  cudnnTensorDescriptor_t dcxdesc = NULL;
+  cudnnTensorDescriptor_t *yl = NULL;
+  cudnnTensorDescriptor_t *dxl = NULL;
+  gpudata *workspace = NULL;
+  size_t worksize, ressize;
+  size_t seqLength = PyGpuArray_DIM(y, 0);
+  size_t miniBatch = PyGpuArray_DIM(y, 1);
+  size_t inputSize = xshp;
+  size_t shape[3];
+  int dims[3], strs[3];
+  cudnnStatus_t err;
+  cudnnDataType_t dt;
+  int res = -1;
+  /* Nothing has been acquired yet, so an unsupported type can return
+     directly instead of going through `fail`. */
+  switch (y->ga.typecode) {
+  case GA_FLOAT:
+    dt = CUDNN_DATA_FLOAT;
+    break;
+  case GA_DOUBLE:
+    dt = CUDNN_DATA_DOUBLE;
+    break;
+  case GA_HALF:
+    dt = CUDNN_DATA_HALF;
+    break;
+  default:
+    PyErr_SetString(PyExc_TypeError, "Unsupported data type for y");
+    return -1;
+  }
+  /* Entered here so that it pairs with the cuda_exit() under `fail`. */
+  cuda_enter(c->ctx);
+  err = cudnnCreateTensorDescriptor(&ydesc);
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_RuntimeError,
+                 "Could not create ydesc: %s",
+                 cudnnGetErrorString(err));
+    goto fail;
+  }
+  /* We need to use the last two dimensions for this, this is not a typo */
+  dims[0] = PyGpuArray_DIM(y, 1);
+  dims[1] = PyGpuArray_DIM(y, 2);
+  dims[2] = 1;
+  strs[0] = dims[2] * dims[1];
+  strs[1] = dims[2];
+  strs[2] = 1;
+  err = cudnnSetTensorNdDescriptor(ydesc, dt, 3, dims, strs);
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_RuntimeError,
+                 "Could not set ydesc: %s",
+                 cudnnGetErrorString(err));
+    goto fail;
+  }
+  /* Optional inputs keep a NULL descriptor when they are absent. */
+  if (dhy != NULL)
+    if (c_make_tensorNd(dhy, &dhydesc) != 0)
+      goto fail;
+  if (dcy != NULL)
+    if (c_make_tensorNd(dcy, &dcydesc) != 0)
+      goto fail;
+  if (c_make_filter(w, &wdesc) != 0)
+    goto fail;
+  if (c_make_tensorNd(hx, &hxdesc) != 0)
+    goto fail;
+  if (cx != NULL)
+    if (c_make_tensorNd(cx, &cxdesc) != 0)
+      goto fail;
+  shape[0] = seqLength;
+  shape[1] = miniBatch;
+  shape[2] = inputSize;
+  if (theano_prep_output(dx, 3, shape, y->ga.typecode, GA_C_ORDER, c) != 0)
+    goto fail;
+  err = cudnnCreateTensorDescriptor(&dxdesc);
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_RuntimeError,
+                 "Could not create dxdesc: %s",
+                 cudnnGetErrorString(err));
+    goto fail;
+  }
+  /* Again not a typo, we need to use the last two dimensions */
+  dims[0] = shape[1];
+  dims[1] = shape[2];
+  dims[2] = 1;
+  strs[0] = dims[2] * dims[1];
+  strs[1] = dims[2];
+  strs[2] = 1;
+  err = cudnnSetTensorNdDescriptor(dxdesc, dt, 3, dims, strs);
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_RuntimeError,
+                 "Could not set dxdesc: %s",
+                 cudnnGetErrorString(err));
+    goto fail;
+  }
+  if (theano_prep_output(dhx, 3, PyGpuArray_DIMS(hx), hx->ga.typecode,
+                         GA_C_ORDER, c) != 0)
+    goto fail;
+  if (c_make_tensorNd(*dhx, &dhxdesc) != 0)
+    goto fail;
+  if (cx != NULL) {
+    if (theano_prep_output(dcx, 3, PyGpuArray_DIMS(cx), cx->ga.typecode,
+                           GA_C_ORDER, c) != 0)
+      goto fail;
+    if (c_make_tensorNd(*dcx, &dcxdesc) != 0)
+      goto fail;
+  }
+  /* The cuDNN RNN calls take one descriptor per time step; every step
+     shares the same descriptor here. */
+  yl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), seqLength);
+  if (yl == NULL) {
+    PyErr_NoMemory();
+    goto fail;
+  }
+  for (size_t i = 0; i < seqLength; i++)
+    yl[i] = ydesc;
+  dxl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), seqLength);
+  if (dxl == NULL) {
+    PyErr_NoMemory();
+    goto fail;
+  }
+  for (size_t i = 0; i < seqLength; i++)
+    dxl[i] = dxdesc;
+  err = cudnnGetRNNWorkspaceSize(_handle, desc, (int)seqLength, dxl, &worksize);
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_RuntimeError,
+                 "Could not get worksize: %s",
+                 cudnnGetErrorString(err));
+    goto fail;
+  }
+  workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
+  if (workspace == NULL) {
+    PyErr_Format(PyExc_RuntimeError, "Could not allocate workspace");
+    goto fail;
+  }
+  err = cudnnGetRNNTrainingReserveSize(_handle, desc, (int)seqLength,
+                                       dxl, &ressize);
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_RuntimeError,
+                 "Could not get reserve size: %s",
+                 cudnnGetErrorString(err));
+    goto fail;
+  }
+  /* cuDNN gets a private copy of the reserve buffer, so the `reserve`
+     input itself is never handed to the backward call. */
+  *oreserve = gpudata_alloc(c->ctx, ressize, NULL, 0, NULL);
+  if (*oreserve == NULL) {
+    PyErr_Format(PyExc_RuntimeError, "Could not allocate reserve");
+    goto fail;
+  }
+  if (gpudata_move(*oreserve, 0, reserve, 0, ressize) != GA_NO_ERROR) {
+    PyErr_SetString(PyExc_RuntimeError, "could not copy reserve");
+    goto fail;
+  }
+  /* NOTE(review): `*(void **)buf` reads the raw device pointer out of a
+     gpudata; this presumably relies on the pointer being the first member
+     of the CUDA gpudata struct -- confirm against libgpuarray. */
+  err = cudnnRNNBackwardData(_handle, desc, (int)seqLength,
+                             yl, PyGpuArray_DEV_DATA(y),
+                             /* y and dy are the same shape */
+                             yl, PyGpuArray_DEV_DATA(dy),
+                             dhydesc, dhy ? PyGpuArray_DEV_DATA(dhy) : NULL,
+                             dcydesc, dcy ? PyGpuArray_DEV_DATA(dcy) : NULL,
+                             wdesc, PyGpuArray_DEV_DATA(w),
+                             hxdesc, PyGpuArray_DEV_DATA(hx),
+                             cxdesc, cx ? PyGpuArray_DEV_DATA(cx) : NULL,
+                             dxl, PyGpuArray_DEV_DATA(*dx),
+                             dhxdesc, PyGpuArray_DEV_DATA(*dhx),
+                             /* *dcx and dcxdesc are only set up when cx
+                                is given, so key the pointer on cx too. */
+                             dcxdesc, cx ? PyGpuArray_DEV_DATA(*dcx) : NULL,
+                             *(void **)workspace, worksize,
+                             *(void **)(*oreserve), ressize);
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_RuntimeError,
+                 "Could not run RNN grad inputs: %s",
+                 cudnnGetErrorString(err));
+    goto fail;
+  }
+  res = 0;
+fail:
+  /* Shared exit path for success and failure: release every descriptor
+     that was created, the descriptor lists and the workspace.  The output
+     arrays and *oreserve are left to the caller. */
+  if (ydesc != NULL)
+    cudnnDestroyTensorDescriptor(ydesc);
+  if (dhydesc != NULL)
+    cudnnDestroyTensorDescriptor(dhydesc);
+  if (dcydesc != NULL)
+    cudnnDestroyTensorDescriptor(dcydesc);
+  if (wdesc != NULL)
+    cudnnDestroyFilterDescriptor(wdesc);
+  if (hxdesc != NULL)
+    cudnnDestroyTensorDescriptor(hxdesc);
+  if (cxdesc != NULL)
+    cudnnDestroyTensorDescriptor(cxdesc);
+  if (dxdesc != NULL)
+    cudnnDestroyTensorDescriptor(dxdesc);
+  if (dhxdesc != NULL)
+    cudnnDestroyTensorDescriptor(dhxdesc);
+  if (dcxdesc != NULL)
+    cudnnDestroyTensorDescriptor(dcxdesc);
+  free(yl);
+  free(dxl);
+  if (workspace != NULL)
+    gpudata_release(workspace);
+  cuda_exit(c->ctx);
+  return res;
+}
--- a/theano/gpuarray/dnn_rnn_gw.c
+++ b/theano/gpuarray/dnn_rnn_gw.c
+#section support_code
+/*
+ * Gradient of a cuDNN RNN with respect to its weights, computed with
+ * cudnnRNNBackwardWeights().
+ *
+ *   desc     RNN descriptor, handed unchanged to every cuDNN RNN call.
+ *   _wsize   Number of elements of the 1-d weight gradient *dw.
+ *   x        Input sequence.  Its typecode selects the cuDNN data type and
+ *            its first dimension is the number of time steps.
+ *   hx       Initial hidden state.
+ *   y        Forward output for the same sequence.
+ *   reserve  Reserve buffer passed straight to cuDNN (presumably the one
+ *            returned by the grad-inputs op -- confirm against the caller).
+ *   dw       (out) weight gradient; allocated here and zeroed before the
+ *            cuDNN call.
+ *   _handle  cuDNN handle used for all cuDNN RNN calls.
+ *
+ * Returns 0 on success and -1 on failure.  Every failure detected directly
+ * in this function sets a Python exception; the c_make_* and
+ * theano_prep_output helpers are expected to set their own.
+ */
+int dnn_rnn_gw(cudnnRNNDescriptor_t desc, npy_uint64 _wsize,
+               PyGpuArrayObject *x, PyGpuArrayObject *hx,
+               PyGpuArrayObject *y, gpudata *reserve,
+               PyGpuArrayObject **dw, cudnnHandle_t _handle) {
+  PyGpuContextObject *c = x->context;
+  /* Everything released under `fail` starts out NULL so that the cleanup
+     code can tell what was actually created. */
+  cudnnTensorDescriptor_t xdesc = NULL;
+  cudnnTensorDescriptor_t hxdesc = NULL;
+  cudnnTensorDescriptor_t ydesc = NULL;
+  cudnnFilterDescriptor_t dwdesc = NULL;
+  cudnnTensorDescriptor_t *xl = NULL;
+  cudnnTensorDescriptor_t *yl = NULL;
+  gpudata *workspace = NULL;
+  size_t worksize, ressize;
+  size_t iters = PyGpuArray_DIM(x, 0);
+  size_t wsize = _wsize;
+  int dims[3], strs[3];
+  cudnnStatus_t err;
+  cudnnDataType_t dt;
+  int res = -1;
+  switch (x->ga.typecode) {
+  case GA_FLOAT:
+    dt = CUDNN_DATA_FLOAT;
+    break;
+  case GA_DOUBLE:
+    dt = CUDNN_DATA_DOUBLE;
+    break;
+  case GA_HALF:
+    dt = CUDNN_DATA_HALF;
+    break;
+  default:
+    PyErr_SetString(PyExc_TypeError, "Unsupported data type for x");
+    return -1;
+  }
+  // This is early to match the exit() in the fail label.
+  cuda_enter(c->ctx);
+  err = cudnnCreateTensorDescriptor(&xdesc);
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_RuntimeError,
+                 "Could not create xdesc: %s",
+                 cudnnGetErrorString(err));
+    goto fail;
+  }
+  /* We need to use the last two dimensions for this, this is not a typo */
+  dims[0] = PyGpuArray_DIM(x, 1);
+  dims[1] = PyGpuArray_DIM(x, 2);
+  dims[2] = 1;
+  strs[0] = dims[2] * dims[1];
+  strs[1] = dims[2];
+  strs[2] = 1;
+  err = cudnnSetTensorNdDescriptor(xdesc, dt, 3, dims, strs);
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_RuntimeError,
+                 "Could not set xdesc: %s",
+                 cudnnGetErrorString(err));
+    goto fail;
+  }
+  if (c_make_tensorNd(hx, &hxdesc) != 0)
+    goto fail;
+  err = cudnnCreateTensorDescriptor(&ydesc);
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_RuntimeError,
+                 "Could not create ydesc: %s",
+                 cudnnGetErrorString(err));
+    goto fail;
+  }
+  /* Again not a typo, we need to use the last two dimensions */
+  dims[0] = PyGpuArray_DIM(y, 1);
+  dims[1] = PyGpuArray_DIM(y, 2);
+  dims[2] = 1;
+  strs[0] = dims[2] * dims[1];
+  strs[1] = dims[2];
+  strs[2] = 1;
+  err = cudnnSetTensorNdDescriptor(ydesc, dt, 3, dims, strs);
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_RuntimeError,
+                 "Could not set ydesc: %s",
+                 cudnnGetErrorString(err));
+    goto fail;
+  }
+  if (theano_prep_output(dw, 1, &wsize, x->ga.typecode, GA_C_ORDER, c) != 0)
+    goto fail;
+  /* dw must start from zero (presumably because cuDNN accumulates into it
+     -- see the cudnnRNNBackwardWeights documentation).  A failed memset
+     would silently yield a wrong gradient, so treat it as an error. */
+  if (GpuArray_memset(&(*dw)->ga, 0) != GA_NO_ERROR) {
+    PyErr_SetString(PyExc_RuntimeError, "Could not zero the dw buffer");
+    goto fail;
+  }
+  if (c_make_filter(*dw, &dwdesc) != 0)
+    goto fail;
+  /* The cuDNN RNN calls take one descriptor per time step; every step
+     shares the same descriptor here. */
+  xl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), iters);
+  if (xl == NULL) {
+    PyErr_NoMemory();
+    goto fail;
+  }
+  for (size_t i = 0; i < iters; i++)
+    xl[i] = xdesc;
+  yl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), iters);
+  if (yl == NULL) {
+    PyErr_NoMemory();
+    goto fail;
+  }
+  for (size_t i = 0; i < iters; i++)
+    yl[i] = ydesc;
+  err = cudnnGetRNNWorkspaceSize(_handle, desc, (int)iters,
+                                 xl, &worksize);
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_RuntimeError,
+                 "Could not get worksize: %s",
+                 cudnnGetErrorString(err));
+    goto fail;
+  }
+  workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
+  if (workspace == NULL) {
+    PyErr_Format(PyExc_RuntimeError, "Could not allocate workspace");
+    goto fail;
+  }
+  /* Only the size is queried here: the reserve buffer itself is the one
+     supplied by the caller. */
+  err = cudnnGetRNNTrainingReserveSize(_handle, desc, (int)iters,
+                                       xl, &ressize);
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_RuntimeError,
+                 "Could not get reserve size: %s",
+                 cudnnGetErrorString(err));
+    goto fail;
+  }
+  /* NOTE(review): `*(void **)buf` reads the raw device pointer out of a
+     gpudata; this presumably relies on the pointer being the first member
+     of the CUDA gpudata struct -- confirm against libgpuarray. */
+  err = cudnnRNNBackwardWeights(_handle, desc, (int)iters,
+                                xl, PyGpuArray_DEV_DATA(x),
+                                hxdesc, PyGpuArray_DEV_DATA(hx),
+                                yl, PyGpuArray_DEV_DATA(y),
+                                *(void **)workspace, worksize,
+                                dwdesc, PyGpuArray_DEV_DATA(*dw),
+                                *(void **)reserve, ressize);
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_RuntimeError,
+                 "Could not run RNN grad weights: %s",
+                 cudnnGetErrorString(err));
+    goto fail;
+  }
+  res = 0;
+fail:
+  /* Shared exit path for success and failure: release every descriptor
+     that was created, the descriptor lists and the workspace. */
+  if (xdesc != NULL)
+    cudnnDestroyTensorDescriptor(xdesc);
+  if (hxdesc != NULL)
+    cudnnDestroyTensorDescriptor(hxdesc);
+  if (ydesc != NULL)
+    cudnnDestroyTensorDescriptor(ydesc);
+  if (dwdesc != NULL)
+    cudnnDestroyFilterDescriptor(dwdesc);
+  free(xl);
+  free(yl);
+  if (workspace != NULL)
+    gpudata_release(workspace);
+  cuda_exit(c->ctx);
+  return res;
+}
--- a/theano/gpuarray/dnn_rnn_paramsize.c
+++ b/theano/gpuarray/dnn_rnn_paramsize.c
+#section support_code
+/*
+ * Query cuDNN for the size of the parameter buffer of an RNN.
+ *
+ *   desc         RNN descriptor to query.
+ *   isize        1-d array of length two; its two entries become the first
+ *                two dimensions of the temporary input descriptor.  The
+ *                entries are read as npy_uint64 -- assumes the array has
+ *                that dtype, TODO confirm against the caller.
+ *   typecode     gpuarray typecode (GA_FLOAT, GA_DOUBLE or GA_HALF).
+ *   oparam_size  (out) value reported by cudnnGetRNNParamsSize().
+ *   _handle      cuDNN handle.
+ *
+ * Returns 0 on success, -1 with a Python exception set on failure.  The
+ * temporary tensor descriptor is destroyed on every path once it exists.
+ */
+int dnn_rnn_paramsize(cudnnRNNDescriptor_t desc,
+                      PyArrayObject *isize,
+                      npy_int32 typecode,
+                      npy_uint64 *oparam_size,
+                      cudnnHandle_t _handle) {
+  cudnnTensorDescriptor_t xdesc = NULL;
+  size_t param_size;
+  cudnnStatus_t err;
+  cudnnDataType_t dt;
+  int shape[3];
+  int strides[3];
+  int res = -1;
+  /* Argument checks come first: nothing is allocated yet, so these can
+     return directly. */
+  if (PyArray_DIM(isize, 0) != 2) {
+    PyErr_SetString(PyExc_ValueError, "input_size should be of length two");
+    return -1;
+  }
+  switch (typecode) {
+  case GA_FLOAT:
+    dt = CUDNN_DATA_FLOAT;
+    break;
+  case GA_DOUBLE:
+    dt = CUDNN_DATA_DOUBLE;
+    break;
+  case GA_HALF:
+    dt = CUDNN_DATA_HALF;
+    break;
+  default:
+    PyErr_SetString(PyExc_ValueError, "Unsupported data type");
+    return -1;
+  }
+  err = cudnnCreateTensorDescriptor(&xdesc);
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_SetString(PyExc_RuntimeError, "Could not create tensor descriptor");
+    return -1;
+  }
+  /* From here on xdesc is live: every exit goes through `done` so that
+     it is destroyed. */
+  shape[0] = *(npy_uint64 *)PyArray_GETPTR1(isize, 0);
+  shape[1] = *(npy_uint64 *)PyArray_GETPTR1(isize, 1);
+  shape[2] = 1;
+  strides[0] = shape[2] * shape[1];
+  strides[1] = shape[2];
+  strides[2] = 1;
+  err = cudnnSetTensorNdDescriptor(xdesc, dt, 3, shape, strides);
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_Format(PyExc_RuntimeError, "Could not set tensor descriptor: %s",
+                 cudnnGetErrorString(err));
+    goto done;
+  }
+  err = cudnnGetRNNParamsSize(_handle, desc, xdesc, &param_size, dt);
+  if (err != CUDNN_STATUS_SUCCESS) {
+    PyErr_SetString(PyExc_RuntimeError, "Could not get parameter size");
+    goto done;
+  }
+  *oparam_size = param_size;
+  res = 0;
+done:
+  cudnnDestroyTensorDescriptor(xdesc);
+  return res;
+}
--- a/theano/gpuarray/dnn_softmax.c
+++ b/theano/gpuarray/dnn_softmax.c
@@ -35,7 +35,8 @@ if (APPLY_SPECIFIC(output) != NULL)
 int APPLY_SPECIFIC(softmax)(PyGpuArrayObject *x,
                            PyGpuArrayObject **out,
-                            PyGpuContextObject *c) {
+                            cudnnHandle_t _handle) {
+  PyGpuContextObject *c = x->context;
  cudnnStatus_t err;
  if (c_set_tensorNd(x, APPLY_SPECIFIC(input)) != 0)
@@ -77,7 +78,7 @@ int APPLY_SPECIFIC(softmax)(PyGpuArrayObject *x,
    cuda_wait((*out)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
    err = cudnnSoftmaxForward(
-      APPLY_SPECIFIC(_handle),
+      _handle,
      SOFTMAX_ALGO,
      SOFTMAX_MODE,
      alpha,

--- a/theano/gpuarray/dnn_softmax_grad.c
+++ b/theano/gpuarray/dnn_softmax_grad.c
@@ -46,7 +46,8 @@ if (APPLY_SPECIFIC(dx) != NULL)
 int APPLY_SPECIFIC(softmax_grad)(PyGpuArrayObject *dy,
                                 PyGpuArrayObject *sm,
                                 PyGpuArrayObject **dx,
-                                 PyGpuContextObject *c) {
+                                 cudnnHandle_t _handle) {
+  PyGpuContextObject *c = dy->context;
  cudnnStatus_t err;
  if (c_set_tensorNd(dy, APPLY_SPECIFIC(dy)) != 0)
@@ -91,7 +92,7 @@ int APPLY_SPECIFIC(softmax_grad)(PyGpuArrayObject *dy,
    cuda_wait((*dx)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
    err = cudnnSoftmaxBackward(
-      APPLY_SPECIFIC(_handle),
+      _handle,
      SOFTMAX_ALGO,
      SOFTMAX_MODE,
      alpha,

--- a/theano/gpuarray/tests/rnn_support.py
+++ b/theano/gpuarray/tests/rnn_support.py
+from __future__ import absolute_import, print_function, division
+import theano
+import theano.tensor as T
+import numpy
+class Model(object):
+    def __init__(self, name=""):
+        self.name = name
+        self.layers = []
+        self.params = []
+        self.other_updates = {}
+    def add_layer(self, layer):
+        self.layers.append(layer)
+        for p in layer.params:
+            self.params.append(p)
+        if hasattr(layer, 'other_updates'):
+            for y in layer.other_updates:
+                self.other_updates[y[0]] = y[1]
+    def get_params(self):
+        return self.params
+def uniform(stdev, size):
+    """uniform distribution with the given stdev and size"""
+    return numpy.random.uniform(
+        low=-stdev * numpy.sqrt(3),
+        high=stdev * numpy.sqrt(3),
+        size=size
+    ).astype(theano.config.floatX)
+def linear_transform_weights(input_dim, output_dim,
+                             param_list=None, name=""):
+    "theano shared variable given input and output dimension"
+    weight_inialization = uniform(numpy.sqrt(2.0 / input_dim),
+                                  (input_dim, output_dim))
+    W = theano.shared(weight_inialization, name=name)
+    assert(param_list is not None)
+    param_list.append(W)
+    return W
+def bias_weights(length, param_list=None, name=""):
+    "theano shared variable for bias unit, given length"
+    bias_initialization = numpy.zeros(length).astype(theano.config.floatX)
+    bias = theano.shared(
+        bias_initialization,
+        name=name
+        )
+    if param_list is not None:
+        param_list.append(bias)
+    return bias
+class Layer(object):
+    '''Generic Layer Template which all layers should inherit'''
+    def __init__(self, name=""):
+        self.name = name
+        self.params = []
+    def get_params(self):
+        return self.params
+class GRU(Layer):
+    def __init__(self, input_dim, output_dim, input_layer, s0=None, name=""):
+        '''Layers information'''
+        self.name = name
+        self.input_dim = input_dim
+        self.hidden_dim = output_dim
+        self.output_dim = output_dim
+        self.input_layer = input_layer
+        self.X = input_layer.output()
+        self.s0 = s0
+        self.params = []
+        '''Layers weights'''
+        '''self.params is passed so that any paramters could be appended to it'''
+        self.W_r = linear_transform_weights(input_dim, output_dim, param_list=self.params, name=name + ".W_r")
+        self.b_wr = bias_weights((output_dim,), param_list=self.params, name=name + ".b_wr")
+        self.W_i = linear_transform_weights(input_dim, output_dim, param_list=self.params, name=name + ".W_i")
+        self.b_wi = bias_weights((output_dim,), param_list=self.params, name=name + ".b_wi")
+        self.W_h = linear_transform_weights(input_dim, output_dim, param_list=self.params, name=name + ".W_h")
+        self.b_wh = bias_weights((output_dim,), param_list=self.params, name=name + ".b_wh")
+        self.R_r = linear_transform_weights(output_dim, output_dim, param_list=self.params, name=name + ".R_r")
+        self.b_rr = bias_weights((output_dim,), param_list=self.params, name=name + ".b_rr")
+        self.R_i = linear_transform_weights(output_dim, output_dim, param_list=self.params, name=name + ".R_i")
+        self.b_ru = bias_weights((output_dim,), param_list=self.params, name=name + ".b_ru")
+        self.R_h = linear_transform_weights(output_dim, output_dim, param_list=self.params, name=name + ".R_h")
+        self.b_rh = bias_weights((output_dim,), param_list=self.params, name=name + ".b_rh")
+        '''step through processed input to create output'''
+        def step(inp, s_prev):
+            i_t = T.nnet.sigmoid(
+                T.dot(inp, self.W_i) + T.dot(s_prev, self.R_i) + self.b_wi + self.b_ru)
+            r_t = T.nnet.sigmoid(
+                T.dot(inp, self.W_r) + T.dot(s_prev, self.R_r) + self.b_wr + self.b_rr)
+            h_hat_t = T.tanh(
+                T.dot(inp, self.W_h) + (r_t * (T.dot(s_prev, self.R_h) + self.b_rh)) + self.b_wh)
+            s_curr = ((1.0 - i_t) * h_hat_t) + (i_t * s_prev)
+            return s_curr
+        outputs_info = self.s0
+        states, updates = theano.scan(
+            fn=step,
+            sequences=[self.X],
+            outputs_info=outputs_info
+            )
+        self.Y = states
+    def output(self):
+        return self.Y
+class LSTM(Layer):
+    def __init__(self, input_dim, output_dim, input_layer, s0=None, c0=None,
+                 name=""):
+        '''Layers information'''
+        self.name = name
+        self.input_dim = input_dim
+        self.hidden_dim = output_dim
+        self.output_dim = output_dim
+        self.input_layer = input_layer
+        self.X = input_layer.output()
+        self.s0 = s0
+        self.c0 = c0
+        self.params = []
+        '''Layers weights'''
+        '''self.params is passed so that any paramters could be appended to it'''
+        self.W_i = linear_transform_weights(input_dim, output_dim, param_list=self.params, name=name + ".W_i")
+        self.b_wi = bias_weights((output_dim,), param_list=self.params, name=name + ".b_wi")
+        self.W_f = linear_transform_weights(input_dim, output_dim, param_list=self.params, name=name + ".W_f")
+        self.b_wf = bias_weights((output_dim,), param_list=self.params, name=name + ".b_wf")
+        self.W_c = linear_transform_weights(input_dim, output_dim, param_list=self.params, name=name + ".W_c")
+        self.b_wc = bias_weights((output_dim,), param_list=self.params, name=name + ".b_wc")
+        self.W_o = linear_transform_weights(input_dim, output_dim, param_list=self.params, name=name + ".W_o")
+        self.b_wo = bias_weights((output_dim,), param_list=self.params, name=name + ".b_wo")
+        self.R_i = linear_transform_weights(output_dim, output_dim, param_list=self.params, name=name + ".R_i")
+        self.b_ri = bias_weights((output_dim,), param_list=self.params, name=name + ".b_ri")
+        self.R_f = linear_transform_weights(output_dim, output_dim, param_list=self.params, name=name + ".R_f")
+        self.b_rf = bias_weights((output_dim,), param_list=self.params, name=name + ".b_rf")
+        self.R_c = linear_transform_weights(output_dim, output_dim, param_list=self.params, name=name + ".R_c")
+        self.b_rc = bias_weights((output_dim,), param_list=self.params, name=name + ".b_rc")
+        self.R_o = linear_transform_weights(output_dim, output_dim, param_list=self.params, name=name + ".R_o")
+        self.b_ro = bias_weights((output_dim,), param_list=self.params, name=name + ".b_ro")
+        '''step through processed input to create output'''
+        def step(x_t, h_tm1, c_tm1):
+            i_t = T.nnet.sigmoid(
+                T.dot(x_t, self.W_i) + T.dot(h_tm1, self.R_i) + self.b_wi + self.b_ri)
+            f_t = T.nnet.sigmoid(
+                T.dot(x_t, self.W_f) + T.dot(h_tm1, self.R_f) + self.b_wf + self.b_rf)
+            o_t = T.nnet.sigmoid(
+                T.dot(x_t, self.W_o) + T.dot(h_tm1, self.R_o) + self.b_ro + self.b_wo)
+            c_hat_t = T.tanh(
+                T.dot(x_t, self.W_c) + T.dot(h_tm1, self.R_c) + self.b_wc + self.b_rc)
+            c_t = f_t * c_tm1 + i_t * c_hat_t
+            h_t = o_t * T.tanh(c_t)
+            return h_t, c_t
+        outputs_info = [self.s0, self.c0]
+        states, updates = theano.scan(
+            fn=step,
+            sequences=[self.X],
+            outputs_info=outputs_info
+            )
+        self.Y = states[0]
+        self.C = states[1]
+    def output(self):
+        return self.Y
+class FC(Layer):
+    def __init__(self, input_dim, output_dim, input_layer, name=""):
+        self.input_layer = input_layer
+        self.name = name
+        self.params = []
+        self.input_dim = input_dim
+        self.output_dim = output_dim
+        self.X = self.input_layer.output()
+        self.W = linear_transform_weights(input_dim, output_dim, param_list=self.params, name=name + ".W")
+        self.b = bias_weights((output_dim,), param_list=self.params, name=name + ".b")
+    def output(self):
+        return T.dot(self.X, self.W) + self.b
+class WrapperLayer(Layer):
+    def __init__(self, X, name=""):
+        self.params = []
+        self.name = name
+        self.X = X
+    def output(self):
+        return self.X
--- a/theano/gpuarray/tests/test_dnn.py
+++ b/theano/gpuarray/tests/test_dnn.py
--- a/theano/gpuarray/type.py
+++ b/theano/gpuarray/type.py
@@ -68,6 +68,7 @@ def reg_context(name, ctx):
    if not isinstance(ctx, gpuarray.GpuContext):
        raise TypeError("context is not GpuContext")
    _context_reg[name] = ctx
+    _props_map[ctx] = dict()
 def get_context(name):
@@ -96,6 +97,26 @@ def list_contexts():
    """
    return _context_reg.keys()
+# Per-context property store: maps a context object to a dict of
+# arbitrary properties.  reg_context() creates an empty dict for each
+# context it registers.
+# This is basically a way to store "global" variables that depend on
+# the context.  Please never use this if you can avoid it.
+_props_map = {}
+def _get_props(name):
+    ctx = get_context(name)
+    return _props_map[ctx]
+def get_prop(name, k):
+    return _get_props(name)[k]
+def set_prop(name, k, v):
+    _get_props(name)[k] = v
 # Private method
 def _name_for_ctx(ctx):

--- a/theano/gradient.py
+++ b/theano/gradient.py
@@ -1102,7 +1102,8 @@ def _populate_grad_dict(var_to_app_to_idx,
                                str(o_shape) + " on an output of shape " +
                                str(g_shape))
-                input_grads = node.op.grad(inputs, new_output_grads)
+                input_grads = node.op.L_op(inputs, node.outputs,
+                                           new_output_grads)
                if input_grads is None:
                    raise TypeError("%s.grad returned NoneType, "