Commit 3007bf79, authored by Pascal Lamblin, committed by GitHub

Merge pull request #4915 from abergeron/dnn_rnn2

Cudnn RNN bindings.
......@@ -14,11 +14,9 @@ import theano.tensor as T
def test_NanGuardMode():
"""
Tests if NanGuardMode is working by feeding in numpy.inf and numpy.nans
intentionally. A working implementation should be able to capture all
the abnormalties.
"""
# Tests if NanGuardMode is working by feeding in numpy.inf and numpy.nans
# intentionally. A working implementation should be able to capture all
# the abnormalties.
x = T.matrix()
w = theano.shared(numpy.random.randn(5, 7).astype(theano.config.floatX))
y = T.dot(x, w)
......
......@@ -10,7 +10,7 @@ import sys
import warnings
from functools import wraps
from six import StringIO, PY3
from six import StringIO, PY3, iteritems
import theano
from theano.compat import configparser as ConfigParser
......@@ -91,37 +91,44 @@ theano_raw_cfg = ConfigParser.RawConfigParser()
theano_raw_cfg.read(config_files)
class change_flags(object):
    """
    Use this as a decorator or context manager to change the value of
    Theano config variables.

    Useful during tests.
    """

    def __init__(self, **kwargs):
        # Resolve each flag name to its config descriptor up front so a
        # typo in a flag name fails at construction time, not later.
        confs = dict()
        for k in kwargs:
            l = [v for v in theano.configparser._config_var_list
                 if v.fullname == k]
            assert len(l) == 1
            confs[k] = l[0]
        self.confs = confs
        self.new_vals = kwargs

    def __call__(self, f):
        @wraps(f)
        def res(*args, **kwargs):
            with self:
                return f(*args, **kwargs)
        return res

    def __enter__(self):
        self.old_vals = {}
        for k, v in iteritems(self.confs):
            self.old_vals[k] = v.__get__(True, None)
        try:
            for k, v in iteritems(self.confs):
                v.__set__(None, self.new_vals[k])
        except:
            # Roll back the flags already changed, then re-raise whatever
            # happened (bare except on purpose: restore on any exception).
            self.__exit__()
            raise

    def __exit__(self, *args):
        for k, v in iteritems(self.confs):
            v.__set__(None, self.old_vals[k])
def fetch_val_for_key(key, delete_key=False):
......
......@@ -696,6 +696,9 @@ class PureOp(object):
# Python implementation #
#########################
def L_op(self, inputs, outputs, output_grads):
    # Default L_op: delegate to self.grad(), ignoring the `outputs`
    # argument (which lets overriders reuse forward results).
    return self.grad(inputs, output_grads)
def R_op(self, inputs, eval_points):
"""
This method is primarily used by tensor.Rop
......
......@@ -14,6 +14,7 @@ import theano
from theano.gof import utils
from theano.gof.utils import MethodNotDefined, object2
from theano.gof import graph
from theano.configparser import change_flags
########
# Type #
......@@ -638,6 +639,8 @@ class CDataType(Type):
have a `void` return and take a single pointer argument.
"""
__props__ = ('ctype', 'freefunc', 'headers', 'header_dirs',
'libraries', 'lib_dirs', 'extra_support_code')
def __init__(self, ctype, freefunc=None, headers=None, header_dirs=None,
libraries=None, lib_dirs=None, extra_support_code=""):
......@@ -647,42 +650,51 @@ class CDataType(Type):
assert isinstance(freefunc, string_types)
self.freefunc = freefunc
if headers is None:
headers = []
self.headers = headers
headers = ()
self.headers = tuple(headers)
if header_dirs is None:
header_dirs = []
self.header_dirs = header_dirs
header_dirs = ()
self.header_dirs = tuple(header_dirs)
if libraries is None:
libraries = []
self.libraries = libraries
libraries = ()
self.libraries = tuple(libraries)
if lib_dirs is None:
lib_dirs = []
self.lib_dirs = lib_dirs
lib_dirs = ()
self.lib_dirs = tuple(lib_dirs)
self.extra_support_code = extra_support_code
self._fn = None
def __eq__(self, other):
return (type(self) == type(other) and
self.ctype == other.ctype and
self.freefunc == other.freefunc)
def __hash__(self):
return hash((type(self), self.ctype, self.freefunc))
def filter(self, data, strict=False, allow_downcast=None):
if data is not None and not isinstance(data, _cdata_type):
raise TypeError("expected None or a PyCapsule")
return data
def _get_func(self):
"""
Return a function that makes a value from an integer.
The integer value is assumed to be a valid pointer for the
type and no check is done to ensure that.
"""
from theano.scalar import get_scalar_type
if self._fn is None:
v = get_scalar_type('int64')()
self._fn = theano.function([v], _make_cdata(self)(v), profile=False)
with change_flags(compute_test_value='off'):
v = get_scalar_type('int64')()
self._fn = theano.function([v], _make_cdata(self)(v),
profile=False)
return self._fn
def make_value(self, ptr):
"""
Make a value of this type.
Parameters
----------
ptr : int
Integer representation of a valid pointer value
"""
return self._get_func()(ptr)
def c_declare(self, name, sub, check_input=True):
......
......@@ -26,7 +26,7 @@ except ImportError:
# This is for documentation not to depend on the availability of pygpu
from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
GpuArraySharedVariable, gpuarray_shared_constructor,
reg_context, get_context, ContextNotDefined)
reg_context, get_context, ContextNotDefined, _get_props)
from .basic_ops import as_gpuarray_variable
from . import fft, dnn, opt, nerv, extra_ops, multinomial
......@@ -89,19 +89,24 @@ def init_dev(dev, name=None):
(name, dev, context.devname),
file=sys.stderr)
pygpu_activated = True
ctx_props = _get_props(name)
ctx_props['dev'] = dev
if dev.startswith('cuda'):
try:
cudnn_version = dnn.version()
# 5200 should not print warning with cudnn 5.1 final.
if cudnn_version >= 5200:
warnings.warn("Your cuDNN version is more recent than Theano."
" If you see problems, try updating Theano or"
" downgrading cuDNN to version 5.1.")
if config.print_active_device:
print("Using cuDNN version %d on context %s" %
(cudnn_version, name), file=sys.stderr)
except Exception:
pass
if 'cudnn_version' not in ctx_props:
try:
ctx_props['cudnn_version'] = dnn.version()
# 5200 should not print warning with cudnn 5.1 final.
if ctx_props['cudnn_version'] >= 5200:
warnings.warn("Your cuDNN version is more recent than "
"Theano. If you encounter problems, try "
"updating Theano or downgrading cuDNN to "
"version 5.1.")
if config.print_active_device:
print("Using cuDNN version %d on context %s" %
(ctx_props['cudnn_version'], name), file=sys.stderr)
ctx_props['cudnn_handle'] = dnn._make_handle(context)
except Exception:
pass
# This maps things like 'cuda0' to the context object on that device.
init_dev.devmap = {}
......
from __future__ import absolute_import, print_function, division
import ctypes
import os
import sys
import warnings
import numpy
......@@ -7,7 +9,7 @@ from six import integer_types
import theano
from theano import Op, Apply, tensor, config, Variable
from theano.scalar import as_scalar, constant, Log
from theano.scalar import as_scalar, constant, Log, get_scalar_type
from theano.tensor import as_tensor_variable
from theano.gradient import DisconnectedType, grad_not_implemented
from theano.gof import Optimizer, local_optimizer, COp
......@@ -26,7 +28,8 @@ from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
from theano.tensor.signal.pool import (
Pool, MaxPoolGrad, AveragePoolGrad)
from . import pygpu
from .type import get_context, gpu_context_type, list_contexts
from .type import (get_context, gpu_context_type, list_contexts,
get_prop, set_prop, GpuArraySharedVariable)
from .basic_ops import (as_gpuarray_variable, infer_context_name,
gpu_contiguous, gpu_alloc_empty,
empty_like, GpuArrayType)
......@@ -42,6 +45,45 @@ from .opt_util import alpha_merge, output_merge, inplace_allocempty, pad_dims, u
from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_BWD_FILTER
try:
from pygpu import gpuarray
except ImportError:
pass
def _dnn_lib():
    """Load the cuDNN shared library through ctypes and cache the handle.

    Raises RuntimeError if no cudnn library can be found. The ctypes
    signatures for cudnnCreate/cudnnDestroy are declared on first load.
    """
    if _dnn_lib.handle is None:
        import ctypes.util

        lib_name = ctypes.util.find_library('cudnn')
        if lib_name is None and sys.platform == 'win32':
            # Update these names when new versions of cudnn are supported.
            for name in ['cudnn64_5.dll', 'cudnn64_4.dll']:
                lib_name = ctypes.util.find_library(name)
                if lib_name:
                    break
        if lib_name is None:
            raise RuntimeError('Could not find cudnn library (looked for v4 and v5[.1])')
        _dnn_lib.handle = ctypes.cdll.LoadLibrary(lib_name)
        cudnn = _dnn_lib.handle
        cudnn.cudnnCreate.argtypes = [ctypes.POINTER(ctypes.c_void_p)]
        cudnn.cudnnCreate.restype = ctypes.c_int
        cudnn.cudnnDestroy.argtypes = [ctypes.c_void_p]
        cudnn.cudnnDestroy.restype = ctypes.c_int
    return _dnn_lib.handle

# Cache slot for the loaded library (function attribute, set on first call).
_dnn_lib.handle = None
def _make_handle(ctx):
    """Create a cuDNN handle bound to gpu context `ctx` via ctypes.

    Returns the raw handle as a ctypes.c_void_p; raises RuntimeError if
    cudnnCreate reports an error.
    """
    cudnn = _dnn_lib()
    handle = ctypes.c_void_p()
    with ctx:
        err = cudnn.cudnnCreate(ctypes.byref(handle))
        if err != 0:
            raise RuntimeError("error creating cudnn handle")
    return handle
def raise_no_cudnn(msg="cuDNN is required for convolution and pooling"):
    """Signal that a cuDNN-only feature was requested without cuDNN.

    Always raises RuntimeError carrying `msg`; never returns.
    """
    raise RuntimeError(msg)
......@@ -144,6 +186,12 @@ def dnn_available(context_name):
dnn_available.msg = None
# Params type shared by the cuDNN ops: an opaque cudnnHandle_t that is
# released with cudnnDestroy, compiled against the configured cuDNN paths.
handle_type = CDataType('cudnnHandle_t', 'cudnnDestroy',
                        headers=['cudnn.h'],
                        header_dirs=[config.dnn.include_path],
                        libraries=['cudnn'],
                        lib_dirs=[config.dnn.library_path])
class DnnBase(COp):
......@@ -154,10 +202,20 @@ class DnnBase(COp):
# dnn does not know about broadcasting, so we do not need to assert
# the input broadcasting pattern.
check_broadcast = False
params_type = gpu_context_type
params_type = handle_type
def dnn_context(self, node):
return node.outputs[0].type.context_name
def get_params(self, node):
return node.outputs[0].type.context
try:
return get_prop(self.dnn_context(node), 'cudnn_handle_param')
except KeyError:
pass
ptr = get_prop(self.dnn_context(node), 'cudnn_handle').value
res = handle_type.make_value(ptr)
set_prop(self.dnn_context(node), 'cudnn_handle_param', res)
return res
def __init__(self, files=None, c_func=None):
if files is None:
......@@ -165,9 +223,10 @@ class DnnBase(COp):
COp.__init__(self, ["dnn_base.c"] + files, c_func)
def c_headers(self):
return ['cudnn.h', 'cudnn_helper.h', 'gpuarray_helper.h',
'gpuarray/types.h', 'gpuarray/array.h', 'gpuarray/util.h',
'gpuarray/ext_cuda.h', 'gpuarray_api.h', 'numpy_compat.h']
return ['gpuarray/types.h', 'gpuarray/array.h', 'gpuarray/kernel.h',
'gpuarray/util.h', 'gpuarray/ext_cuda.h', 'gpuarray_api.h',
'numpy_compat.h', 'cudnn.h', 'cudnn_helper.h',
'gpuarray_helper.h']
def c_header_dirs(self):
return [os.path.dirname(__file__), pygpu.get_include(),
......@@ -183,7 +242,7 @@ class DnnBase(COp):
return ['-Wl,-rpath,' + config.dnn.library_path]
def c_code_cache_version(self):
return (super(DnnBase, self).c_code_cache_version(), version())
return (super(DnnBase, self).c_code_cache_version(), version(), 1)
class DnnVersion(Op):
......@@ -1734,6 +1793,599 @@ class GpuDnnBatchNormGrad(DnnBase):
def infer_shape(self, node, shape):
return [shape[0], shape[2], shape[2]]
# Opaque C types used by the dropout/RNN ops below: a raw gpudata buffer
# (cuDNN "reserve" space) and a cuDNN dropout descriptor.
gpudata_type = CDataType('gpudata *', 'gpudata_release')
dropoutdesc_type = CDataType('cudnnDropoutDescriptor_t',
                             'cudnnDestroyDropoutDescriptor')
class GpuDnnDropoutOp(DnnBase):
    """Apply cuDNN dropout to its input.

    Inputs are (input, dropout descriptor, RNG states); outputs are
    (dropped-out output, states, reserve space).
    """

    __props__ = ('inplace',)

    def __init__(self, inplace=False):
        DnnBase.__init__(self, ["dnn_dropout_fwd.c"], "dnn_dropout_fwd")
        self.inplace = inplace
        if self.inplace:
            # Output 1 (states) is computed in-place on input 2 (states).
            self.destroy_map = {1: [2]}

    def make_node(self, inp, descriptor, state):
        ctx_name = infer_context_name(inp)
        inp = as_gpuarray_variable(inp, ctx_name)
        return Apply(self, [inp, descriptor, state],
                     [inp.type(), state.type(), gpudata_type()])

    def prepare_node(self, node, storage_map, compute_map):
        # NOTE(review): this requires the op to be inplace before it can
        # be compiled; non-inplace use appears unsupported — confirm.
        assert self.inplace, "GpuDnnDropoutOp not inplace"
class _DropoutDescriptor(DnnBase):
    """Internal op that builds a cuDNN dropout descriptor and its
    device-side RNG states buffer."""

    __props__ = ('context_name',)

    def __init__(self, context_name):
        DnnBase.__init__(self, ["dnn_dropout_desc.c"], "dnn_dropout_desc")
        self.context_name = context_name

    def dnn_context(self, node):
        return self.context_name

    def do_constant_folding(self, node):
        # The descriptor must be created at runtime on the device.
        return False

    def make_node(self, dropout, seed, context_name):
        dropout = as_scalar(dropout).astype('float32')
        seed = as_scalar(seed).astype('uint64')
        assert context_name == self.context_name
        # This is a dirty hack to pass the context because params is
        # occupied by the cudnn handle
        context = gpu_context_type.make_constant(get_context(context_name))
        return Apply(self, [dropout, seed, context],
                     [dropoutdesc_type(),
                      GpuArrayType('uint8', (False,),
                                   context_name=context_name)()])

    def c_code_cache_version_apply(self, node):
        # disable the cache since we can't pickle contexts
        return None
def _make_dropout_desc(dropout, seed, context_name):
    """Build and return (dropout descriptor, RNG states) by compiling and
    running a tiny unoptimized graph around _DropoutDescriptor."""
    op = _DropoutDescriptor(context_name)
    graph_outputs = op(dropout, seed, context_name)
    fn = theano.function([], graph_outputs,
                         theano.Mode(optimizer=None),
                         profile=False)
    desc, states = fn()
    return desc, states
def dropout(x, dropout=0.0, seed=4242):
    """Apply cuDNN dropout to `x`.

    Returns (y, desc, reserve, states): the dropped-out output, the
    dropout descriptor, the cuDNN reserve space and the RNG states.
    """
    desc, states = _make_dropout_desc(dropout, seed, x.type.context_name)
    # GpuDnnDropoutOp.make_node takes (input, descriptor, states) and
    # returns (output, states, reserve); the previous call passed only two
    # inputs and unpacked two outputs, which cannot match make_node.
    y, ostates, reserve = GpuDnnDropoutOp()(x, desc, states)
    # NOTE(review): GpuDnnDropoutOp.prepare_node asserts inplace=True,
    # while the default constructed op is inplace=False — confirm how the
    # inplace variant is meant to be introduced.
    return y, desc, reserve, states
# Opaque cuDNN RNN descriptor, destroyed with cudnnDestroyRNNDescriptor.
rnndesc_type = CDataType('cudnnRNNDescriptor_t',
                         'cudnnDestroyRNNDescriptor')
def as_i32(v):
    # Wrap `v` as a scalar variable cast to int32 (the type used for the
    # cuDNN enum/count arguments below).
    return as_scalar(v).astype('int32')
class _RNNDescriptor(DnnBase):
    """Internal op that builds a cuDNN RNN descriptor."""

    __props__ = ('context_name',)

    def __init__(self, context_name):
        if version() < 5005:
            raise RuntimeError("cudnn RNN require cudnn v5 final or higher.")
        DnnBase.__init__(self, ["dnn_rnn_desc.c"], "dnn_rnn_desc")
        self.context_name = context_name

    def dnn_context(self, node):
        return self.context_name

    def do_constant_folding(self, node):
        # The descriptor must be created at runtime on the device.
        return False

    def make_node(self, hidden_size, num_layers, ddesc, input_mode,
                  direction_mode, rnn_mode, dtype):
        hidden_size = as_i32(hidden_size)
        num_layers = as_i32(num_layers)
        # The integer constants below hard-code the cudnn 5/5.1 values of
        # cudnnRNNInputMode_t, cudnnDirectionMode_t and cudnnRNNMode_t.
        assert 5000 < version() < 5200, "Constants only work for cudnn 5, 5.1"
        if input_mode == 'linear':
            input_mode = as_i32(0)
        elif input_mode == 'skip':
            input_mode = as_i32(1)
        else:
            raise ValueError("input_mode")
        if direction_mode == 'unidirectional':
            direction_mode = as_i32(0)
        elif direction_mode == 'bidirectional':
            direction_mode = as_i32(1)
        else:
            raise ValueError("direction_mode")
        if rnn_mode == 'rnn_relu':
            rnn_mode = as_i32(0)
        elif rnn_mode == 'rnn_tanh':
            rnn_mode = as_i32(1)
        elif rnn_mode == 'lstm':
            rnn_mode = as_i32(2)
        elif rnn_mode == 'gru':
            rnn_mode = as_i32(3)
        else:
            raise ValueError("rnn_mode")
        dtype = as_i32(gpuarray.dtype_to_typecode(dtype))
        return Apply(self, [hidden_size, num_layers,
                            dropoutdesc_type.make_constant(ddesc),
                            input_mode, direction_mode, rnn_mode, dtype],
                     [rnndesc_type()])
def _make_rnn_desc(hidden_size, num_layers, ddesc, rnn_mode,
                   input_mode, direction_mode, dtype, context_name):
    """Build and return a cuDNN RNN descriptor by compiling and running a
    tiny unoptimized graph around _RNNDescriptor."""
    graph_output = _RNNDescriptor(context_name)(hidden_size, num_layers,
                                                ddesc, input_mode,
                                                direction_mode, rnn_mode,
                                                dtype)
    fn = theano.function([], graph_output,
                         theano.Mode(optimizer=None),
                         profile=False)
    return fn()
class _RNNParamSize(DnnBase):
    """Internal op that queries the byte size of the parameter buffer of
    a cuDNN RNN described by `desc`."""

    __props__ = ('context_name',)

    def __init__(self, context_name):
        DnnBase.__init__(self, ["dnn_rnn_paramsize.c"],
                         "dnn_rnn_paramsize")
        self.context_name = context_name

    def dnn_context(self, node):
        return self.context_name

    def do_constant_folding(self, node):
        # The size depends on the runtime descriptor; never fold.
        return False

    def make_node(self, desc, input_size, typecode):
        input_size = as_tensor_variable(input_size).astype('uint64')
        typecode = as_i32(typecode)
        return Apply(self, [rnndesc_type.make_constant(desc), input_size,
                            typecode],
                     [get_scalar_type('uint64')()])
def _get_param_size(desc, input_size, dtype, context_name):
    """Return the size in bytes of the cuDNN RNN parameter buffer for the
    given descriptor, input size and dtype."""
    tc = gpuarray.dtype_to_typecode(dtype)
    graph_output = _RNNParamSize(context_name)(desc, input_size, tc)
    fn = theano.function([], graph_output,
                         theano.Mode(optimizer=None),
                         profile=False)
    return fn()
class _RNNSplitParams(DnnBase):
    """Internal op that splits the flat cuDNN RNN weight vector into
    per-gate (matrix, bias) GPU views.

    For one layer it returns alternating 2d-matrix / 1d-bias views into
    `w`, in cuDNN's storage order: 4 pairs for plain RNN (input +
    recurrent), 16 for LSTM, 12 for GRU (see make_node).
    """

    __props__ = ('rnn_mode',)

    def __init__(self, rnn_mode):
        DnnBase.__init__(self)
        self.rnn_mode = rnn_mode

    def make_node(self, w, desc, layer, isize, typecode):
        w = as_gpuarray_variable(w, infer_context_name(w))
        assert w.ndim == 1
        layer = as_scalar(layer).astype('int32')
        isize = as_tensor_variable(isize).astype('uint64')
        assert isize.ndim == 1
        typecode = as_scalar(typecode).astype('int32')
        _1d = GpuArrayType(w.type.dtype, [False],
                           context_name=w.type.context_name)
        _2d = GpuArrayType(w.type.dtype, [False, False],
                           context_name=w.type.context_name)
        outputs = []
        if self.rnn_mode == 'rnn_relu' or self.rnn_mode == 'rnn_tanh':
            outputs.extend([_2d(), _1d()])  # input
            outputs.extend([_2d(), _1d()])  # recurrent
        elif self.rnn_mode == 'lstm':
            outputs.extend([_2d(), _1d()])  # input input
            outputs.extend([_2d(), _1d()])  # input forget
            outputs.extend([_2d(), _1d()])  # input newmem
            outputs.extend([_2d(), _1d()])  # input output
            outputs.extend([_2d(), _1d()])  # recur input
            outputs.extend([_2d(), _1d()])  # recur forget
            outputs.extend([_2d(), _1d()])  # recur newmem
            outputs.extend([_2d(), _1d()])  # recur output
        elif self.rnn_mode == 'gru':
            outputs.extend([_2d(), _1d()])  # input reset
            outputs.extend([_2d(), _1d()])  # input update
            outputs.extend([_2d(), _1d()])  # input newmem
            outputs.extend([_2d(), _1d()])  # recur reset
            outputs.extend([_2d(), _1d()])  # recur update
            outputs.extend([_2d(), _1d()])  # recur newmem

        return Apply(self, [w, layer, rnndesc_type.make_constant(desc),
                            isize, typecode], outputs)

    def c_code(self, node, name, inputs, outputs, sub):
        # Common substitutions for the C templates; `handle` is the cuDNN
        # handle provided through the op params.
        kw = dict(fail=sub['fail'], w=inputs[0], layer=inputs[1],
                  desc=inputs[2], isize=inputs[3], typecode=inputs[4],
                  handle=sub['params'])
        code = """
  cudnnTensorDescriptor_t xdesc;
  cudnnFilterDescriptor_t wdesc;
  cudnnFilterDescriptor_t odesc;
  size_t nshp[2];
  void *w;
  void *o;
  ptrdiff_t off;
  size_t bshp;
  cudnnStatus_t err;
  cudnnDataType_t dt;
  cudnnTensorFormat_t tf;
  int nd;
  int dims[3];
  int strs[3];

  if (PyArray_DIM(%(isize)s, 0) != 2) {
    PyErr_SetString(PyExc_ValueError, "input_size should be of length two");
    %(fail)s;
  }

  switch (%(typecode)s) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_ValueError, "Unsupported data type");
    %(fail)s;
  }

  err = cudnnCreateTensorDescriptor(&xdesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_SetString(PyExc_RuntimeError, "Could not create xdesc");
    %(fail)s;
  }

  dims[0] = *(npy_uint64 *)PyArray_GETPTR1(%(isize)s, 0);
  dims[1] = *(npy_uint64 *)PyArray_GETPTR1(%(isize)s, 1);
  dims[2] = 1;

  strs[0] = dims[2] * dims[1];
  strs[1] = dims[2];
  strs[2] = 1;

  err = cudnnSetTensorNdDescriptor(xdesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    cudnnDestroyTensorDescriptor(xdesc);
    PyErr_Format(PyExc_RuntimeError, "Could not set xdesc: %%s",
                 cudnnGetErrorString(err));
    %(fail)s;
  }

  if (c_make_filter(%(w)s, &wdesc)) {
    cudnnDestroyTensorDescriptor(xdesc);
    %(fail)s
  }

  err = cudnnCreateFilterDescriptor(&odesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_SetString(PyExc_RuntimeError, "could not create odesc");
    cudnnDestroyTensorDescriptor(xdesc);
    cudnnDestroyFilterDescriptor(wdesc);
    %(fail)s
  }

  w = PyGpuArray_DEV_DATA(%(w)s);
  nshp[0] = PyGpuArray_DIM(%(w)s, 0);
  nshp[1] = 1;
        """ % kw

        def get_params(id, m, b):
            # Emit the C snippet extracting the bias view `b` and matrix
            # view `m` for cuDNN linear-layer index `id`.
            kw2 = kw.copy()
            kw2['id'] = id
            kw2['m'] = m
            kw2['b'] = b
            return """
  err = cudnnGetRNNLinLayerBiasParams(%(handle)s, %(desc)s, %(layer)s, xdesc, wdesc, w, %(id)s, odesc, &o);
  if (err != CUDNN_STATUS_SUCCESS) {
    cudnnDestroyTensorDescriptor(xdesc);
    cudnnDestroyFilterDescriptor(wdesc);
    cudnnDestroyFilterDescriptor(odesc);
    PyErr_SetString(PyExc_RuntimeError, "can't fetch bias for id %(id)s");
    %(fail)s
  }
  off = (intptr_t)o - (intptr_t)w;
  assert(off >= 0 && "bias");

  err = cudnnGetFilterNdDescriptor(odesc, 3, &dt, &tf, &nd, dims);
  if (err != CUDNN_STATUS_SUCCESS) {
    cudnnDestroyTensorDescriptor(xdesc);
    cudnnDestroyFilterDescriptor(wdesc);
    cudnnDestroyFilterDescriptor(odesc);
    PyErr_SetString(PyExc_RuntimeError, "could not get bias shape for id %(id)s");
    %(fail)s;
  }
  // We assume that the typecode matches
  assert(dims[2] == 1);
  assert(dims[1] == 1);
  %(b)s = pygpu_view(%(w)s, Py_None);
  %(b)s->ga.offset = off;
  %(b)s->ga.dimensions[0] = dims[0];
  bshp = dims[0];

  err = cudnnGetRNNLinLayerMatrixParams(%(handle)s, %(desc)s, %(layer)s, xdesc, wdesc, w, %(id)s, odesc, &o);
  if (err != CUDNN_STATUS_SUCCESS) {
    cudnnDestroyTensorDescriptor(xdesc);
    cudnnDestroyFilterDescriptor(wdesc);
    cudnnDestroyFilterDescriptor(odesc);
    PyErr_SetString(PyExc_RuntimeError, "can't fetch matrix for id %(id)s");
    %(fail)s
  }
  off = (intptr_t)o - (intptr_t)w;
  assert(off >= 0 && "matrix");

  // This is 3d because of cudnn limitations.
  err = cudnnGetFilterNdDescriptor(odesc, 3, &dt, &tf, &nd, dims);
  if (err != CUDNN_STATUS_SUCCESS) {
    cudnnDestroyTensorDescriptor(xdesc);
    cudnnDestroyFilterDescriptor(wdesc);
    cudnnDestroyFilterDescriptor(odesc);
    PyErr_SetString(PyExc_RuntimeError, "could not get matrix shape for id %(id)s");
    %(fail)s;
  }
  assert(dims[1] == 1);
  assert(dims[2] == 1);
  // We assume that the typecode matches
  %(m)s = pygpu_reshape(%(w)s, 2, nshp, GA_F_ORDER, 1, -1);
  %(m)s->ga.offset = off;
  assert(dims[0] %% bshp == 0);
  %(m)s->ga.dimensions[0] = dims[0] / bshp;
  %(m)s->ga.dimensions[1] = bshp;
  %(m)s->ga.strides[1] = %(m)s->ga.dimensions[0] * gpuarray_get_elsize(%(m)s->ga.typecode);
            """ % kw2

        # One (matrix, bias) extraction snippet per output pair.
        for i in range(len(outputs) // 2):
            code += get_params(i, outputs[2 * i], outputs[(2 * i) + 1])

        code += """
  cudnnDestroyTensorDescriptor(xdesc);
  cudnnDestroyFilterDescriptor(wdesc);
  cudnnDestroyFilterDescriptor(odesc);
        """
        return code

    def c_code_cache_version(self):
        return (2,)
def _split_rnn_params(w, desc, layer, input_size, dtype, rnn_mode):
    """Return GPU views of `w` split into the per-gate matrices and
    biases of one RNN layer, as computed by _RNNSplitParams."""
    tc = gpuarray.dtype_to_typecode(dtype)
    split = _RNNSplitParams(rnn_mode)(w, desc, layer, input_size, tc)
    wrapped = [theano.Out(o, borrow=True) for o in split]
    fn = theano.function([], wrapped,
                         theano.Mode(optimizer=None),
                         profile=False)
    return fn()
class GpuDnnRNNOp(DnnBase):
    """Forward pass of a cuDNN RNN.

    Inputs: (rnn descriptor, flat weights w, input x, initial hidden hx
    [, initial cell cx for LSTM]).  Outputs: (reserve space, y, hy
    [, cy for LSTM]).
    """

    __props__ = ()
    _cop_num_inputs = 5
    _cop_num_outputs = 4

    def __init__(self, rnn_mode, direction_mode):
        DnnBase.__init__(self, ["dnn_rnn_fwd.c"], 'dnn_rnn_fwd')
        self.rnn_mode = rnn_mode
        if direction_mode == 'bidirectional':
            self.num_dirs = 2
        elif direction_mode == 'unidirectional':
            self.num_dirs = 1
        else:
            raise ValueError('direction_mode is invalid (got %s)' % (direction_mode,))

    def dnn_context(self, node):
        # Output 0 is the reserve (a CData), so take the context from y.
        return node.outputs[1].type.context_name

    def make_node(self, desc, w, x, hx, cx=None):
        if cx is None:
            context_name = infer_context_name(w, x, hx)
        else:
            context_name = infer_context_name(w, x, hx, cx)

        w = as_gpuarray_variable(w, context_name)
        x = as_gpuarray_variable(x, context_name)
        hx = as_gpuarray_variable(hx, context_name)
        inputs = [desc, w, x, hx]
        assert w.ndim == 1
        assert x.ndim == 3  # seqLength, minibatch, inputSize
        assert hx.ndim == 3  # numLayers, minibatch, hiddenSize * bidi

        if self.rnn_mode == 'lstm':
            cx = as_gpuarray_variable(cx, context_name)
            assert cx.ndim == 3  # numLayers, minibatch, hiddenSize * bidi
            inputs.append(cx)

        _3d = GpuArrayType(dtype=x.dtype, broadcastable=(False, False, False),
                           context_name=context_name)
        reserve = gpudata_type()
        y = _3d()  # seqLength, minibatch, hiddenSize * bidi
        hy = _3d()  # numLayers, miniBatch, hiddenSize * bidi
        outputs = [reserve, y, hy]

        if self.rnn_mode == 'lstm':
            cy = _3d()  # numLayers, miniBatch, hiddenSize * bidi
            outputs.append(cy)

        return Apply(self, inputs, outputs)

    def L_op(self, inputs, outputs, output_grads):
        # Gradient wrt (w, x, hx[, cx]); the descriptor gets a
        # DisconnectedType gradient.
        desc, w, x, hx = inputs[:4]
        cx = inputs[4] if len(inputs) == 5 else None
        reserve, y, hy = outputs[:3]
        _, dy, dhy = output_grads[:3]
        dcy = output_grads[3] if len(output_grads) == 4 else None
        # Since the op return two outputs which contain essentially
        # the same information, the user will most likely only use one
        # of them. This leads to the situation that the other is
        # considered "disconnected" by theano in the gradient.
        # However we know that this isn't really the case so we fix it
        # here.

        # If all the ys are disconnected, then you get a boring
        # gradient instead of an error.  But in that case you
        # shouldn't call this method anyway.
        if isinstance(dy.type, DisconnectedType):
            dy = as_gpuarray_variable(y.zeros_like(),
                                      context_name=y.type.context_name)
        if isinstance(dhy.type, DisconnectedType):
            dhy = None
        if dcy and isinstance(dcy.type, DisconnectedType):
            dcy = None
        dinputs = GpuDnnRNNGradInputs(rnn_mode=self.rnn_mode,
                                      grad_h=(dhy is not None),
                                      grad_c=(dcy is not None))(
            desc, x, y, dy, dhy, dcy, w, hx, cx, reserve, return_list=True)
        reserve2, dx, dhx = dinputs[:3]
        dw = GpuDnnRNNGradWeights()(
            desc, x, hx, y, reserve2, w)
        res = [DisconnectedType()(), dw, dx, dhx]
        if cx is not None:
            res.append(dinputs[3])  # dcx
        return res

    def connection_pattern(self, node):
        # The descriptor (input 0) is disconnected from every output;
        # all other inputs are connected to all outputs.
        deconn = [[False] * len(node.outputs)]
        conn = [[True] * len(node.outputs)] * (len(node.inputs) - 1)
        return deconn + conn
class GpuDnnRNNGradInputs(DnnBase):
    """Backward pass of a cuDNN RNN with respect to its inputs
    (x, hx and, for LSTM, cx)."""

    __props__ = ('rnn_mode', 'grad_c', 'grad_h')
    _cop_num_inputs = 10
    _cop_num_outputs = 4

    def __init__(self, rnn_mode, grad_h, grad_c):
        DnnBase.__init__(self, ['dnn_rnn_gi.c'], 'dnn_rnn_gi')
        self.rnn_mode = rnn_mode
        self.grad_h = grad_h
        self.grad_c = grad_c
        if self.grad_c:
            # A cell-state gradient only exists for LSTM.
            assert self.rnn_mode == 'lstm'

    def dnn_context(self, node):
        # Output 0 is the reserve (a CData), so take the context from dx.
        return node.outputs[1].type.context_name

    def make_node(self, desc, x, y, dy, dhy, dcy, w, hx, cx, reserve):
        # We trust the callers here
        xshp = as_scalar(x.shape[2]).astype('uint64')
        inputs = [desc, xshp, y, dy, w, hx, reserve]
        outputs = [reserve.type(), x.type(), hx.type()]
        if self.rnn_mode == 'lstm':
            inputs.append(cx)
            outputs.append(cx.type())
        if self.grad_h:
            inputs.append(dhy)
        if self.grad_c:
            inputs.append(dcy)
        return Apply(self, inputs, outputs)

    # We have special requirements so this is hooking into COp
    def format_c_function_args(self, inp, out):
        # Pad the optional arguments (cx, dhy, dcy) with NULL so the C
        # function always receives the same argument list.
        rinp = inp[:7]
        others = inp[7:]
        if self.rnn_mode == 'lstm':
            rinp.append(others.pop(0))
        else:
            rinp.append('NULL')
        if self.grad_h:
            rinp.append(others.pop(0))
        else:
            rinp.append('NULL')
        if self.grad_c:
            rinp.append(others.pop(0))
        else:
            rinp.append('NULL')
        assert len(others) == 0
        return COp.format_c_function_args(self, rinp, out)
class GpuDnnRNNGradWeights(DnnBase):
    """Backward pass of a cuDNN RNN with respect to its flat weights."""

    __props__ = ()

    def __init__(self):
        DnnBase.__init__(self, ['dnn_rnn_gw.c'], 'dnn_rnn_gw')

    def make_node(self, desc, x, hx, y, reserve, w):
        # We trust the callers here
        # Only the flat size of w is needed on the C side, not w itself.
        wsize = as_scalar(w.shape[0]).astype('uint64')
        return Apply(self,
                     [desc, wsize, x, hx, y, reserve],
                     [w.type()])
class RNNBlock(object):
    """User-facing helper wrapping the cuDNN RNN descriptor and ops."""

    def __init__(self, dtype, hidden_size, num_layers, rnn_mode,
                 input_mode='linear', direction_mode='unidirectional',
                 context_name=None):
        """
        dtype: data type of computation
        hidden_size: int
        num_layers: int
        rnn_mode: {'rnn_relu', 'rnn_tanh', 'lstm', 'gru'}
            See cudnn documentation for cudnnRNNMode_t.
        input_mode: {'linear', 'skip'}
            linear: input will be multiplied by a biased matrix
            skip: No operation is performed on the input.  The size must match the hidden size.
        direction_mode: {'unidirectional', 'bidirectional'}
            unidirectional: The network operates recurrently from the
                first input to the last.
            bidirectional: The network operates from first to last then from last to first and concatenates the results at each layer.
        """
        # This is not supported for any value other than 0, so don't change it
        ddesc, states = _make_dropout_desc(0, 4242, context_name)
        self.ddesc = ddesc
        self.dstates = states
        self.desc = _make_rnn_desc(hidden_size, num_layers,
                                   ddesc, rnn_mode, input_mode,
                                   direction_mode, dtype, context_name)
        self.rnn_mode = rnn_mode
        self.direction_mode = direction_mode
        self.context_name = context_name
        self.dtype = dtype

    def get_param_size(self, input_size):
        # Number of dtype elements (not bytes) needed for the weights.
        bytesize = _get_param_size(self.desc, input_size, self.dtype,
                                   self.context_name)
        bytesize = int(bytesize)
        # The buffer must hold a whole number of dtype elements.
        assert bytesize % numpy.dtype(self.dtype).itemsize == 0
        return bytesize // numpy.dtype(self.dtype).itemsize

    def split_params(self, w, layer, input_size):
        # Per-gate views into the shared flat weight vector `w`.
        if not isinstance(w, GpuArraySharedVariable):
            raise TypeError("split_params only works on gpuarray shared variables")
        return _split_rnn_params(w, self.desc, layer, input_size, self.dtype, self.rnn_mode)

    def apply(self, w, x, hx, cx=None):
        # Build the forward graph and return (y, hy[, cy]).
        # Don't return the reserve as an output
        return GpuDnnRNNOp(self.rnn_mode, self.direction_mode)(
            rnndesc_type.make_constant(self.desc),
            w, x, hx, cx, return_list=True)[1:]
def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
epsilon=1e-4):
......
......@@ -149,41 +149,3 @@ static int c_make_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t *desc) {
#section init_code
setup_ext_cuda();
#section support_code_struct
PyGpuContextObject *ctx;
cudnnHandle_t APPLY_SPECIFIC(_handle);
#section init_code_struct
{
// We need to keep a reference here to have it available in the destructor.
ctx = PARAMS;
Py_INCREF(ctx);
cuda_enter(PARAMS->ctx);
cudnnStatus_t err;
APPLY_SPECIFIC(_handle) = NULL;
if ((err = cudnnCreate(&APPLY_SPECIFIC(_handle))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not create cuDNN handle: %s",
cudnnGetErrorString(err));
cuda_exit(PARAMS->ctx);
FAIL;
}
if ((err = cudnnSetStream(APPLY_SPECIFIC(_handle),
cuda_get_stream(PARAMS->ctx))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "Could not set cudnn stream: %s",
cudnnGetErrorString(err));
cuda_exit(PARAMS->ctx);
FAIL;
}
cuda_exit(PARAMS->ctx);
}
#section cleanup_code_struct
cuda_enter(ctx->ctx);
cudnnDestroy(APPLY_SPECIFIC(_handle));
cuda_exit(ctx->ctx);
Py_DECREF((PyObject *)ctx);
......@@ -3,7 +3,9 @@
int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
PyGpuArrayObject *bias, npy_float64 epsilon,
PyGpuArrayObject **outp, PyGpuArrayObject **x_mean,
PyGpuArrayObject **x_invstd, PyGpuContextObject *c) {
PyGpuArrayObject **x_invstd, cudnnHandle_t _handle) {
PyGpuContextObject *c = inp->context;
if (c_set_tensorNd(inp, bn_input) != 0)
return 1;
if (c_set_tensorNd(scale, bn_params) != 0)
......@@ -37,7 +39,7 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
beta = (void *)&fbeta;
}
cudnnStatus_t err = cudnnBatchNormalizationForwardTraining(
APPLY_SPECIFIC(_handle),
_handle,
MODE,
alpha,
beta,
......
......@@ -24,7 +24,9 @@ int dnn_batchnorm_grad(PyGpuArrayObject *inp, PyGpuArrayObject *doutp,
PyGpuArrayObject *scale, PyGpuArrayObject *x_mean,
PyGpuArrayObject *x_invstd, npy_float64 epsilon,
PyGpuArrayObject **dinp, PyGpuArrayObject **dscale,
PyGpuArrayObject **dbias, PyGpuContextObject *c) {
PyGpuArrayObject **dbias, cudnnHandle_t _handle) {
PyGpuContextObject *c = inp->context;
if (c_set_tensorNd(inp, bn_input) != 0)
return 1;
if (c_set_tensorNd(doutp, bn_doutput) != 0)
......@@ -66,7 +68,7 @@ int dnn_batchnorm_grad(PyGpuArrayObject *inp, PyGpuArrayObject *doutp,
betaParam = (void *)&fbeta;
}
cudnnStatus_t err = cudnnBatchNormalizationBackward(
APPLY_SPECIFIC(_handle),
_handle,
MODE,
alphaData,
betaData,
......
......@@ -3,7 +3,9 @@
int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
PyGpuArrayObject *bias, PyGpuArrayObject *est_mean,
PyGpuArrayObject *est_var, npy_float64 epsilon,
PyGpuArrayObject **outp, PyGpuContextObject *c) {
PyGpuArrayObject **outp, cudnnHandle_t _handle) {
PyGpuContextObject *c = inp->context;
if (c_set_tensorNd(inp, bn_input) != 0)
return 1;
if (c_set_tensorNd(scale, bn_params) != 0)
......@@ -33,7 +35,7 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
beta = (void *)&fbeta;
}
cudnnStatus_t err = cudnnBatchNormalizationForwardInference(
APPLY_SPECIFIC(_handle),
_handle,
MODE,
alpha,
beta,
......
#section support_code

/* Create a cuDNN dropout descriptor and its device-side RNG state buffer.
 * On success *odesc owns the new descriptor and *ostates owns a fresh
 * uint8 array of the size cuDNN requires.
 * Returns 0 on success, -1 with a Python error set on failure. */
int dnn_dropout_desc(float dropout, unsigned long long seed,
                     PyGpuContextObject *c,
                     cudnnDropoutDescriptor_t *odesc,
                     PyGpuArrayObject **ostates,
                     cudnnHandle_t _handle) {
  PyGpuArrayObject *states;
  cudnnDropoutDescriptor_t desc;
  size_t states_sz;
  cudnnStatus_t err;

  cuda_enter(c->ctx);
  err = cudnnCreateDropoutDescriptor(&desc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_SetString(PyExc_RuntimeError, "Can't create dropout descriptor");
    cuda_exit(c->ctx);
    return -1;
  }

  /* Can't fail according to docs */
  cudnnDropoutGetStatesSize(_handle, &states_sz);

  states = pygpu_empty(1, &states_sz, GA_UBYTE, GA_C_ORDER, c, Py_None);
  if (states == NULL) {
    cudnnDestroyDropoutDescriptor(desc);
    cuda_exit(c->ctx);
    return -1;
  }

  err = cudnnSetDropoutDescriptor(desc, _handle, dropout,
                                  PyGpuArray_DEV_DATA(states),
                                  states_sz, seed);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_SetString(PyExc_RuntimeError, "Can't set dropout descriptor");
    Py_DECREF((PyObject *)states);
    cudnnDestroyDropoutDescriptor(desc);
    cuda_exit(c->ctx);
    return -1;
  }
  cuda_exit(c->ctx);

  *odesc = desc;
  *ostates = states;
  return 0;
}
#section support_code
/* Apply cuDNN dropout (forward, training) to x.
 *
 * x       : input array.
 * desc    : pointer to a previously configured dropout descriptor.
 * state   : RNG state array created by dnn_dropout_desc.
 * y       : out; receives the dropped-out output (same shape as x).
 * ostate  : out; receives a new reference to *state*.
 * reserve : out; receives the reserve buffer needed by the backward pass.
 * _handle : cuDNN handle to run the operation on.
 *
 * Returns 0 on success, -1 on failure with a Python exception set.
 *
 * Fixes relative to the previous version:
 *  - `res_zs` was an undeclared identifier (typo for `res_sz`).
 *  - a failed reserve allocation fell through and used a NULL buffer
 *    (missing `return -1`).
 *  - the output descriptor was built from the output *pointer* instead
 *    of the prepared array (`*y`), and the same for DEV_DATA.
 *  - cudnnDropoutGetReserveSpaceSize takes the input tensor descriptor,
 *    not the dropout descriptor, and cudnnDropoutForward takes the
 *    descriptor value, not a pointer to it.
 *  - the context type is PyGpuContextObject, as everywhere else here.
 */
int dnn_dropout_fwd(PyGpuArrayObject *x,
                    cudnnDropoutDescriptor_t *desc,
                    PyGpuArrayObject *state,
                    PyGpuArrayObject **y,
                    PyGpuArrayObject **ostate,
                    gpudata **reserve,
                    cudnnHandle_t _handle) {
  PyGpuContextObject *c = x->context;
  cudnnTensorDescriptor_t xdesc;
  cudnnTensorDescriptor_t ydesc;
  gpudata *res;
  size_t res_sz;
  cudnnStatus_t err;

  if (c_make_tensorNd(x, &xdesc))
    return -1;

  if (theano_prep_output(y, x->ga.nd, x->ga.dimensions, x->ga.typecode,
                         GA_C_ORDER, c)) {
    cudnnDestroyTensorDescriptor(xdesc);
    return -1;
  }

  if (c_make_tensorNd(*y, &ydesc)) {
    cudnnDestroyTensorDescriptor(xdesc);
    return -1;
  }

  /* This can't fail according to the docs */
  err = cudnnDropoutGetReserveSpaceSize(xdesc, &res_sz);

  res = gpudata_alloc(c->ctx, res_sz, NULL, 0, NULL);
  if (res == NULL) {
    cudnnDestroyTensorDescriptor(xdesc);
    cudnnDestroyTensorDescriptor(ydesc);
    PyErr_SetString(PyExc_RuntimeError, "Could not allocate reserve for dropout");
    return -1;
  }
  /* Only publish the outputs once the allocation succeeded, so the
     caller never sees a half-initialized result. */
  *reserve = res;
  *ostate = state;
  Py_INCREF((PyObject *)state);

  cuda_enter(c->ctx);
  err = cudnnDropoutForward(_handle, *desc, xdesc, PyGpuArray_DEV_DATA(x),
                            ydesc, PyGpuArray_DEV_DATA(*y), *(void **)res,
                            res_sz);
  cudnnDestroyTensorDescriptor(xdesc);
  cudnnDestroyTensorDescriptor(ydesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not run dropout: %s",
                 cudnnGetErrorString(err));
    cuda_exit(c->ctx);
    return -1;
  }
  cuda_exit(c->ctx);
  return 0;
}
......@@ -26,11 +26,12 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cudnnConvolutionDescriptor_t desc,
double alpha, double beta,
PyGpuArrayObject **output,
PyGpuContextObject *c) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
float af = alpha, bf = beta;
cudnnHandle_t _handle) {
PyGpuContextObject *c = input->context;
void *alpha_p;
void *beta_p;
float af = alpha, bf = beta;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
PyErr_SetString(PyExc_ValueError,
......@@ -92,7 +93,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
int count;
cudnnConvolutionFwdAlgoPerf_t choice;
err = cudnnFindConvolutionForwardAlgorithm(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
desc, APPLY_SPECIFIC(output), 1, &count, &choice);
if (err != CUDNN_STATUS_SUCCESS) {
......@@ -115,7 +116,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
}
err = cudnnGetConvolutionForwardAlgorithm(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
desc, APPLY_SPECIFIC(output),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
......@@ -198,7 +199,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
{
size_t worksize;
gpudata *workspace;
err = cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle),
err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns),
desc,
......@@ -211,7 +212,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
// TODO: Print a warning
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
err = cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle),
err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns),
desc,
......@@ -248,7 +249,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnConvolutionForward(
APPLY_SPECIFIC(_handle),
_handle,
alpha_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
......
......@@ -25,11 +25,12 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
PyGpuArrayObject *im,
cudnnConvolutionDescriptor_t desc,
double alpha, double beta, PyGpuArrayObject **input,
PyGpuContextObject *c) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
float af = alpha, bf = beta;
cudnnHandle_t _handle) {
PyGpuContextObject *c = kerns->context;
void *alpha_p;
void *beta_p;
float af = alpha, bf = beta;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1]) {
PyErr_SetString(PyExc_ValueError, "images and kernel must have the same "
......@@ -93,7 +94,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cudnnConvolutionBwdDataAlgoPerf_t choice;
err = cudnnFindConvolutionBackwardDataAlgorithm(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
_handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(input), 1, &count, &choice);
if (err != CUDNN_STATUS_SUCCESS) {
......@@ -116,7 +117,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
}
err = cudnnGetConvolutionBackwardDataAlgorithm(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
_handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(input),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
......@@ -193,7 +194,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
gpudata *workspace;
err = cudnnGetConvolutionBackwardDataWorkspaceSize(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
_handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(input), algo, &worksize);
if (err != CUDNN_STATUS_SUCCESS) {
......@@ -218,7 +219,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cuda_wait((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnConvolutionBackwardData(
APPLY_SPECIFIC(_handle),
_handle,
alpha_p,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
......
......@@ -25,11 +25,12 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
PyGpuArrayObject *km,
cudnnConvolutionDescriptor_t desc,
double alpha, double beta, PyGpuArrayObject **kerns,
PyGpuContextObject *c) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
float af = alpha, bf = beta;
cudnnHandle_t _handle) {
PyGpuContextObject *c = input->context;
void *alpha_p;
void *beta_p;
float af = alpha, bf = beta;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1]) {
PyErr_SetString(PyExc_ValueError,
......@@ -93,7 +94,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cudnnConvolutionBwdFilterAlgoPerf_t choice;
err = cudnnFindConvolutionBackwardFilterAlgorithm(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(kerns), 1, &count, &choice);
if (err != CUDNN_STATUS_SUCCESS) {
......@@ -117,7 +118,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
}
err = cudnnGetConvolutionBackwardFilterAlgorithm(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(kerns),
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
......@@ -181,7 +182,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
gpudata *workspace;
err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(kerns), algo, &worksize);
if (err != CUDNN_STATUS_SUCCESS) {
......@@ -205,7 +206,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnConvolutionBackwardFilter(
APPLY_SPECIFIC(_handle),
_handle,
alpha_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
......
......@@ -42,9 +42,10 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
PyArrayObject *stride,
PyArrayObject *pad,
PyGpuArrayObject **out,
PyGpuContextObject *c) {
cudnnStatus_t err;
cudnnHandle_t _handle) {
PyGpuContextObject *c = img->context;
size_t dims[5];
cudnnStatus_t err;
if (!GpuArray_IS_C_CONTIGUOUS(&img->ga)) {
PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
......@@ -122,7 +123,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
cuda_wait((*out)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnPoolingForward(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(pool),
_handle, APPLY_SPECIFIC(pool),
alpha,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(img),
beta,
......
......@@ -64,7 +64,8 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
PyArrayObject *stride,
PyArrayObject *pad,
PyGpuArrayObject **inp_grad,
PyGpuContextObject *c) {
cudnnHandle_t _handle) {
PyGpuContextObject *c = inp->context;
cudnnStatus_t err;
if (!GpuArray_IS_C_CONTIGUOUS(&inp->ga)) {
......@@ -153,7 +154,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
cuda_wait((*inp_grad)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnPoolingBackward(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(pool),
_handle, APPLY_SPECIFIC(pool),
alpha,
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(out),
APPLY_SPECIFIC(output_grad), PyGpuArray_DEV_DATA(out_grad),
......
#section support_code
/* Build and configure a cuDNN RNN descriptor.
 *
 * hidden_size, num_layers : RNN geometry.
 * ddesc                   : dropout descriptor applied between layers.
 * input_mode, direction_mode, rnn_mode : raw cuDNN enum values.
 * dtype                   : gpuarray typecode of the data.
 * odesc                   : out; receives the descriptor on success.
 * _handle                 : unused here, kept for interface uniformity.
 *
 * Returns 0 on success, -1 on failure with a Python exception set.
 */
int dnn_rnn_desc(int hidden_size, int num_layers,
                 cudnnDropoutDescriptor_t ddesc,
                 int input_mode, int direction_mode, int rnn_mode,
                 int dtype, cudnnRNNDescriptor_t *odesc,
                 cudnnHandle_t _handle) {
  cudnnDataType_t data_type;
  cudnnRNNDescriptor_t desc;
  cudnnStatus_t err;

  /* Map the gpuarray typecode onto the cuDNN data type enum. */
  if (dtype == GA_FLOAT) {
    data_type = CUDNN_DATA_FLOAT;
  } else if (dtype == GA_DOUBLE) {
    data_type = CUDNN_DATA_DOUBLE;
  } else if (dtype == GA_HALF) {
    data_type = CUDNN_DATA_HALF;
  } else {
    PyErr_SetString(PyExc_ValueError, "Unsupported data type");
    return -1;
  }

  err = cudnnCreateRNNDescriptor(&desc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_SetString(PyExc_RuntimeError, "Can't create RNN descriptor");
    return -1;
  }

  err = cudnnSetRNNDescriptor(desc, hidden_size, num_layers, ddesc,
                              (cudnnRNNInputMode_t)input_mode,
                              (cudnnDirectionMode_t)direction_mode,
                              (cudnnRNNMode_t)rnn_mode, data_type);
  if (err != CUDNN_STATUS_SUCCESS) {
    /* Don't leak the descriptor if configuration fails. */
    cudnnDestroyRNNDescriptor(desc);
    PyErr_SetString(PyExc_RuntimeError, "Can't set RNN descriptor");
    return -1;
  }

  *odesc = desc;
  return 0;
}
#section support_code
/* Run the cuDNN RNN forward pass in training mode.
 *
 * desc       : configured RNN descriptor.
 * w          : flat weight buffer.
 * x          : input, laid out (seqLength, miniBatch, inputSize).
 * hx         : initial hidden state, 3-d; last dim is hidden*directions.
 * cx         : initial cell state, or NULL for non-LSTM modes.
 * reserve    : out; reserve buffer kept alive for the backward pass
 *              (ownership goes to the caller on success).
 * y, hy, cy  : out; output sequence, final hidden and (if cx) cell state.
 * _handle    : cuDNN handle.
 *
 * Returns 0 on success, -1 on failure with a Python exception set.
 * NOTE(review): on failure after a successful reserve allocation,
 * *reserve is left for the caller to release — confirm against callers.
 */
int dnn_rnn_fwd(cudnnRNNDescriptor_t desc,
                PyGpuArrayObject *w, PyGpuArrayObject *x,
                PyGpuArrayObject *hx, PyGpuArrayObject *cx,
                gpudata **reserve, PyGpuArrayObject **y,
                PyGpuArrayObject **hy, PyGpuArrayObject **cy,
                cudnnHandle_t _handle) {
  PyGpuContextObject *c = x->context;
  cudnnTensorDescriptor_t xdesc = NULL;
  cudnnTensorDescriptor_t hxdesc = NULL;
  cudnnTensorDescriptor_t cxdesc = NULL;
  cudnnTensorDescriptor_t ydesc = NULL;
  cudnnTensorDescriptor_t hydesc = NULL;
  cudnnTensorDescriptor_t cydesc = NULL;
  cudnnFilterDescriptor_t wdesc = NULL;
  cudnnTensorDescriptor_t *xl = NULL;
  cudnnTensorDescriptor_t *yl = NULL;
  gpudata *workspace = NULL;
  size_t worksize, ressize;
  size_t seqLength = PyGpuArray_DIM(x, 0);
  size_t miniBatch = PyGpuArray_DIM(x, 1);
  size_t inputSize = PyGpuArray_DIM(x, 2);
  size_t hiddenSizeDir = PyGpuArray_DIM(hx, 2);
  size_t shape[3];
  int strs[3], dims[3];
  cudnnStatus_t err;
  cudnnDataType_t dt;
  int res = -1;

  /* Translate the gpuarray typecode into a cuDNN data type. */
  switch (x->ga.typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Unsupported data type for x");
    return -1;
  }

  // This is early to match the exit() in the fail label.
  cuda_enter(c->ctx);

  err = cudnnCreateTensorDescriptor(&xdesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create xdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  /* Per-timestep descriptor: (batch, inputSize, 1), C-contiguous. */
  dims[0] = PyGpuArray_DIM(x, 1);
  dims[1] = PyGpuArray_DIM(x, 2);
  dims[2] = 1;

  strs[0] = dims[1] * dims[2];
  strs[1] = dims[2];
  strs[2] = 1;

  err = cudnnSetTensorNdDescriptor(xdesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set xdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  if (c_make_tensorNd(hx, &hxdesc) != 0)
    goto fail;

  /* cx (and its descriptor) is optional: only LSTM modes use it. */
  if (cx != NULL)
    if (c_make_tensorNd(cx, &cxdesc) != 0)
      goto fail;

  if (c_make_filter(w, &wdesc) != 0)
    goto fail;

  shape[0] = seqLength;
  shape[1] = miniBatch;
  shape[2] = hiddenSizeDir;
  if (theano_prep_output(y, 3, shape, x->ga.typecode, GA_C_ORDER, c) != 0)
    goto fail;

  err = cudnnCreateTensorDescriptor(&ydesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  /* Per-timestep output descriptor: (batch, hidden*dirs, 1). */
  dims[0] = shape[1];
  dims[1] = shape[2];
  dims[2] = 1;

  strs[0] = dims[2] * dims[1];
  strs[1] = dims[2];
  strs[2] = 1;

  err = cudnnSetTensorNdDescriptor(ydesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  if (theano_prep_output(hy, 3, PyGpuArray_DIMS(hx),
                         hx->ga.typecode, GA_C_ORDER, c) != 0)
    goto fail;

  if (c_make_tensorNd(*hy, &hydesc) != 0)
    goto fail;

  if (cy != NULL) {
    if (theano_prep_output(cy, 3, PyGpuArray_DIMS(cx),
                           cx->ga.typecode, GA_C_ORDER, c) != 0)
      goto fail;

    if (c_make_tensorNd(*cy, &cydesc) != 0)
      goto fail;
  }

  /* cuDNN wants one descriptor per timestep; every timestep has the
     same layout, so the array just repeats the same descriptor. */
  xl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), seqLength);
  if (xl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }

  for (size_t i = 0; i < seqLength; i++)
    xl[i] = xdesc;

  yl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), seqLength);
  if (yl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }

  for (size_t i = 0; i < seqLength; i++)
    yl[i] = ydesc;

  err = cudnnGetRNNWorkspaceSize(_handle, desc, (int)seqLength,
                                 xl, &worksize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not get worksize: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
  if (workspace == NULL) {
    PyErr_Format(PyExc_RuntimeError, "Could not allocate workspace");
    goto fail;
  }

  err = cudnnGetRNNTrainingReserveSize(_handle, desc, (int)seqLength,
                                       xl, &ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not get reserve size: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  /* The reserve outlives this call: the backward pass needs it. */
  *reserve = gpudata_alloc(c->ctx, ressize, NULL, 0, NULL);
  if (*reserve == NULL) {
    PyErr_Format(PyExc_RuntimeError, "Could not allocate reserve");
    goto fail;
  }

  err = cudnnRNNForwardTraining(_handle, desc, (int)seqLength,
                                xl, PyGpuArray_DEV_DATA(x),
                                hxdesc, PyGpuArray_DEV_DATA(hx),
                                cxdesc, cx ? PyGpuArray_DEV_DATA(cx) : NULL,
                                wdesc, PyGpuArray_DEV_DATA(w),
                                yl, PyGpuArray_DEV_DATA(*y),
                                hydesc, PyGpuArray_DEV_DATA(*hy),
                                cydesc, cy ? PyGpuArray_DEV_DATA(*cy) : NULL,
                                *(void **)workspace, worksize,
                                *(void **)(*reserve), ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could run RNN: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  res = 0;

 fail:
  /* Shared cleanup for success and failure paths; every descriptor is
     either NULL or valid at this point. */
  if (xdesc != NULL)
    cudnnDestroyTensorDescriptor(xdesc);
  if (hxdesc != NULL)
    cudnnDestroyTensorDescriptor(hxdesc);
  if (cxdesc != NULL)
    cudnnDestroyTensorDescriptor(cxdesc);
  if (wdesc != NULL)
    cudnnDestroyFilterDescriptor(wdesc);
  if (ydesc != NULL)
    cudnnDestroyTensorDescriptor(ydesc);
  if (hydesc != NULL)
    cudnnDestroyTensorDescriptor(hydesc);
  if (cydesc != NULL)
    cudnnDestroyTensorDescriptor(cydesc);
  free(xl);
  free(yl);
  if (workspace != NULL)
    gpudata_release(workspace);
  cuda_exit(c->ctx);
  return res;
}
#section support_code
/* cuDNN RNN backward-data pass: gradients wrt inputs and initial states.
 *
 * desc     : configured RNN descriptor (must match the forward pass).
 * xshp     : the input's innermost size (inputSize) of the forward x.
 * y, dy    : forward output and its gradient (same shape).
 * w        : flat weight buffer.
 * hx, cx   : initial hidden / (optional) cell states from the forward.
 * reserve  : reserve buffer produced by the forward pass (read-only here;
 *            a private copy is made into *oreserve).
 * dhy, dcy : optional gradients wrt the final hidden / cell states.
 * oreserve : out; the copied reserve, updated by cuDNN, for the weight
 *            gradient pass.
 * dx, dhx, dcx : out; gradients wrt x, hx and (if cx) cx.
 * _handle  : cuDNN handle.
 *
 * Returns 0 on success, -1 on failure with a Python exception set.
 */
int dnn_rnn_gi(cudnnRNNDescriptor_t desc, npy_uint64 xshp,
               PyGpuArrayObject *y, PyGpuArrayObject *dy,
               PyGpuArrayObject *w, PyGpuArrayObject *hx,
               gpudata *reserve, PyGpuArrayObject *cx,
               PyGpuArrayObject *dhy, PyGpuArrayObject *dcy,
               gpudata **oreserve, PyGpuArrayObject **dx,
               PyGpuArrayObject **dhx, PyGpuArrayObject **dcx,
               cudnnHandle_t _handle) {
  PyGpuContextObject *c = y->context;
  cudnnTensorDescriptor_t ydesc = NULL;
  cudnnTensorDescriptor_t dhydesc = NULL;
  cudnnTensorDescriptor_t dcydesc = NULL;
  cudnnFilterDescriptor_t wdesc = NULL;
  cudnnTensorDescriptor_t hxdesc = NULL;
  cudnnTensorDescriptor_t cxdesc = NULL;
  cudnnTensorDescriptor_t dxdesc = NULL;
  cudnnTensorDescriptor_t dhxdesc = NULL;
  cudnnTensorDescriptor_t dcxdesc = NULL;
  cudnnTensorDescriptor_t *yl = NULL;
  cudnnTensorDescriptor_t *dxl = NULL;
  gpudata *workspace = NULL;
  size_t worksize, ressize;
  size_t seqLength = PyGpuArray_DIM(y, 0);
  size_t miniBatch = PyGpuArray_DIM(y, 1);
  size_t inputSize = xshp;
  size_t shape[3];
  int dims[3], strs[3];
  cudnnStatus_t err;
  cudnnDataType_t dt;
  int res = -1;

  /* Translate the gpuarray typecode into a cuDNN data type. */
  switch (y->ga.typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Unsupported data type for y");
    return -1;
  }

  cuda_enter(c->ctx);

  err = cudnnCreateTensorDescriptor(&ydesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  /* We need to use the last two dimensions for this, this is not a typo */
  dims[0] = PyGpuArray_DIM(y, 1);
  dims[1] = PyGpuArray_DIM(y, 2);
  dims[2] = 1;

  strs[0] = dims[2] * dims[1];
  strs[1] = dims[2];
  strs[2] = 1;

  err = cudnnSetTensorNdDescriptor(ydesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  /* Final-state gradients are optional; NULL descriptors mean "absent". */
  if (dhy != NULL)
    if (c_make_tensorNd(dhy, &dhydesc) != 0)
      goto fail;

  if (dcy != NULL)
    if (c_make_tensorNd(dcy, &dcydesc) != 0)
      goto fail;

  if (c_make_filter(w, &wdesc) != 0)
    goto fail;

  if (c_make_tensorNd(hx, &hxdesc) != 0)
    goto fail;

  if (cx != NULL)
    if (c_make_tensorNd(cx, &cxdesc) != 0)
      goto fail;

  /* dx has the shape of the forward input: (seq, batch, inputSize). */
  shape[0] = seqLength;
  shape[1] = miniBatch;
  shape[2] = inputSize;
  if (theano_prep_output(dx, 3, shape, y->ga.typecode, GA_C_ORDER, c) != 0)
    goto fail;

  err = cudnnCreateTensorDescriptor(&dxdesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create dxdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  /* Again not a typo, we need to use the last two dimensions */
  dims[0] = shape[1];
  dims[1] = shape[2];
  dims[2] = 1;

  strs[0] = dims[2] * dims[1];
  strs[1] = dims[2];
  strs[2] = 1;

  err = cudnnSetTensorNdDescriptor(dxdesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set dxdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  if (theano_prep_output(dhx, 3, PyGpuArray_DIMS(hx), hx->ga.typecode,
                         GA_C_ORDER, c) != 0)
    goto fail;

  if (c_make_tensorNd(*dhx, &dhxdesc) != 0)
    goto fail;

  if (cx != NULL) {
    if (theano_prep_output(dcx, 3, PyGpuArray_DIMS(cx), cx->ga.typecode,
                           GA_C_ORDER, c) != 0)
      goto fail;

    if (c_make_tensorNd(*dcx, &dcxdesc) != 0)
      goto fail;
  }

  /* One descriptor per timestep; all timesteps share the same layout. */
  yl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), seqLength);
  if (yl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }

  for (size_t i = 0; i < seqLength; i++)
    yl[i] = ydesc;

  dxl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), seqLength);
  if (dxl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }

  for (size_t i = 0; i < seqLength; i++)
    dxl[i] = dxdesc;

  err = cudnnGetRNNWorkspaceSize(_handle, desc, (int)seqLength, dxl, &worksize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not get worksize: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
  if (workspace == NULL) {
    PyErr_Format(PyExc_RuntimeError, "Could not allocate workspace");
    goto fail;
  }

  err = cudnnGetRNNTrainingReserveSize(_handle, desc, (int)seqLength,
                                       dxl, &ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not get reserve size: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  /* cuDNN mutates the reserve during backward-data, so work on a copy
     and hand that copy to the caller for the weight-gradient pass. */
  *oreserve = gpudata_alloc(c->ctx, ressize, NULL, 0, NULL);
  if (*oreserve == NULL) {
    PyErr_Format(PyExc_RuntimeError, "Could not allocate reserve");
    goto fail;
  }

  if (gpudata_move(*oreserve, 0, reserve, 0, ressize) != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "could not copy reserve");
    goto fail;
  }

  err = cudnnRNNBackwardData(_handle, desc, (int)seqLength,
                             yl, PyGpuArray_DEV_DATA(y),
                             /* y and dy are the same shape */
                             yl, PyGpuArray_DEV_DATA(dy),
                             dhydesc, dhy ? PyGpuArray_DEV_DATA(dhy) : NULL,
                             dcydesc, dcy ? PyGpuArray_DEV_DATA(dcy) : NULL,
                             wdesc, PyGpuArray_DEV_DATA(w),
                             hxdesc, PyGpuArray_DEV_DATA(hx),
                             cxdesc, cx ? PyGpuArray_DEV_DATA(cx) : NULL,
                             dxl, PyGpuArray_DEV_DATA(*dx),
                             dhxdesc, PyGpuArray_DEV_DATA(*dhx),
                             dcxdesc, dcx ? PyGpuArray_DEV_DATA(*dcx) : NULL,
                             *(void **)workspace, worksize,
                             *(void **)(*oreserve), ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could run RNN grad inputs: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  res = 0;

 fail:
  /* Shared cleanup; every descriptor is either NULL or valid here. */
  if (ydesc != NULL)
    cudnnDestroyTensorDescriptor(ydesc);
  if (dhydesc != NULL)
    cudnnDestroyTensorDescriptor(dhydesc);
  if (dcydesc != NULL)
    cudnnDestroyTensorDescriptor(dcydesc);
  if (wdesc != NULL)
    cudnnDestroyFilterDescriptor(wdesc);
  if (hxdesc != NULL)
    cudnnDestroyTensorDescriptor(hxdesc);
  if (cxdesc != NULL)
    cudnnDestroyTensorDescriptor(cxdesc);
  if (dxdesc != NULL)
    cudnnDestroyTensorDescriptor(dxdesc);
  if (dhxdesc != NULL)
    cudnnDestroyTensorDescriptor(dhxdesc);
  if (dcxdesc != NULL)
    cudnnDestroyTensorDescriptor(dcxdesc);
  free(yl);
  free(dxl);
  if (workspace != NULL)
    gpudata_release(workspace);
  cuda_exit(c->ctx);
  return res;
}
#section support_code
/* cuDNN RNN backward-weights pass: gradient wrt the flat weight buffer.
 *
 * desc    : configured RNN descriptor (must match the forward pass).
 * _wsize  : number of elements of the flat weight buffer.
 * x, hx   : forward input and initial hidden state.
 * y       : forward output.
 * reserve : reserve buffer as updated by the backward-data pass.
 * dw      : out; zero-initialized then accumulated into by cuDNN.
 * _handle : cuDNN handle.
 *
 * Returns 0 on success, -1 on failure with a Python exception set.
 */
int dnn_rnn_gw(cudnnRNNDescriptor_t desc, npy_uint64 _wsize,
               PyGpuArrayObject *x, PyGpuArrayObject *hx,
               PyGpuArrayObject *y, gpudata *reserve,
               PyGpuArrayObject **dw, cudnnHandle_t _handle) {
  PyGpuContextObject *c = x->context;
  cudnnTensorDescriptor_t xdesc = NULL;
  cudnnTensorDescriptor_t hxdesc = NULL;
  cudnnTensorDescriptor_t ydesc = NULL;
  cudnnFilterDescriptor_t dwdesc = NULL;
  cudnnTensorDescriptor_t *xl = NULL;
  cudnnTensorDescriptor_t *yl = NULL;
  gpudata *workspace = NULL;
  size_t worksize, ressize;
  size_t iters = PyGpuArray_DIM(x, 0);
  size_t wsize = _wsize;
  int dims[3], strs[3];
  cudnnStatus_t err;
  cudnnDataType_t dt;
  int res = -1;

  /* Translate the gpuarray typecode into a cuDNN data type. */
  switch (x->ga.typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Unsupported data type for x");
    return -1;
  }

  // This is early to match the exit() in the fail label.
  cuda_enter(c->ctx);

  err = cudnnCreateTensorDescriptor(&xdesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create xdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  /* We need to use the last two dimensions for this, this is not a typo */
  dims[0] = PyGpuArray_DIM(x, 1);
  dims[1] = PyGpuArray_DIM(x, 2);
  dims[2] = 1;

  strs[0] = dims[2] * dims[1];
  strs[1] = dims[2];
  strs[2] = 1;

  err = cudnnSetTensorNdDescriptor(xdesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set xdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  if (c_make_tensorNd(hx, &hxdesc) != 0)
    goto fail;

  err = cudnnCreateTensorDescriptor(&ydesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  /* Again not a typo, we need to use the last two dimensions */
  dims[0] = PyGpuArray_DIM(y, 1);
  dims[1] = PyGpuArray_DIM(y, 2);
  dims[2] = 1;

  strs[0] = dims[2] * dims[1];
  strs[1] = dims[2];
  strs[2] = 1;

  err = cudnnSetTensorNdDescriptor(ydesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  if (theano_prep_output(dw, 1, &wsize, x->ga.typecode, GA_C_ORDER, c) != 0)
    goto fail;

  /* cuDNN accumulates into dw, so it must start at zero. */
  GpuArray_memset(&(*dw)->ga, 0);

  if (c_make_filter(*dw, &dwdesc) != 0)
    goto fail;

  /* One descriptor per timestep; all timesteps share the same layout. */
  xl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), iters);
  if (xl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }

  for (size_t i = 0; i < iters; i++)
    xl[i] = xdesc;

  yl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), iters);
  if (yl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }

  for (size_t i = 0; i < iters; i++)
    yl[i] = ydesc;

  err = cudnnGetRNNWorkspaceSize(_handle, desc, (int)iters,
                                 xl, &worksize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not get worksize: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
  if (workspace == NULL) {
    PyErr_Format(PyExc_RuntimeError, "Could not allocate workspace");
    goto fail;
  }

  /* Only the size is queried here; the reserve itself comes from the
     backward-data pass via the `reserve` argument. */
  err = cudnnGetRNNTrainingReserveSize(_handle, desc, (int)iters,
                                       xl, &ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not get reserve size: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  err = cudnnRNNBackwardWeights(_handle, desc, (int)iters,
                                xl, PyGpuArray_DEV_DATA(x),
                                hxdesc, PyGpuArray_DEV_DATA(hx),
                                yl, PyGpuArray_DEV_DATA(y),
                                *(void **)workspace, worksize,
                                dwdesc, PyGpuArray_DEV_DATA(*dw),
                                *(void **)reserve, ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could run RNN grad weights: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  res = 0;

 fail:
  /* Shared cleanup; every descriptor is either NULL or valid here. */
  if (xdesc != NULL)
    cudnnDestroyTensorDescriptor(xdesc);
  if (hxdesc != NULL)
    cudnnDestroyTensorDescriptor(hxdesc);
  if (ydesc != NULL)
    cudnnDestroyTensorDescriptor(ydesc);
  if (dwdesc != NULL)
    cudnnDestroyFilterDescriptor(dwdesc);
  free(xl);
  free(yl);
  if (workspace != NULL)
    gpudata_release(workspace);
  cuda_exit(c->ctx);
  return res;
}
#section support_code
/* Query the size of the RNN parameter (weight) buffer.
 *
 * desc        : configured RNN descriptor.
 * isize       : length-2 array holding the per-timestep input shape
 *               (batch, inputSize) as npy_uint64 values.
 * typecode    : gpuarray typecode of the data.
 * oparam_size : out; receives the size reported by cuDNN (in bytes,
 *               per cudnnGetRNNParamsSize).
 * _handle     : cuDNN handle.
 *
 * Returns 0 on success, -1 on failure with a Python exception set.
 *
 * Fix: the temporary tensor descriptor was leaked on the
 * cudnnSetTensorNdDescriptor and cudnnGetRNNParamsSize error paths; it
 * is now destroyed on every path after creation.
 */
int dnn_rnn_paramsize(cudnnRNNDescriptor_t desc,
                      PyArrayObject *isize,
                      npy_int32 typecode,
                      npy_uint64 *oparam_size,
                      cudnnHandle_t _handle) {
  cudnnTensorDescriptor_t xdesc;
  size_t param_size;
  cudnnStatus_t err;
  cudnnDataType_t dt;
  int shape[3];
  int strides[3];

  if (PyArray_DIM(isize, 0) != 2) {
    PyErr_SetString(PyExc_ValueError, "input_size should be of length two");
    return -1;
  }

  /* Translate the gpuarray typecode into a cuDNN data type. */
  switch (typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_ValueError, "Unsupported data type");
    return -1;
  }

  err = cudnnCreateTensorDescriptor(&xdesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_SetString(PyExc_RuntimeError, "Could not create tensor descriptor");
    return -1;
  }

  /* Describe one input timestep: (batch, inputSize, 1), C-contiguous. */
  shape[0] = *(npy_uint64 *)PyArray_GETPTR1(isize, 0);
  shape[1] = *(npy_uint64 *)PyArray_GETPTR1(isize, 1);
  shape[2] = 1;
  strides[0] = shape[2] * shape[1];
  strides[1] = shape[2];
  strides[2] = 1;

  err = cudnnSetTensorNdDescriptor(xdesc, dt, 3, shape, strides);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "Could not set tensor descriptor: %s",
                 cudnnGetErrorString(err));
    cudnnDestroyTensorDescriptor(xdesc);
    return -1;
  }

  err = cudnnGetRNNParamsSize(_handle, desc, xdesc, &param_size, dt);
  /* The descriptor is only needed for the query above. */
  cudnnDestroyTensorDescriptor(xdesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_SetString(PyExc_RuntimeError, "Could not get parameter size");
    return -1;
  }

  *oparam_size = param_size;
  return 0;
}
......@@ -35,7 +35,8 @@ if (APPLY_SPECIFIC(output) != NULL)
int APPLY_SPECIFIC(softmax)(PyGpuArrayObject *x,
PyGpuArrayObject **out,
PyGpuContextObject *c) {
cudnnHandle_t _handle) {
PyGpuContextObject *c = x->context;
cudnnStatus_t err;
if (c_set_tensorNd(x, APPLY_SPECIFIC(input)) != 0)
......@@ -77,7 +78,7 @@ int APPLY_SPECIFIC(softmax)(PyGpuArrayObject *x,
cuda_wait((*out)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnSoftmaxForward(
APPLY_SPECIFIC(_handle),
_handle,
SOFTMAX_ALGO,
SOFTMAX_MODE,
alpha,
......
......@@ -46,7 +46,8 @@ if (APPLY_SPECIFIC(dx) != NULL)
int APPLY_SPECIFIC(softmax_grad)(PyGpuArrayObject *dy,
PyGpuArrayObject *sm,
PyGpuArrayObject **dx,
PyGpuContextObject *c) {
cudnnHandle_t _handle) {
PyGpuContextObject *c = dy->context;
cudnnStatus_t err;
if (c_set_tensorNd(dy, APPLY_SPECIFIC(dy)) != 0)
......@@ -91,7 +92,7 @@ int APPLY_SPECIFIC(softmax_grad)(PyGpuArrayObject *dy,
cuda_wait((*dx)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnSoftmaxBackward(
APPLY_SPECIFIC(_handle),
_handle,
SOFTMAX_ALGO,
SOFTMAX_MODE,
alpha,
......
from __future__ import absolute_import, print_function, division
import theano
import theano.tensor as T
import numpy
class Model(object):
    """Container that chains layers and collects their trainable state.

    Tracks every registered layer, a flat list of all layer parameters,
    and any extra (non-gradient) updates the layers request.
    """

    def __init__(self, name=""):
        self.name = name
        self.layers = []
        self.params = []
        self.other_updates = {}

    def add_layer(self, layer):
        """Register ``layer`` and absorb its params and optional updates."""
        self.layers.append(layer)
        self.params.extend(layer.params)
        if hasattr(layer, 'other_updates'):
            for pair in layer.other_updates:
                self.other_updates[pair[0]] = pair[1]

    def get_params(self):
        """Return the flat list of parameters across all layers."""
        return self.params
def uniform(stdev, size):
    """Sample a uniform array with the given standard deviation and shape.

    A U(-b, b) distribution has stdev b / sqrt(3), so the bound is
    stdev * sqrt(3).  The result is cast to Theano's configured floatX.
    """
    bound = stdev * numpy.sqrt(3)
    samples = numpy.random.uniform(low=-bound, high=bound, size=size)
    return samples.astype(theano.config.floatX)
def linear_transform_weights(input_dim, output_dim,
                             param_list=None, name=""):
    """Create a theano shared weight matrix of shape (input_dim, output_dim).

    The matrix is uniformly initialized with stdev sqrt(2 / input_dim)
    and appended to ``param_list`` so the caller's optimizer can find it.

    Raises ValueError if ``param_list`` is None.  (Previously this was a
    bare ``assert``, which is stripped under ``python -O``; the check is
    now explicit and happens before any RNG state is consumed.)
    """
    if param_list is None:
        raise ValueError("param_list must be provided to collect the weight")
    weight_initialization = uniform(numpy.sqrt(2.0 / input_dim),
                                    (input_dim, output_dim))
    W = theano.shared(weight_initialization, name=name)
    param_list.append(W)
    return W
def bias_weights(length, param_list=None, name=""):
    """Create a zero-initialized theano shared bias of the given length.

    If ``param_list`` is provided, the new shared variable is appended
    to it so the caller's optimizer can find it.
    """
    zeros = numpy.zeros(length).astype(theano.config.floatX)
    bias = theano.shared(zeros, name=name)
    if param_list is not None:
        param_list.append(bias)
    return bias
class Layer(object):
    """Generic layer template which all layers should inherit.

    Tracks a display name and the list of trainable parameters.
    """

    def __init__(self, name=""):
        self.name = name
        self.params = []

    def get_params(self):
        """Return this layer's list of trainable parameters."""
        return self.params
class GRU(Layer):
    """Gated Recurrent Unit layer.

    Builds the GRU weight matrices (input-to-hidden ``W_*``,
    hidden-to-hidden ``R_*`` and their biases) and scans over the input
    layer's output to produce the hidden-state sequence, available via
    ``output()``.

    NOTE(review): the weight matrices must be created in this exact
    order — each call to ``linear_transform_weights``/``bias_weights``
    consumes numpy RNG state, so reordering would change the
    initialization.
    """
    def __init__(self, input_dim, output_dim, input_layer, s0=None, name=""):
        """Store layer geometry, build weights and unroll the recurrence.

        input_dim   : size of each input vector.
        output_dim  : hidden (and output) size.
        input_layer : upstream layer; its ``output()`` is the input sequence.
        s0          : initial hidden state passed to ``theano.scan``.
        name        : prefix used for the shared variables' names.
        """
        self.name = name
        self.input_dim = input_dim
        self.hidden_dim = output_dim
        self.output_dim = output_dim
        self.input_layer = input_layer
        self.X = input_layer.output()
        self.s0 = s0
        self.params = []
        '''Layers weights'''
        '''self.params is passed so that any paramters could be appended to it'''
        self.W_r = linear_transform_weights(input_dim, output_dim, param_list=self.params, name=name + ".W_r")
        self.b_wr = bias_weights((output_dim,), param_list=self.params, name=name + ".b_wr")

        self.W_i = linear_transform_weights(input_dim, output_dim, param_list=self.params, name=name + ".W_i")
        self.b_wi = bias_weights((output_dim,), param_list=self.params, name=name + ".b_wi")

        self.W_h = linear_transform_weights(input_dim, output_dim, param_list=self.params, name=name + ".W_h")
        self.b_wh = bias_weights((output_dim,), param_list=self.params, name=name + ".b_wh")

        self.R_r = linear_transform_weights(output_dim, output_dim, param_list=self.params, name=name + ".R_r")
        self.b_rr = bias_weights((output_dim,), param_list=self.params, name=name + ".b_rr")

        self.R_i = linear_transform_weights(output_dim, output_dim, param_list=self.params, name=name + ".R_i")
        self.b_ru = bias_weights((output_dim,), param_list=self.params, name=name + ".b_ru")

        self.R_h = linear_transform_weights(output_dim, output_dim, param_list=self.params, name=name + ".R_h")
        self.b_rh = bias_weights((output_dim,), param_list=self.params, name=name + ".b_rh")

        '''step through processed input to create output'''
        def step(inp, s_prev):
            # Update gate (i_t) and reset gate (r_t).
            i_t = T.nnet.sigmoid(
                T.dot(inp, self.W_i) + T.dot(s_prev, self.R_i) + self.b_wi + self.b_ru)
            r_t = T.nnet.sigmoid(
                T.dot(inp, self.W_r) + T.dot(s_prev, self.R_r) + self.b_wr + self.b_rr)
            # Candidate state; the reset gate modulates the recurrent term.
            h_hat_t = T.tanh(
                T.dot(inp, self.W_h) + (r_t * (T.dot(s_prev, self.R_h) + self.b_rh)) + self.b_wh)
            # Blend candidate and previous state with the update gate.
            s_curr = ((1.0 - i_t) * h_hat_t) + (i_t * s_prev)
            return s_curr

        outputs_info = self.s0

        states, updates = theano.scan(
            fn=step,
            sequences=[self.X],
            outputs_info=outputs_info
        )

        self.Y = states

    def output(self):
        """Return the sequence of hidden states produced by the scan."""
        return self.Y
class LSTM(Layer):
    """Single-layer LSTM implemented with ``theano.scan``.

    Weights are created in a fixed order (input-to-hidden W_* / b_w* for
    gates i, f, c, o, then recurrent R_* / b_r* in the same gate order)
    because consumers index ``self.params`` positionally.
    """

    def __init__(self, input_dim, output_dim, input_layer, s0=None, c0=None,
                 name=""):
        # Layer bookkeeping.
        self.name = name
        self.input_dim = input_dim
        self.hidden_dim = output_dim
        self.output_dim = output_dim
        self.input_layer = input_layer
        self.X = input_layer.output()
        self.s0 = s0
        self.c0 = c0
        # self.params is handed to the weight helpers so every created
        # parameter is appended to it.
        self.params = []

        # Input-to-hidden weight matrix and bias, one pair per gate.
        for gate in ('i', 'f', 'c', 'o'):
            setattr(self, 'W_' + gate,
                    linear_transform_weights(input_dim, output_dim,
                                             param_list=self.params,
                                             name=name + ".W_" + gate))
            setattr(self, 'b_w' + gate,
                    bias_weights((output_dim,), param_list=self.params,
                                 name=name + ".b_w" + gate))
        # Hidden-to-hidden (recurrent) weight matrix and bias, same order.
        for gate in ('i', 'f', 'c', 'o'):
            setattr(self, 'R_' + gate,
                    linear_transform_weights(output_dim, output_dim,
                                             param_list=self.params,
                                             name=name + ".R_" + gate))
            setattr(self, 'b_r' + gate,
                    bias_weights((output_dim,), param_list=self.params,
                                 name=name + ".b_r" + gate))

        def step(x, h_prev, c_prev):
            # One timestep of the standard LSTM recurrence.
            i_t = T.nnet.sigmoid(
                T.dot(x, self.W_i) + T.dot(h_prev, self.R_i) + self.b_wi + self.b_ri)
            f_t = T.nnet.sigmoid(
                T.dot(x, self.W_f) + T.dot(h_prev, self.R_f) + self.b_wf + self.b_rf)
            o_t = T.nnet.sigmoid(
                T.dot(x, self.W_o) + T.dot(h_prev, self.R_o) + self.b_ro + self.b_wo)
            c_hat_t = T.tanh(
                T.dot(x, self.W_c) + T.dot(h_prev, self.R_c) + self.b_wc + self.b_rc)
            c_t = f_t * c_prev + i_t * c_hat_t
            h_t = o_t * T.tanh(c_t)
            return h_t, c_t

        # Walk over the timesteps of the input sequence.
        states, updates = theano.scan(
            fn=step,
            sequences=[self.X],
            outputs_info=[self.s0, self.c0]
        )
        # Hidden-state and cell-state sequences, respectively.
        self.Y, self.C = states

    def output(self):
        # The layer's output is the hidden-state sequence.
        return self.Y
class FC(Layer):
    """Fully connected (affine) layer: output is X.W + b."""

    def __init__(self, input_dim, output_dim, input_layer, name=""):
        self.input_layer = input_layer
        self.name = name
        # Filled in by the weight helpers below.
        self.params = []
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.X = self.input_layer.output()
        self.W = linear_transform_weights(input_dim, output_dim,
                                          param_list=self.params,
                                          name=name + ".W")
        self.b = bias_weights((output_dim,), param_list=self.params,
                              name=name + ".b")

    def output(self):
        # Affine transform of the upstream layer's output.
        projected = T.dot(self.X, self.W)
        return projected + self.b
class WrapperLayer(Layer):
    """Adapter exposing a raw symbolic variable through the Layer API."""

    def __init__(self, X, name=""):
        self.name = name
        self.X = X
        # A wrapper owns no trainable parameters.
        self.params = []

    def output(self):
        return self.X
......@@ -19,6 +19,7 @@ from ..type import gpuarray_shared_constructor
from .config import mode_with_gpu, mode_without_gpu, test_ctx_name
from . import test_nnet
from .rnn_support import Model, GRU, LSTM, WrapperLayer
from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_FWD
......@@ -1434,3 +1435,249 @@ def test_batchnorm_inference():
utt.assert_allclose(outputs[4], outputs[4 + 5]) # dbias
utt.assert_allclose(outputs[5], outputs[5 + 5]) # dmean
utt.assert_allclose(outputs[6], outputs[6 + 5], rtol=2e-3, atol=4e-5) # dvar
def test_dnn_rnn_gru():
    """Compare cuDNN's fused GRU against a scan-based reference GRU.

    Builds a ``depth``-layer GRU twice -- once with ``dnn.RNNBlock`` and
    once with the pure-Theano ``GRU`` layer -- copies the reference weights
    into cuDNN's packed parameter buffer, then checks forward outputs and
    gradients (w.r.t. X, h0 and the parameters) for three cost wirings:
    through y only, through y and hy, and through hy only.
    """
    # test params
    input_dim = 32
    hidden_dim = 16
    batch_size = 2
    depth = 3
    timesteps = 5

    # test code
    X = T.tensor3('X')
    Y = T.tensor3('Y')
    h0 = T.tensor3('h0')

    rnnb = dnn.RNNBlock(theano.config.floatX, hidden_dim, depth, 'gru')
    psize = rnnb.get_param_size([batch_size, input_dim])
    params_cudnn = gpuarray_shared_constructor(
        numpy.zeros((psize,), dtype=theano.config.floatX))

    model = Model()
    last_layer = WrapperLayer(X)
    last_dim = input_dim
    for i in range(depth):
        gru = GRU(last_dim, hidden_dim, last_layer, s0=h0[i, :, :])
        model.add_layer(gru)
        last_layer = gru
        last_dim = hidden_dim
        # Mirror the reference layer's weights into the packed cuDNN buffer
        # so both implementations start from identical parameters.
        layer_params = gru.get_params()
        dnn_params = rnnb.split_params(params_cudnn, i,
                                       [batch_size, input_dim])
        for j, p in enumerate(dnn_params):
            p[:] = layer_params[j].get_value(borrow=True,
                                             return_internal_type=True)

    def funcs(out, params, hy=None):
        cost = 0
        # BUG FIX: compare against None explicitly. `out` and `hy` are
        # symbolic Theano variables, and truth-testing a Variable is
        # ambiguous (it relies on default object truthiness, and raises
        # TypeError in Theano versions where Variable.__bool__ raises).
        if out is not None:
            cost += T.mean((Y - out)**2)
        if hy is not None:
            cost += T.mean(hy**2)
        grad = T.grad(cost, [X, h0] + params)
        grad_fn = theano.function([X, Y, h0], grad, mode=mode_with_gpu,
                                  on_unused_input='ignore')
        return grad_fn

    ref_y = last_layer.output()
    # This will grab the hy from the scan implementation
    ref_hy = T.stack([model.layers[0].Y[-1],
                      model.layers[1].Y[-1],
                      model.layers[2].Y[-1]])

    y, hy = rnnb.apply(params_cudnn, X, h0)

    ref_fn = theano.function([X, h0], ref_y, mode=mode_with_gpu)
    cudnn_fn = theano.function([X, h0], y, mode=mode_with_gpu)

    # Test with grad connected to y
    ref_grad_fn = funcs(ref_y, model.get_params())
    cudnn_grad_fn = funcs(y, [params_cudnn])
    # Test with grad connected to both y and hy
    ref2_grad_fn = funcs(ref_y, model.get_params(), ref_hy)
    cudnn2_grad_fn = funcs(y, [params_cudnn], hy)
    # Test with grad connected to hy
    ref3_grad_fn = funcs(None, model.get_params(), ref_hy)
    cudnn3_grad_fn = funcs(None, [params_cudnn], hy)

    ref_grad_fns = [ref_grad_fn, ref2_grad_fn, ref3_grad_fn]
    cudnn_grad_fns = [cudnn_grad_fn, cudnn2_grad_fn, cudnn3_grad_fn]

    x_val = numpy.random.random((timesteps, batch_size, input_dim)).astype(theano.config.floatX)
    y_val = numpy.random.random((timesteps, batch_size, hidden_dim)).astype(theano.config.floatX)
    h0_val = numpy.random.random((depth, batch_size, hidden_dim)).astype(theano.config.floatX)

    ref_out = ref_fn(x_val, h0_val)
    cudnn_out = cudnn_fn(x_val, h0_val)
    utt.assert_allclose(ref_out, cudnn_out)

    for ref_grad_fn, cudnn_grad_fn in zip(ref_grad_fns, cudnn_grad_fns):
        ref_grads = ref_grad_fn(x_val, y_val, h0_val)
        cudnn_grads = cudnn_grad_fn(x_val, y_val, h0_val)
        # Gradients w.r.t. X and h0.
        utt.assert_allclose(ref_grads[0], cudnn_grads[0])
        utt.assert_allclose(ref_grads[1], cudnn_grads[1])
        # Parameter gradients: cuDNN returns one packed vector; split it
        # per layer and compare piece-by-piece with the reference.
        ref_grad_params = ref_grads[2:]
        cudnn_grad_params = gpuarray_shared_constructor(cudnn_grads[2])
        for i in range(depth):
            cudnn_grad_layer = rnnb.split_params(cudnn_grad_params, i,
                                                 [batch_size, input_dim])
            ref_grad_layer = ref_grad_params[i * len(cudnn_grad_layer):
                                             (i + 1) * len(cudnn_grad_layer)]
            for j, g in enumerate(cudnn_grad_layer):
                utt.assert_allclose(ref_grad_layer[j], g)
def test_dnn_rnn_lstm():
    """Check cuDNN's fused LSTM against a scan-based reference LSTM.

    Forward outputs and gradients (w.r.t. X, h0, c0 and the parameters)
    must match between the two implementations.
    """
    # Problem sizes kept tiny so the test stays fast.
    input_dim, hidden_dim = 32, 16
    batch_size, depth, timesteps = 2, 3, 5

    X = T.tensor3('X')
    Y = T.tensor3('Y')
    h0 = T.tensor3('h0')
    c0 = T.tensor3('c0')

    rnnb = dnn.RNNBlock(theano.config.floatX, hidden_dim, depth, 'lstm')
    psize = rnnb.get_param_size([batch_size, input_dim])
    params_cudnn = gpuarray_shared_constructor(
        numpy.zeros((psize,), dtype=theano.config.floatX))

    # Build the reference network and mirror its weights into the packed
    # cuDNN parameter buffer, layer by layer.
    model = Model()
    prev_layer = WrapperLayer(X)
    prev_dim = input_dim
    for layer_idx in range(depth):
        lstm = LSTM(prev_dim, hidden_dim, prev_layer,
                    s0=h0[layer_idx, :, :], c0=c0[layer_idx, :, :])
        model.add_layer(lstm)
        prev_layer, prev_dim = lstm, hidden_dim
        ref_params = lstm.get_params()
        packed = rnnb.split_params(params_cudnn, layer_idx,
                                   [batch_size, input_dim])
        for chunk, ref_p in zip(packed, ref_params):
            chunk[:] = ref_p.get_value(borrow=True,
                                       return_internal_type=True)

    def funcs(out, params):
        # Forward function plus gradient function for a squared-error cost.
        fn = theano.function([X, h0, c0], out, mode=mode_with_gpu)
        cost = T.mean((Y - out)**2)
        grad = T.grad(cost, [X, h0, c0] + params)
        grad_fn = theano.function([X, Y, h0, c0], grad, mode=mode_with_gpu)
        return fn, grad_fn

    ref_fn, ref_grad_fn = funcs(prev_layer.output(), model.get_params())
    cudnn_fn, cudnn_grad_fn = funcs(rnnb.apply(params_cudnn, X, h0, c0)[0],
                                    [params_cudnn])

    floatX = theano.config.floatX
    x_val = numpy.random.random(
        (timesteps, batch_size, input_dim)).astype(floatX)
    y_val = numpy.random.random(
        (timesteps, batch_size, hidden_dim)).astype(floatX)
    h0_val = numpy.random.random(
        (depth, batch_size, hidden_dim)).astype(floatX)
    c0_val = numpy.random.random(
        (depth, batch_size, hidden_dim)).astype(floatX)

    # Forward passes must agree.
    utt.assert_allclose(ref_fn(x_val, h0_val, c0_val),
                        cudnn_fn(x_val, h0_val, c0_val))

    ref_grads = ref_grad_fn(x_val, y_val, h0_val, c0_val)
    cudnn_grads = cudnn_grad_fn(x_val, y_val, h0_val, c0_val)
    # Gradients w.r.t. the symbolic inputs X, h0 and c0.
    for ref_g, cudnn_g in zip(ref_grads[:3], cudnn_grads[:3]):
        utt.assert_allclose(ref_g, cudnn_g)
    # Parameter gradients: split cuDNN's packed vector per layer and
    # compare against the reference gradients in matching order.
    ref_grads_params = ref_grads[3:]
    cudnn_grads_params = gpuarray_shared_constructor(cudnn_grads[3])
    for layer_idx in range(depth):
        cudnn_layer = rnnb.split_params(cudnn_grads_params, layer_idx,
                                        [batch_size, input_dim])
        nparams = len(cudnn_layer)
        ref_layer = ref_grads_params[layer_idx * nparams:
                                     (layer_idx + 1) * nparams]
        for ref_g, cudnn_g in zip(ref_layer, cudnn_layer):
            utt.assert_allclose(ref_g, cudnn_g)
def test_dnn_rnn_lstm_grad_c():
    """Check LSTM gradients when the cost is wired to the cell state only.

    Same cuDNN-vs-reference setup as test_dnn_rnn_lstm, but the cost is
    attached to the final cell state of every layer (cy) instead of the
    output sequence.
    """
    # Small problem sizes keep the test quick.
    input_dim, hidden_dim = 32, 16
    batch_size, depth, timesteps = 2, 3, 5

    X = T.tensor3('X')
    CY = T.tensor3('CY')
    h0 = T.tensor3('h0')
    c0 = T.tensor3('c0')

    rnnb = dnn.RNNBlock(theano.config.floatX, hidden_dim, depth, 'lstm')
    psize = rnnb.get_param_size([batch_size, input_dim])
    params_cudnn = gpuarray_shared_constructor(
        numpy.zeros((psize,), dtype=theano.config.floatX))

    # Reference network; its weights are copied into the cuDNN buffer so
    # both implementations start from identical parameters.
    model = Model()
    prev_layer = WrapperLayer(X)
    prev_dim = input_dim
    for layer_idx in range(depth):
        lstm = LSTM(prev_dim, hidden_dim, prev_layer,
                    s0=h0[layer_idx, :, :], c0=c0[layer_idx, :, :])
        model.add_layer(lstm)
        prev_layer, prev_dim = lstm, hidden_dim
        ref_params = lstm.get_params()
        packed = rnnb.split_params(params_cudnn, layer_idx,
                                   [batch_size, input_dim])
        for chunk, ref_p in zip(packed, ref_params):
            chunk[:] = ref_p.get_value(borrow=True,
                                       return_internal_type=True)

    def funcs(out, params):
        # Gradient function for a squared-error cost on the cell states.
        cost = T.mean((CY - out)**2)
        grad = T.grad(cost, [X, h0, c0] + params)
        return theano.function([X, CY, h0, c0], grad, mode=mode_with_gpu)

    # cuDNN exposes the final cell state directly; the reference version
    # stacks the last cell state of each layer's scan.
    _, _, cy = rnnb.apply(params_cudnn, X, h0, c0)
    ref_cy = T.stack([model.layers[0].C[-1],
                      model.layers[1].C[-1],
                      model.layers[2].C[-1]])

    ref_grad_fn = funcs(ref_cy, model.get_params())
    cudnn_grad_fn = funcs(cy, [params_cudnn])

    floatX = theano.config.floatX
    x_val = numpy.random.random(
        (timesteps, batch_size, input_dim)).astype(floatX)
    cy_val = numpy.random.random(
        (depth, batch_size, hidden_dim)).astype(floatX)
    h0_val = numpy.random.random(
        (depth, batch_size, hidden_dim)).astype(floatX)
    c0_val = numpy.random.random(
        (depth, batch_size, hidden_dim)).astype(floatX)

    ref_grads = ref_grad_fn(x_val, cy_val, h0_val, c0_val)
    cudnn_grads = cudnn_grad_fn(x_val, cy_val, h0_val, c0_val)
    # Gradients w.r.t. the symbolic inputs X, h0 and c0.
    for ref_g, cudnn_g in zip(ref_grads[:3], cudnn_grads[:3]):
        utt.assert_allclose(ref_g, cudnn_g)
    # Per-layer comparison of the packed parameter gradients.
    ref_grads_params = ref_grads[3:]
    cudnn_grads_params = gpuarray_shared_constructor(cudnn_grads[3])
    for layer_idx in range(depth):
        cudnn_layer = rnnb.split_params(cudnn_grads_params, layer_idx,
                                        [batch_size, input_dim])
        nparams = len(cudnn_layer)
        ref_layer = ref_grads_params[layer_idx * nparams:
                                     (layer_idx + 1) * nparams]
        for ref_g, cudnn_g in zip(ref_layer, cudnn_layer):
            utt.assert_allclose(ref_g, cudnn_g)
......@@ -68,6 +68,7 @@ def reg_context(name, ctx):
if not isinstance(ctx, gpuarray.GpuContext):
raise TypeError("context is not GpuContext")
_context_reg[name] = ctx
_props_map[ctx] = dict()
def get_context(name):
......@@ -96,6 +97,26 @@ def list_contexts():
"""
return _context_reg.keys()
# Mappings of properties to contexts. Please never use this if you
# can avoid it.
# This is basically a way to store "global" variables that depend on
# the context.
_props_map = {}
def _get_props(name):
    """Return the mutable property dict for the context registered as `name`."""
    return _props_map[get_context(name)]
def get_prop(name, k):
    """Look up the value stored under key `k` for the context named `name`."""
    props = _get_props(name)
    return props[k]
def set_prop(name, k, v):
    """Store value `v` under key `k` for the context named `name`."""
    props = _get_props(name)
    props[k] = v
# Private method
def _name_for_ctx(ctx):
......
......@@ -1102,7 +1102,8 @@ def _populate_grad_dict(var_to_app_to_idx,
str(o_shape) + " on an output of shape " +
str(g_shape))
input_grads = node.op.grad(inputs, new_output_grads)
input_grads = node.op.L_op(inputs, node.outputs,
new_output_grads)
if input_grads is None:
raise TypeError("%s.grad returned NoneType, "
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论