Commit 77cc5729 authored by Arnaud Bergeron

Add ops and support code for the RNN binding for cudnn.

Parent 4b94a811
from __future__ import absolute_import, print_function, division
import ctypes
import os
import sys
import warnings

import numpy

@@ -7,7 +9,7 @@ from six import integer_types

import theano
from theano import Op, Apply, tensor, config, Variable
from theano.scalar import as_scalar, constant, Log, get_scalar_type
from theano.tensor import as_tensor_variable
from theano.gradient import DisconnectedType, grad_not_implemented
from theano.gof import Optimizer, local_optimizer, COp

@@ -1790,6 +1792,325 @@ class GpuDnnBatchNormGrad(DnnBase):

    def infer_shape(self, node, shape):
        return [shape[0], shape[2], shape[2]]
gpudata_type = CDataType('gpudata *', 'gpudata_release')
dropoutdesc_type = CDataType('cudnnDropoutDescriptor_t',
'cudnnDestroyDropoutDescriptor')
class GpuDnnDropoutOp(DnnBase):
__props__ = ('inplace',)
def __init__(self, inplace=False):
DnnBase.__init__(self, ["dnn_dropout_fwd.c"], "dnn_dropout_fwd")
self.inplace = inplace
if self.inplace:
self.destroy_map = {1: [2]}
def make_node(self, inp, descriptor, state):
ctx_name = infer_context_name(inp)
inp = as_gpuarray_variable(inp, ctx_name)
return Apply(self, [inp, descriptor, state],
[inp.type(), state.type(), gpudata_type()])
def prepare_node(self, node, storage_map, compute_map):
assert self.inplace, "GpuDnnDropoutOp not inplace"
class _DropoutDescriptor(DnnBase):
__props__ = ('context_name',)
def __init__(self, context_name):
DnnBase.__init__(self, ["dnn_dropout_desc.c"], "dnn_dropout_desc")
self.context_name = context_name
def dnn_context(self, node):
return self.context_name
def do_constant_folding(self, node):
return False
def make_node(self, dropout, seed, context_name):
dropout = as_scalar(dropout).astype('float32')
seed = as_scalar(seed).astype('uint64')
assert context_name == self.context_name
# This is a dirty hack to pass the context because params is
# occupied by the cudnn handle
context = gpu_context_type.make_constant(get_context(context_name))
return Apply(self, [dropout, seed, context],
[dropoutdesc_type(),
GpuArrayType('uint8', (False,),
context_name=context_name)()])
def c_code_cache_version_apply(self, node):
# disable the cache since we can't pickle contexts
return None
def _make_dropout_desc(dropout, seed, context_name):
desc, states = theano.function([], _DropoutDescriptor(context_name)(
dropout, seed, context_name))()
return desc, states
def dropout(x, dropout=0.0, seed=4242):
    desc, states = _make_dropout_desc(dropout, seed, x.type.context_name)
    # The op takes (input, descriptor, state) and returns
    # (output, new state, reserve buffer).
    y, ostates, reserve = GpuDnnDropoutOp()(x, desc, states)
    return y, desc, reserve, states
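A minimal sketch of how this helper can be driven, assuming an initialized gpuarray context and cuDNN 5.x (the variable names are illustrative, not part of the binding):

    import theano.tensor as T
    from theano.gpuarray.dnn import dropout

    x = T.ftensor3('x').transfer(None)  # move the input to the GPU
    y, desc, reserve, states = dropout(x, dropout=0.5, seed=1234)
    # `y` is `x` with ~half of its entries zeroed (the rest rescaled);
    # `desc` and `states` hold the cudnn descriptor and RNG state, and
    # `reserve` is the opaque buffer cudnn needs for the backward pass.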
rnndesc_type = CDataType('cudnnRNNDescriptor_t',
'cudnnDestroyRNNDescriptor')
def as_i32(v):
return as_scalar(v).astype('int32')
class _RNNDescriptor(DnnBase):
__props__ = ('context_name',)
def __init__(self, context_name):
DnnBase.__init__(self, ["dnn_rnn_desc.c"], "dnn_rnn_desc")
self.context_name = context_name
def dnn_context(self, node):
return self.context_name
def do_constant_folding(self, node):
return False
def make_node(self, hidden_size, num_layers, ddesc, input_mode,
direction_mode, rnn_mode, dtype):
hidden_size = as_i32(hidden_size)
num_layers = as_i32(num_layers)
assert 5000 < version() < 5200, "Constants only work for cudnn 5, 5.1"
if input_mode == 'linear':
input_mode = as_i32(0)
elif input_mode == 'skip':
input_mode = as_i32(1)
else:
raise ValueError("input_mode")
if direction_mode == 'unidirectional':
direction_mode = as_i32(0)
elif direction_mode == 'bidirectional':
direction_mode = as_i32(1)
else:
raise ValueError("direction_mode")
if rnn_mode == 'rnn_relu':
rnn_mode = as_i32(0)
elif rnn_mode == 'rnn_tanh':
rnn_mode = as_i32(1)
elif rnn_mode == 'lstm':
rnn_mode = as_i32(2)
elif rnn_mode == 'gru':
rnn_mode = as_i32(3)
else:
raise ValueError("rnn_mode")
dtype = as_i32(gpuarray.dtype_to_typecode(dtype))
return Apply(self, [hidden_size, num_layers,
dropoutdesc_type.make_constant(ddesc),
input_mode, direction_mode, rnn_mode, dtype],
[rnndesc_type()])
def _make_rnn_desc(hidden_size, num_layers, ddesc, rnn_mode,
input_mode, direction_mode, dtype, context_name):
desc = theano.function([], _RNNDescriptor(context_name)(
hidden_size, num_layers, ddesc, input_mode, direction_mode,
rnn_mode, dtype))()
return desc
class _RNNParamSize(DnnBase):
__props__ = ('context_name',)
def __init__(self, context_name):
DnnBase.__init__(self, ["dnn_rnn_paramsize.c"],
"dnn_rnn_paramsize")
self.context_name = context_name
def dnn_context(self, node):
return self.context_name
def do_constant_folding(self, node):
return False
def make_node(self, desc, input_size, typecode):
input_size = as_tensor_variable(input_size).astype('uint64')
typecode = as_i32(typecode)
return Apply(self, [rnndesc_type.make_constant(desc), input_size,
typecode],
[get_scalar_type('uint64')()])
def _get_param_size(desc, input_size, dtype, context_name):
typecode = gpuarray.dtype_to_typecode(dtype)
return theano.function([], _RNNParamSize(context_name)(
desc, input_size, typecode))()
class GpuDnnRNNOp(DnnBase):
__props__ = ()
_cop_numi = 5
_cop_numo = 4
def __init__(self, rnn_mode, direction_mode):
DnnBase.__init__(self, ["dnn_rnn_fwd.c"], 'dnn_rnn_fwd')
self.rnn_mode = rnn_mode
if direction_mode == 'bidirectional':
self.num_dirs = 2
elif direction_mode == 'unidirectional':
self.num_dirs = 1
else:
raise ValueError('direction_mode')
def dnn_context(self, node):
return node.outputs[1].type.context_name
def make_node(self, desc, w, x, hx, cx=None):
        if cx is None:
            context_name = infer_context_name(w, x, hx)
        else:
            context_name = infer_context_name(w, x, hx, cx)
w = as_gpuarray_variable(w, context_name)
x = as_gpuarray_variable(x, context_name)
hx = as_gpuarray_variable(hx, context_name)
inputs = [desc, w, x, hx]
assert w.ndim == 1
assert x.ndim == 3 # seqLength, minibatch, inputSize
assert hx.ndim == 3 # numLayers * bidi, minibatch, hiddenSize
if self.rnn_mode == 'lstm':
cx = as_gpuarray_variable(cx, context_name)
assert cx.ndim == 3 # numLayers * bidi, minibatch, hiddenSize
inputs.append(cx)
_3d = GpuArrayType(dtype=x.dtype, broadcastable=(False, False, False),
context_name=context_name)
reserve = gpudata_type()
y = _3d() # seqLength, minibatch, hiddenSize * bidi
hy = _3d() # numLayers * bidi, miniBatch, hiddenSize
outputs = [reserve, y, hy]
if self.rnn_mode == 'lstm':
cy = _3d() # numLayers * bidi, miniBatch, hiddenSize
outputs.append(cy)
return Apply(self, inputs, outputs)
def grad2(self, inputs, outputs, output_grads):
desc, w, x, hx = inputs[:4]
cx = inputs[4] if len(inputs) == 5 else None
reserve, y, hy = outputs[:3]
_, dy, dhy = output_grads[:3]
dcy = output_grads[3] if len(output_grads) == 4 else None
dinputs = GpuDnnRNNGradInputs()(
desc, x, y, dy, dhy, dcy, w, hx, cx, reserve, return_list=True)
reserve2, dx, dhx = dinputs[:3]
dw = GpuDnnRNNGradWeights()(
desc, x, hx, y, reserve2, w)
res = [DisconnectedType()(), dw, dx, dhx]
if cx is not None:
res.append(dinputs[3]) # dcx
return res
def connection_pattern(self, node):
deconn = [[False] * len(node.outputs)]
conn = [[True] * len(node.outputs)] * (len(node.inputs) - 1)
return deconn + conn
class GpuDnnRNNGradInputs(DnnBase):
__props__ = ()
_cop_numi = 10
_cop_numo = 4
def __init__(self):
DnnBase.__init__(self, ['dnn_rnn_gi.c'], 'dnn_rnn_gi')
def dnn_context(self, node):
return node.outputs[1].type.context_name
def make_node(self, desc, x, y, dy, dhy, dcy, w, hx, cx, reserve):
# We trust the callers here
xshp = as_scalar(x.shape[2]).astype('uint64')
inputs = [desc, xshp, y, dy, dhy, w, hx, reserve]
outputs = [reserve.type(), x.type(), hx.type()]
if dcy is not None:
inputs.append(dcy)
inputs.append(cx)
outputs.append(cx.type())
return Apply(self, inputs, outputs)
class GpuDnnRNNGradWeights(DnnBase):
__props__ = ()
def __init__(self):
DnnBase.__init__(self, ['dnn_rnn_gw.c'], 'dnn_rnn_gw')
def make_node(self, desc, x, hx, y, reserve, w):
# We trust the callers here
wsize = as_scalar(w.shape[0]).astype('uint64')
inputs = [desc, wsize, x, hx, y, reserve]
outputs = [w.type()]
return Apply(self, inputs, outputs)
class RNNBlock(object):
def __init__(self, dtype, hidden_size, num_layers, rnn_mode,
input_mode='linear', direction_mode='unidirectional',
dropout=0.0, dropout_seed=4242, context_name=None):
"""
dtype: data type of computation
hidden_size: int
num_layers: int
rnn_mode: {'rnn_relu', 'rnn_tanh', 'lstm', 'gru'}
See cudnn documentation for cudnnRNNMode_t.
input_mode: {'linear', 'skip'}
linear: input will be multiplied by a biased matrix
skip: No operation is performed on the input. The size must match the hidden size.
direction_mode: {'unidirectional', 'bidirectional'}
unidirectional: The network operates recurrently from the
first input to the last.
bidirectional: The operates from first to last then from last to first and concatenates the results at each layer.
"""
ddesc, states = _make_dropout_desc(dropout, dropout_seed, context_name)
self.ddesc = ddesc
self.dstates = states
self.desc = _make_rnn_desc(hidden_size, num_layers,
ddesc, rnn_mode, input_mode,
direction_mode, dtype, context_name)
self.rnn_mode = rnn_mode
self.direction_mode = direction_mode
self.context_name = context_name
self.dtype = dtype
def get_param_size(self, input_size):
bytesize = _get_param_size(self.desc, input_size, self.dtype,
self.context_name)
bytesize = int(bytesize)
assert bytesize % numpy.dtype(self.dtype).itemsize == 0
return bytesize // numpy.dtype(self.dtype).itemsize
def apply(self, w, x, hx, cx=None):
# Don't return the reserve as an output
return GpuDnnRNNOp(self.rnn_mode, self.direction_mode)(
rnndesc_type.make_constant(self.desc),
w, x, hx, cx, return_list=True)[1:]
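A usage sketch for the whole block, assuming cuDNN 5.x and an initialized gpuarray context; the sizes and variable names below are illustrative only:

    import numpy
    import theano
    import theano.tensor as T
    from theano.gpuarray.dnn import RNNBlock
    from theano.gpuarray.type import gpuarray_shared_constructor

    hidden, layers, batch, isize = 128, 1, 32, 64
    rnnb = RNNBlock('float32', hidden, layers, 'gru')

    # get_param_size returns a count in elements (bytes / itemsize);
    # cudnn expects all the weights in one flat buffer of that size.
    psize = rnnb.get_param_size([batch, isize])
    w = gpuarray_shared_constructor(numpy.zeros(psize, dtype='float32'))

    x = T.ftensor3('x')    # (seqLength, miniBatch, inputSize)
    hx = T.ftensor3('hx')  # (numLayers, miniBatch, hiddenSize)
    y, hy = rnnb.apply(w, x, hx)  # a GRU has no cell state, so no cx/cy
    f = theano.function([x, hx], [y, hy])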
def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
                                  epsilon=1e-4):
...
#section support_code
int dnn_dropout_desc(float dropout, unsigned long long seed,
PyGpuContextObject *c,
cudnnDropoutDescriptor_t *odesc,
PyGpuArrayObject **ostates,
cudnnHandle_t _handle) {
PyGpuArrayObject *states;
cudnnDropoutDescriptor_t desc;
size_t states_sz;
cudnnStatus_t err;
cuda_enter(c->ctx);
err = cudnnCreateDropoutDescriptor(&desc);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_SetString(PyExc_RuntimeError, "Can't create dropout descriptor");
cuda_exit(c->ctx);
return -1;
}
/* Can't fail according to docs */
cudnnDropoutGetStatesSize(_handle, &states_sz);
states = pygpu_empty(1, &states_sz, GA_UBYTE, GA_C_ORDER, c, Py_None);
if (states == NULL) {
cudnnDestroyDropoutDescriptor(desc);
cuda_exit(c->ctx);
return -1;
}
err = cudnnSetDropoutDescriptor(desc, _handle, dropout,
PyGpuArray_DEV_DATA(states),
states_sz, seed);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_SetString(PyExc_RuntimeError, "Can't set dropout descriptor");
Py_DECREF((PyObject *)states);
cudnnDestroyDropoutDescriptor(desc);
cuda_exit(c->ctx);
return -1;
}
cuda_exit(c->ctx);
*odesc = desc;
*ostates = states;
return 0;
}
#section support_code
int dnn_dropout_fwd(PyGpuArrayObject *x,
                    cudnnDropoutDescriptor_t desc,
PyGpuArrayObject *state,
PyGpuArrayObject **y,
PyGpuArrayObject **ostate,
gpudata **reserve,
cudnnHandle_t _handle) {
  PyGpuContextObject *c = x->context;
cudnnTensorDescriptor_t xdesc;
cudnnTensorDescriptor_t ydesc;
gpudata *res;
size_t res_sz;
cudnnStatus_t err;
if (c_make_tensorNd(x, &xdesc))
return -1;
if (theano_prep_output(y, x->ga.nd, x->ga.dimensions, x->ga.typecode,
GA_C_ORDER, c)) {
cudnnDestroyTensorDescriptor(xdesc);
return -1;
}
  if (c_make_tensorNd(*y, &ydesc)) {
cudnnDestroyTensorDescriptor(xdesc);
return -1;
}
*ostate = state;
Py_INCREF((PyObject *)state);
/* This can't fail according to the docs */
  err = cudnnDropoutGetReserveSpaceSize(xdesc, &res_sz);
  res = gpudata_alloc(c->ctx, res_sz, NULL, 0, NULL);
  if (res == NULL) {
    cudnnDestroyTensorDescriptor(xdesc);
    cudnnDestroyTensorDescriptor(ydesc);
    PyErr_SetString(PyExc_RuntimeError, "Could not allocate reserve for dropout");
    return -1;
  }
*reserve = res;
cuda_enter(c->ctx);
  err = cudnnDropoutForward(_handle, desc, xdesc, PyGpuArray_DEV_DATA(x),
                            ydesc, PyGpuArray_DEV_DATA(*y), *(void **)res,
                            res_sz);
cudnnDestroyTensorDescriptor(xdesc);
cudnnDestroyTensorDescriptor(ydesc);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not run dropout: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return -1;
}
cuda_exit(c->ctx);
return 0;
}
#section support_code
int dnn_rnn_desc(int hidden_size, int num_layers,
cudnnDropoutDescriptor_t ddesc,
int input_mode, int direction_mode, int rnn_mode,
int dtype, cudnnRNNDescriptor_t *odesc,
cudnnHandle_t _handle) {
cudnnRNNDescriptor_t desc;
cudnnDataType_t data_type;
cudnnStatus_t err;
switch (dtype) {
case GA_FLOAT:
data_type = CUDNN_DATA_FLOAT;
break;
case GA_DOUBLE:
data_type = CUDNN_DATA_DOUBLE;
break;
case GA_HALF:
data_type = CUDNN_DATA_HALF;
break;
default:
PyErr_SetString(PyExc_ValueError, "Unsupported data type");
return -1;
}
err = cudnnCreateRNNDescriptor(&desc);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_SetString(PyExc_RuntimeError, "Can't create RNN descriptor");
return -1;
}
err = cudnnSetRNNDescriptor(desc, hidden_size, num_layers, ddesc,
(cudnnRNNInputMode_t)input_mode,
(cudnnDirectionMode_t)direction_mode,
(cudnnRNNMode_t)rnn_mode, data_type);
if (err != CUDNN_STATUS_SUCCESS) {
cudnnDestroyRNNDescriptor(desc);
PyErr_SetString(PyExc_RuntimeError, "Can't set RNN descriptor");
return -1;
}
*odesc = desc;
return 0;
}
#section support_code
int dnn_rnn_fwd(cudnnRNNDescriptor_t desc,
PyGpuArrayObject *w, PyGpuArrayObject *x,
PyGpuArrayObject *hx, PyGpuArrayObject *cx,
gpudata **reserve, PyGpuArrayObject **y,
PyGpuArrayObject **hy, PyGpuArrayObject **cy,
cudnnHandle_t _handle) {
PyGpuContextObject *c = x->context;
cudnnTensorDescriptor_t xdesc = NULL;
cudnnTensorDescriptor_t hxdesc = NULL;
cudnnTensorDescriptor_t cxdesc = NULL;
cudnnTensorDescriptor_t ydesc = NULL;
cudnnTensorDescriptor_t hydesc = NULL;
cudnnTensorDescriptor_t cydesc = NULL;
cudnnFilterDescriptor_t wdesc = NULL;
cudnnTensorDescriptor_t *xl = NULL;
cudnnTensorDescriptor_t *yl = NULL;
gpudata *workspace = NULL;
size_t worksize, ressize;
size_t seqLength = PyGpuArray_DIM(x, 0);
size_t miniBatch = PyGpuArray_DIM(x, 1);
size_t inputSize = PyGpuArray_DIM(x, 2);
size_t hiddenSizeDir = PyGpuArray_DIM(hx, 2);
size_t shape[3];
int strs[3], dims[3];
cudnnStatus_t err;
cudnnDataType_t dt;
int res = -1;
switch (x->ga.typecode) {
case GA_FLOAT:
dt = CUDNN_DATA_FLOAT;
break;
case GA_DOUBLE:
dt = CUDNN_DATA_DOUBLE;
break;
case GA_HALF:
dt = CUDNN_DATA_HALF;
break;
default:
PyErr_SetString(PyExc_TypeError, "Unsupported data type for x");
return -1;
}
// This is early to match the exit() in the fail label.
cuda_enter(c->ctx);
err = cudnnCreateTensorDescriptor(&xdesc);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not create xdesc: %s",
cudnnGetErrorString(err));
goto fail;
}
dims[0] = PyGpuArray_DIM(x, 1);
dims[1] = PyGpuArray_DIM(x, 2);
dims[2] = 1;
strs[0] = dims[1] * dims[2];
strs[1] = dims[2];
strs[2] = 1;
err = cudnnSetTensorNdDescriptor(xdesc, dt, 3, dims, strs);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not set xdesc: %s",
cudnnGetErrorString(err));
goto fail;
}
if (c_make_tensorNd(hx, &hxdesc) != 0)
goto fail;
if (cx != NULL)
if (c_make_tensorNd(cx, &cxdesc) != 0)
goto fail;
if (c_make_filter(w, &wdesc) != 0)
goto fail;
shape[0] = seqLength;
shape[1] = miniBatch;
shape[2] = hiddenSizeDir;
if (theano_prep_output(y, 3, shape, x->ga.typecode, GA_C_ORDER, c) != 0)
goto fail;
err = cudnnCreateTensorDescriptor(&ydesc);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not create ydesc: %s",
cudnnGetErrorString(err));
goto fail;
}
dims[0] = shape[1];
dims[1] = shape[2];
dims[2] = 1;
strs[0] = dims[2] * dims[1];
strs[1] = dims[2];
strs[2] = 1;
err = cudnnSetTensorNdDescriptor(ydesc, dt, 3, dims, strs);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not set ydesc: %s",
cudnnGetErrorString(err));
goto fail;
}
if (theano_prep_output(hy, 3, PyGpuArray_DIMS(hx),
hx->ga.typecode, GA_C_ORDER, c) != 0)
goto fail;
if (c_make_tensorNd(*hy, &hydesc) != 0)
goto fail;
if (cy != NULL) {
if (theano_prep_output(cy, 3, PyGpuArray_DIMS(cx),
cx->ga.typecode, GA_C_ORDER, c) != 0)
goto fail;
if (c_make_tensorNd(*cy, &cydesc) != 0)
goto fail;
}
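  /* cudnn wants one tensor descriptor per time step; every step has the
     same shape here, so the arrays below reuse a single descriptor. */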
  xl = (cudnnTensorDescriptor_t *)calloc(seqLength, sizeof(cudnnTensorDescriptor_t));
if (xl == NULL) {
PyErr_NoMemory();
goto fail;
}
for (size_t i = 0; i < seqLength; i++)
xl[i] = xdesc;
  yl = (cudnnTensorDescriptor_t *)calloc(seqLength, sizeof(cudnnTensorDescriptor_t));
if (yl == NULL) {
PyErr_NoMemory();
goto fail;
}
for (size_t i = 0; i < seqLength; i++)
yl[i] = ydesc;
err = cudnnGetRNNWorkspaceSize(_handle, desc, (int)seqLength,
xl, &worksize);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not get worksize: %s",
cudnnGetErrorString(err));
goto fail;
}
workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) {
PyErr_Format(PyExc_RuntimeError, "Could not allocate workspace");
goto fail;
}
err = cudnnGetRNNTrainingReserveSize(_handle, desc, (int)seqLength,
xl, &ressize);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not get reserve size: %s",
cudnnGetErrorString(err));
goto fail;
}
*reserve = gpudata_alloc(c->ctx, ressize, NULL, 0, NULL);
if (*reserve == NULL) {
PyErr_Format(PyExc_RuntimeError, "Could not allocate reserve");
goto fail;
}
err = cudnnRNNForwardTraining(_handle, desc, (int)seqLength,
xl, PyGpuArray_DEV_DATA(x),
hxdesc, PyGpuArray_DEV_DATA(hx),
cxdesc, cx ? PyGpuArray_DEV_DATA(cx) : NULL,
wdesc, PyGpuArray_DEV_DATA(w),
yl, PyGpuArray_DEV_DATA(*y),
hydesc, PyGpuArray_DEV_DATA(*hy),
cydesc, cy ? PyGpuArray_DEV_DATA(*cy) : NULL,
*(void **)workspace, worksize,
*(void **)(*reserve), ressize);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could run RNN: %s",
cudnnGetErrorString(err));
goto fail;
}
res = 0;
fail:
if (xdesc != NULL)
cudnnDestroyTensorDescriptor(xdesc);
if (hxdesc != NULL)
cudnnDestroyTensorDescriptor(hxdesc);
if (cxdesc != NULL)
cudnnDestroyTensorDescriptor(cxdesc);
if (wdesc != NULL)
cudnnDestroyFilterDescriptor(wdesc);
if (ydesc != NULL)
cudnnDestroyTensorDescriptor(ydesc);
if (hydesc != NULL)
cudnnDestroyTensorDescriptor(hydesc);
if (cydesc != NULL)
cudnnDestroyTensorDescriptor(cydesc);
free(xl);
free(yl);
if (workspace != NULL)
gpudata_release(workspace);
cuda_exit(c->ctx);
return res;
}
#section support_code
int dnn_rnn_gi(cudnnRNNDescriptor_t desc, npy_uint64 xshp,
PyGpuArrayObject *y, PyGpuArrayObject *dy,
PyGpuArrayObject *dhy, PyGpuArrayObject *w,
PyGpuArrayObject *hx, gpudata *reserve, PyGpuArrayObject *dcy,
PyGpuArrayObject *cx, gpudata **oreserve,
PyGpuArrayObject **dx, PyGpuArrayObject **dhx,
PyGpuArrayObject **dcx, cudnnHandle_t _handle) {
PyGpuContextObject *c = y->context;
cudnnTensorDescriptor_t ydesc = NULL;
cudnnTensorDescriptor_t dhydesc = NULL;
cudnnTensorDescriptor_t dcydesc = NULL;
cudnnFilterDescriptor_t wdesc = NULL;
cudnnTensorDescriptor_t hxdesc = NULL;
cudnnTensorDescriptor_t cxdesc = NULL;
cudnnTensorDescriptor_t dxdesc = NULL;
cudnnTensorDescriptor_t dhxdesc = NULL;
cudnnTensorDescriptor_t dcxdesc = NULL;
cudnnTensorDescriptor_t *yl = NULL;
cudnnTensorDescriptor_t *dxl = NULL;
gpudata *workspace = NULL;
size_t worksize, ressize;
size_t seqLength = PyGpuArray_DIM(y, 0);
size_t miniBatch = PyGpuArray_DIM(y, 1);
size_t inputSize = xshp;
size_t shape[3];
int dims[3], strs[3];
cudnnStatus_t err;
cudnnDataType_t dt;
int res = -1;
switch (y->ga.typecode) {
case GA_FLOAT:
dt = CUDNN_DATA_FLOAT;
break;
case GA_DOUBLE:
dt = CUDNN_DATA_DOUBLE;
break;
case GA_HALF:
dt = CUDNN_DATA_HALF;
break;
default:
PyErr_SetString(PyExc_TypeError, "Unsupported data type for y");
return -1;
}
cuda_enter(c->ctx);
err = cudnnCreateTensorDescriptor(&ydesc);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not create ydesc: %s",
cudnnGetErrorString(err));
goto fail;
}
/* We need to use the last two dimensions for this, this is not a typo */
dims[0] = PyGpuArray_DIM(y, 1);
dims[1] = PyGpuArray_DIM(y, 2);
dims[2] = 1;
strs[0] = dims[2] * dims[1];
strs[1] = dims[2];
strs[2] = 1;
err = cudnnSetTensorNdDescriptor(ydesc, dt, 3, dims, strs);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not set ydesc: %s",
cudnnGetErrorString(err));
goto fail;
}
if (c_make_tensorNd(dhy, &dhydesc) != 0)
goto fail;
if (dcy != NULL)
if (c_make_tensorNd(dcy, &dcydesc) != 0)
goto fail;
if (c_make_filter(w, &wdesc) != 0)
goto fail;
if (c_make_tensorNd(hx, &hxdesc) != 0)
goto fail;
if (cx != NULL)
if (c_make_tensorNd(cx, &cxdesc) != 0)
goto fail;
shape[0] = seqLength;
shape[1] = miniBatch;
shape[2] = inputSize;
if (theano_prep_output(dx, 3, shape, y->ga.typecode, GA_C_ORDER, c) != 0)
goto fail;
err = cudnnCreateTensorDescriptor(&dxdesc);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not create dxdesc: %s",
cudnnGetErrorString(err));
goto fail;
}
/* Again not a typo, we need to use the last two dimensions */
dims[0] = shape[1];
dims[1] = shape[2];
dims[2] = 1;
strs[0] = dims[2] * dims[1];
strs[1] = dims[2];
strs[2] = 1;
err = cudnnSetTensorNdDescriptor(dxdesc, dt, 3, dims, strs);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not set dxdesc: %s",
cudnnGetErrorString(err));
goto fail;
}
if (theano_prep_output(dhx, 3, PyGpuArray_DIMS(hx), hx->ga.typecode,
GA_C_ORDER, c) != 0)
goto fail;
if (c_make_tensorNd(*dhx, &dhxdesc) != 0)
goto fail;
if (cx != NULL) {
if (theano_prep_output(dcx, 3, PyGpuArray_DIMS(cx), cx->ga.typecode,
GA_C_ORDER, c) != 0)
goto fail;
if (c_make_tensorNd(*dcx, &dcxdesc) != 0)
goto fail;
}
  yl = (cudnnTensorDescriptor_t *)calloc(seqLength, sizeof(cudnnTensorDescriptor_t));
if (yl == NULL) {
PyErr_NoMemory();
goto fail;
}
for (size_t i = 0; i < seqLength; i++)
yl[i] = ydesc;
  dxl = (cudnnTensorDescriptor_t *)calloc(seqLength, sizeof(cudnnTensorDescriptor_t));
if (dxl == NULL) {
PyErr_NoMemory();
goto fail;
}
for (size_t i = 0; i < seqLength; i++)
dxl[i] = dxdesc;
err = cudnnGetRNNWorkspaceSize(_handle, desc, (int)seqLength, dxl, &worksize);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not get worksize: %s",
cudnnGetErrorString(err));
goto fail;
}
workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) {
PyErr_Format(PyExc_RuntimeError, "Could not allocate workspace");
goto fail;
}
err = cudnnGetRNNTrainingReserveSize(_handle, desc, (int)seqLength,
dxl, &ressize);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not get reserve size: %s",
cudnnGetErrorString(err));
goto fail;
}
*oreserve = gpudata_alloc(c->ctx, ressize, NULL, 0, NULL);
if (*oreserve == NULL) {
PyErr_Format(PyExc_RuntimeError, "Could not allocate reserve");
goto fail;
}
if (gpudata_move(*oreserve, 0, reserve, 0, ressize) != GA_NO_ERROR) {
PyErr_SetString(PyExc_RuntimeError, "could not copy reserve");
goto fail;
}
err = cudnnRNNBackwardData(_handle, desc, (int)seqLength,
yl, PyGpuArray_DEV_DATA(y),
/* y and dy are the same shape */
yl, PyGpuArray_DEV_DATA(dy),
dhydesc, PyGpuArray_DEV_DATA(dhy),
dcydesc, dcy ? PyGpuArray_DEV_DATA(dcy) : NULL,
wdesc, PyGpuArray_DEV_DATA(w),
hxdesc, PyGpuArray_DEV_DATA(hx),
cxdesc, cx ? PyGpuArray_DEV_DATA(cx) : NULL,
dxl, PyGpuArray_DEV_DATA(*dx),
dhxdesc, PyGpuArray_DEV_DATA(*dhx),
dcxdesc, dcx ? PyGpuArray_DEV_DATA(*dcx) : NULL,
*(void **)workspace, worksize,
*(void **)(*oreserve), ressize);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could run RNN grad inputs: %s",
cudnnGetErrorString(err));
goto fail;
}
res = 0;
fail:
if (ydesc != NULL)
cudnnDestroyTensorDescriptor(ydesc);
if (dhydesc != NULL)
cudnnDestroyTensorDescriptor(dhydesc);
if (dcydesc != NULL)
cudnnDestroyTensorDescriptor(dcydesc);
if (wdesc != NULL)
cudnnDestroyFilterDescriptor(wdesc);
if (hxdesc != NULL)
cudnnDestroyTensorDescriptor(hxdesc);
if (cxdesc != NULL)
cudnnDestroyTensorDescriptor(cxdesc);
if (dxdesc != NULL)
cudnnDestroyTensorDescriptor(dxdesc);
if (dhxdesc != NULL)
cudnnDestroyTensorDescriptor(dhxdesc);
if (dcxdesc != NULL)
cudnnDestroyTensorDescriptor(dcxdesc);
free(yl);
free(dxl);
if (workspace != NULL)
gpudata_release(workspace);
cuda_exit(c->ctx);
return res;
}
#section support_code
int dnn_rnn_gw(cudnnRNNDescriptor_t desc, npy_uint64 _wsize,
PyGpuArrayObject *x, PyGpuArrayObject *hx,
PyGpuArrayObject *y, gpudata *reserve,
PyGpuArrayObject **dw, cudnnHandle_t _handle) {
PyGpuContextObject *c = x->context;
cudnnTensorDescriptor_t xdesc = NULL;
cudnnTensorDescriptor_t hxdesc = NULL;
cudnnTensorDescriptor_t ydesc = NULL;
cudnnFilterDescriptor_t dwdesc = NULL;
cudnnTensorDescriptor_t *xl = NULL;
cudnnTensorDescriptor_t *yl = NULL;
gpudata *workspace = NULL;
size_t worksize, ressize;
size_t iters = PyGpuArray_DIM(x, 0);
size_t wsize = _wsize;
int dims[3], strs[3];
cudnnStatus_t err;
cudnnDataType_t dt;
int res = -1;
switch (x->ga.typecode) {
case GA_FLOAT:
dt = CUDNN_DATA_FLOAT;
break;
case GA_DOUBLE:
dt = CUDNN_DATA_DOUBLE;
break;
case GA_HALF:
dt = CUDNN_DATA_HALF;
break;
default:
PyErr_SetString(PyExc_TypeError, "Unsupported data type for x");
return -1;
}
// This is early to match the exit() in the fail label.
cuda_enter(c->ctx);
err = cudnnCreateTensorDescriptor(&xdesc);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not create xdesc: %s",
cudnnGetErrorString(err));
goto fail;
}
/* We need to use the last two dimensions for this, this is not a typo */
dims[0] = PyGpuArray_DIM(x, 1);
dims[1] = PyGpuArray_DIM(x, 2);
dims[2] = 1;
strs[0] = dims[2] * dims[1];
strs[1] = dims[2];
strs[2] = 1;
err = cudnnSetTensorNdDescriptor(xdesc, dt, 3, dims, strs);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not set xdesc: %s",
cudnnGetErrorString(err));
goto fail;
}
if (c_make_tensorNd(hx, &hxdesc) != 0)
goto fail;
err = cudnnCreateTensorDescriptor(&ydesc);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not create ydesc: %s",
cudnnGetErrorString(err));
goto fail;
}
/* Again not a typo, we need to use the last two dimensions */
dims[0] = PyGpuArray_DIM(y, 1);
dims[1] = PyGpuArray_DIM(y, 2);
dims[2] = 1;
strs[0] = dims[2] * dims[1];
strs[1] = dims[2];
strs[2] = 1;
err = cudnnSetTensorNdDescriptor(ydesc, dt, 3, dims, strs);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not set ydesc: %s",
cudnnGetErrorString(err));
goto fail;
}
if (theano_prep_output(dw, 1, &wsize, x->ga.typecode, GA_C_ORDER, c) != 0)
goto fail;
if (c_make_filter(*dw, &dwdesc) != 0)
goto fail;
  xl = (cudnnTensorDescriptor_t *)calloc(iters, sizeof(cudnnTensorDescriptor_t));
if (xl == NULL) {
PyErr_NoMemory();
goto fail;
}
for (size_t i = 0; i < iters; i++)
xl[i] = xdesc;
  yl = (cudnnTensorDescriptor_t *)calloc(iters, sizeof(cudnnTensorDescriptor_t));
if (yl == NULL) {
PyErr_NoMemory();
goto fail;
}
for (size_t i = 0; i < iters; i++)
yl[i] = ydesc;
err = cudnnGetRNNWorkspaceSize(_handle, desc, (int)iters,
xl, &worksize);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not get worksize: %s",
cudnnGetErrorString(err));
goto fail;
}
workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) {
PyErr_Format(PyExc_RuntimeError, "Could not allocate workspace");
goto fail;
}
err = cudnnGetRNNTrainingReserveSize(_handle, desc, (int)iters,
xl, &ressize);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not get reserve size: %s",
cudnnGetErrorString(err));
goto fail;
}
err = cudnnRNNBackwardWeights(_handle, desc, (int)iters,
xl, PyGpuArray_DEV_DATA(x),
hxdesc, PyGpuArray_DEV_DATA(hx),
yl, PyGpuArray_DEV_DATA(y),
*(void **)workspace, worksize,
dwdesc, PyGpuArray_DEV_DATA(*dw),
*(void **)reserve, ressize);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could run RNN grad weights: %s",
cudnnGetErrorString(err));
goto fail;
}
res = 0;
fail:
if (xdesc != NULL)
cudnnDestroyTensorDescriptor(xdesc);
if (hxdesc != NULL)
cudnnDestroyTensorDescriptor(hxdesc);
if (ydesc != NULL)
cudnnDestroyTensorDescriptor(ydesc);
if (dwdesc != NULL)
cudnnDestroyFilterDescriptor(dwdesc);
free(xl);
free(yl);
if (workspace != NULL)
gpudata_release(workspace);
cuda_exit(c->ctx);
return res;
}
#section support_code
int dnn_rnn_paramsize(cudnnRNNDescriptor_t desc,
PyArrayObject *isize,
npy_int32 typecode,
npy_uint64 *oparam_size,
cudnnHandle_t _handle) {
cudnnTensorDescriptor_t xdesc;
size_t param_size;
cudnnStatus_t err;
cudnnDataType_t dt;
int shape[3];
int strides[3];
if (PyArray_DIM(isize, 0) != 2) {
PyErr_SetString(PyExc_ValueError, "input_size should be of length two");
return -1;
}
switch (typecode) {
case GA_FLOAT:
dt = CUDNN_DATA_FLOAT;
break;
case GA_DOUBLE:
dt = CUDNN_DATA_DOUBLE;
break;
case GA_HALF:
dt = CUDNN_DATA_HALF;
break;
default:
PyErr_SetString(PyExc_ValueError, "Unsupported data type");
return -1;
}
err = cudnnCreateTensorDescriptor(&xdesc);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_SetString(PyExc_RuntimeError, "Could not create tensor descriptor");
return -1;
}
shape[0] = *(npy_uint64 *)PyArray_GETPTR1(isize, 0);
shape[1] = *(npy_uint64 *)PyArray_GETPTR1(isize, 1);
shape[2] = 1;
strides[0] = shape[2] * shape[1];
strides[1] = shape[2];
strides[2] = 1;
  err = cudnnSetTensorNdDescriptor(xdesc, dt, 3, shape, strides);
  if (err != CUDNN_STATUS_SUCCESS) {
    cudnnDestroyTensorDescriptor(xdesc);
    PyErr_Format(PyExc_RuntimeError, "Could not set tensor descriptor: %s",
                 cudnnGetErrorString(err));
    return -1;
  }
  err = cudnnGetRNNParamsSize(_handle, desc, xdesc, &param_size, dt);
  if (err != CUDNN_STATUS_SUCCESS) {
    cudnnDestroyTensorDescriptor(xdesc);
    PyErr_SetString(PyExc_RuntimeError, "Could not get parameter size");
    return -1;
  }
cudnnDestroyTensorDescriptor(xdesc);
*oparam_size = param_size;
return 0;
}