Commit 77cc5729 authored by Arnaud Bergeron

Add ops and support code for the RNN binding for cudnn.

Parent 4b94a811
Diff is collapsed.
#section support_code
/* Creates a cudnn dropout descriptor together with its GPU RNG state
 * buffer.
 *
 * On success, *odesc receives the configured descriptor and *ostates
 * a new 1-D GA_UBYTE array holding the dropout states.  Returns 0 on
 * success, -1 on error with a Python exception set.
 */
int dnn_dropout_desc(float dropout, unsigned long long seed,
                     PyGpuContextObject *c,
                     cudnnDropoutDescriptor_t *odesc,
                     PyGpuArrayObject **ostates,
                     cudnnHandle_t _handle) {
  cudnnDropoutDescriptor_t ddesc;
  PyGpuArrayObject *sbuf;
  size_t nbytes;
  cudnnStatus_t status;

  cuda_enter(c->ctx);
  status = cudnnCreateDropoutDescriptor(&ddesc);
  if (status != CUDNN_STATUS_SUCCESS) {
    PyErr_SetString(PyExc_RuntimeError, "Can't create dropout descriptor");
    cuda_exit(c->ctx);
    return -1;
  }

  /* Can't fail according to docs */
  cudnnDropoutGetStatesSize(_handle, &nbytes);

  /* The state buffer lives on the GPU; cudnn fills it from the seed. */
  sbuf = pygpu_empty(1, &nbytes, GA_UBYTE, GA_C_ORDER, c, Py_None);
  if (sbuf == NULL) {
    cudnnDestroyDropoutDescriptor(ddesc);
    cuda_exit(c->ctx);
    return -1;
  }

  status = cudnnSetDropoutDescriptor(ddesc, _handle, dropout,
                                     PyGpuArray_DEV_DATA(sbuf),
                                     nbytes, seed);
  if (status != CUDNN_STATUS_SUCCESS) {
    PyErr_SetString(PyExc_RuntimeError, "Can't set dropout descriptor");
    Py_DECREF((PyObject *)sbuf);
    cudnnDestroyDropoutDescriptor(ddesc);
    cuda_exit(c->ctx);
    return -1;
  }
  cuda_exit(c->ctx);

  *odesc = ddesc;
  *ostates = sbuf;
  return 0;
}
#section support_code
/* Applies cudnn dropout forward to x.
 *
 * On success, *y receives the dropped-out output (same shape/type as
 * x), *ostate a new reference to the RNG state array and *reserve the
 * reserve space needed for the backward pass.  Returns 0 on success,
 * -1 on error with a Python exception set.
 */
int dnn_dropout_fwd(PyGpuArrayObject *x,
                    cudnnDropoutDescriptor_t *desc,
                    PyGpuArrayObject *state,
                    PyGpuArrayObject **y,
                    PyGpuArrayObject **ostate,
                    gpudata **reserve,
                    cudnnHandle_t _handle) {
  /* was PyGpuArrayContext: use the same context type as every other
     helper in this file. */
  PyGpuContextObject *c = x->context;
  cudnnTensorDescriptor_t xdesc;
  cudnnTensorDescriptor_t ydesc;
  gpudata *res;
  size_t res_sz;
  cudnnStatus_t err;

  if (c_make_tensorNd(x, &xdesc))
    return -1;
  if (theano_prep_output(y, x->ga.nd, x->ga.dimensions, x->ga.typecode,
                         GA_C_ORDER, c)) {
    cudnnDestroyTensorDescriptor(xdesc);
    return -1;
  }
  /* y is an output pointer; the array itself is *y (was passed as y). */
  if (c_make_tensorNd(*y, &ydesc)) {
    cudnnDestroyTensorDescriptor(xdesc);
    return -1;
  }

  /* This can't fail according to the docs.  The reserve size is a
     function of the input tensor descriptor, not of the dropout
     descriptor. */
  cudnnDropoutGetReserveSpaceSize(xdesc, &res_sz);
  res = gpudata_alloc(c->ctx, res_sz, NULL, 0, NULL);
  if (res == NULL) {
    cudnnDestroyTensorDescriptor(xdesc);
    cudnnDestroyTensorDescriptor(ydesc);
    PyErr_SetString(PyExc_RuntimeError, "Could not allocate reserve for dropout");
    /* was missing: fell through to the forward call with res == NULL */
    return -1;
  }

  cuda_enter(c->ctx);
  err = cudnnDropoutForward(_handle, *desc, xdesc, PyGpuArray_DEV_DATA(x),
                            ydesc, PyGpuArray_DEV_DATA(*y), *(void **)res,
                            res_sz);
  cudnnDestroyTensorDescriptor(xdesc);
  cudnnDestroyTensorDescriptor(ydesc);
  cuda_exit(c->ctx);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not run dropout: %s",
                 cudnnGetErrorString(err));
    gpudata_release(res);  /* don't leak the reserve on failure */
    return -1;
  }

  /* Only publish the outputs (and take the state reference) once
     everything has succeeded, so failure paths don't leak. */
  *reserve = res;
  *ostate = state;
  Py_INCREF((PyObject *)state);
  return 0;
}
#section support_code
/* Creates and configures a cudnn RNN descriptor.
 *
 * dtype is a gpuarray typecode (GA_FLOAT/GA_DOUBLE/GA_HALF); the mode
 * arguments are the raw cudnn enum values.  On success, *odesc
 * receives the descriptor.  Returns 0 on success, -1 on error with a
 * Python exception set.
 */
int dnn_rnn_desc(int hidden_size, int num_layers,
                 cudnnDropoutDescriptor_t ddesc,
                 int input_mode, int direction_mode, int rnn_mode,
                 int dtype, cudnnRNNDescriptor_t *odesc,
                 cudnnHandle_t _handle) {
  cudnnRNNDescriptor_t rdesc;
  cudnnDataType_t compute_type;
  cudnnStatus_t status;

  /* Map the gpuarray typecode onto the cudnn data type. */
  if (dtype == GA_FLOAT) {
    compute_type = CUDNN_DATA_FLOAT;
  } else if (dtype == GA_DOUBLE) {
    compute_type = CUDNN_DATA_DOUBLE;
  } else if (dtype == GA_HALF) {
    compute_type = CUDNN_DATA_HALF;
  } else {
    PyErr_SetString(PyExc_ValueError, "Unsupported data type");
    return -1;
  }

  status = cudnnCreateRNNDescriptor(&rdesc);
  if (status != CUDNN_STATUS_SUCCESS) {
    PyErr_SetString(PyExc_RuntimeError, "Can't create RNN descriptor");
    return -1;
  }

  status = cudnnSetRNNDescriptor(rdesc, hidden_size, num_layers, ddesc,
                                 (cudnnRNNInputMode_t)input_mode,
                                 (cudnnDirectionMode_t)direction_mode,
                                 (cudnnRNNMode_t)rnn_mode, compute_type);
  if (status != CUDNN_STATUS_SUCCESS) {
    cudnnDestroyRNNDescriptor(rdesc);
    PyErr_SetString(PyExc_RuntimeError, "Can't set RNN descriptor");
    return -1;
  }

  *odesc = rdesc;
  return 0;
}
#section support_code
/* Runs the cudnn RNN forward pass in training mode.
 *
 * x is (seqLength, miniBatch, inputSize); hx is the initial hidden
 * state whose last dimension gives hiddenSizeDir; cx/cy may be NULL
 * for modes without a cell state.  Outputs: *y (seqLength, miniBatch,
 * hiddenSizeDir), *hy, *cy (when cy != NULL) and *reserve, the
 * reserve space that must be kept alive for the backward passes.
 * Returns 0 on success, -1 on error with a Python exception set.
 */
int dnn_rnn_fwd(cudnnRNNDescriptor_t desc,
                PyGpuArrayObject *w, PyGpuArrayObject *x,
                PyGpuArrayObject *hx, PyGpuArrayObject *cx,
                gpudata **reserve, PyGpuArrayObject **y,
                PyGpuArrayObject **hy, PyGpuArrayObject **cy,
                cudnnHandle_t _handle) {
  PyGpuContextObject *c = x->context;
  cudnnTensorDescriptor_t xdesc = NULL;
  cudnnTensorDescriptor_t hxdesc = NULL;
  cudnnTensorDescriptor_t cxdesc = NULL;
  cudnnTensorDescriptor_t ydesc = NULL;
  cudnnTensorDescriptor_t hydesc = NULL;
  cudnnTensorDescriptor_t cydesc = NULL;
  cudnnFilterDescriptor_t wdesc = NULL;
  cudnnTensorDescriptor_t *xl = NULL;
  cudnnTensorDescriptor_t *yl = NULL;
  gpudata *workspace = NULL;
  gpudata *resv = NULL;  /* published to *reserve only on success */
  size_t worksize, ressize;
  size_t seqLength = PyGpuArray_DIM(x, 0);
  size_t miniBatch = PyGpuArray_DIM(x, 1);
  size_t hiddenSizeDir = PyGpuArray_DIM(hx, 2);
  size_t shape[3];
  int strs[3], dims[3];
  cudnnStatus_t err;
  cudnnDataType_t dt;
  int res = -1;
  switch (x->ga.typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Unsupported data type for x");
    return -1;
  }
  // This is early to match the exit() in the fail label.
  cuda_enter(c->ctx);
  err = cudnnCreateTensorDescriptor(&xdesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create xdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }
  /* Per-timestep descriptor built from the last two dimensions of x:
     (miniBatch, inputSize, 1), fully packed. */
  dims[0] = PyGpuArray_DIM(x, 1);
  dims[1] = PyGpuArray_DIM(x, 2);
  dims[2] = 1;
  strs[0] = dims[1] * dims[2];
  strs[1] = dims[2];
  strs[2] = 1;
  err = cudnnSetTensorNdDescriptor(xdesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set xdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }
  if (c_make_tensorNd(hx, &hxdesc) != 0)
    goto fail;
  if (cx != NULL)
    if (c_make_tensorNd(cx, &cxdesc) != 0)
      goto fail;
  if (c_make_filter(w, &wdesc) != 0)
    goto fail;
  shape[0] = seqLength;
  shape[1] = miniBatch;
  shape[2] = hiddenSizeDir;
  if (theano_prep_output(y, 3, shape, x->ga.typecode, GA_C_ORDER, c) != 0)
    goto fail;
  err = cudnnCreateTensorDescriptor(&ydesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }
  /* Same trick as xdesc: per-timestep (miniBatch, hiddenSizeDir, 1). */
  dims[0] = shape[1];
  dims[1] = shape[2];
  dims[2] = 1;
  strs[0] = dims[2] * dims[1];
  strs[1] = dims[2];
  strs[2] = 1;
  err = cudnnSetTensorNdDescriptor(ydesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }
  if (theano_prep_output(hy, 3, PyGpuArray_DIMS(hx),
                         hx->ga.typecode, GA_C_ORDER, c) != 0)
    goto fail;
  if (c_make_tensorNd(*hy, &hydesc) != 0)
    goto fail;
  if (cy != NULL) {
    if (theano_prep_output(cy, 3, PyGpuArray_DIMS(cx),
                           cx->ga.typecode, GA_C_ORDER, c) != 0)
      goto fail;
    if (c_make_tensorNd(*cy, &cydesc) != 0)
      goto fail;
  }
  /* cudnn wants one descriptor per timestep; every step has the same
     shape so they all alias xdesc/ydesc. */
  xl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), seqLength);
  if (xl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }
  for (size_t i = 0; i < seqLength; i++)
    xl[i] = xdesc;
  yl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), seqLength);
  if (yl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }
  for (size_t i = 0; i < seqLength; i++)
    yl[i] = ydesc;
  err = cudnnGetRNNWorkspaceSize(_handle, desc, (int)seqLength,
                                 xl, &worksize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not get worksize: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }
  workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
  if (workspace == NULL) {
    PyErr_Format(PyExc_RuntimeError, "Could not allocate workspace");
    goto fail;
  }
  err = cudnnGetRNNTrainingReserveSize(_handle, desc, (int)seqLength,
                                       xl, &ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not get reserve size: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }
  resv = gpudata_alloc(c->ctx, ressize, NULL, 0, NULL);
  if (resv == NULL) {
    PyErr_Format(PyExc_RuntimeError, "Could not allocate reserve");
    goto fail;
  }
  err = cudnnRNNForwardTraining(_handle, desc, (int)seqLength,
                                xl, PyGpuArray_DEV_DATA(x),
                                hxdesc, PyGpuArray_DEV_DATA(hx),
                                cxdesc, cx ? PyGpuArray_DEV_DATA(cx) : NULL,
                                wdesc, PyGpuArray_DEV_DATA(w),
                                yl, PyGpuArray_DEV_DATA(*y),
                                hydesc, PyGpuArray_DEV_DATA(*hy),
                                cydesc, cy ? PyGpuArray_DEV_DATA(*cy) : NULL,
                                *(void **)workspace, worksize,
                                *(void **)resv, ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not run RNN: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }
  /* Hand the reserve over to the caller only on success so the fail
     path below can release it otherwise. */
  *reserve = resv;
  resv = NULL;
  res = 0;
 fail:
  if (xdesc != NULL)
    cudnnDestroyTensorDescriptor(xdesc);
  if (hxdesc != NULL)
    cudnnDestroyTensorDescriptor(hxdesc);
  if (cxdesc != NULL)
    cudnnDestroyTensorDescriptor(cxdesc);
  if (wdesc != NULL)
    cudnnDestroyFilterDescriptor(wdesc);
  if (ydesc != NULL)
    cudnnDestroyTensorDescriptor(ydesc);
  if (hydesc != NULL)
    cudnnDestroyTensorDescriptor(hydesc);
  if (cydesc != NULL)
    cudnnDestroyTensorDescriptor(cydesc);
  free(xl);
  free(yl);
  if (workspace != NULL)
    gpudata_release(workspace);
  if (resv != NULL)
    gpudata_release(resv);  /* only non-NULL on a failure after alloc */
  cuda_exit(c->ctx);
  return res;
}
#section support_code
/* Computes the RNN gradients with respect to the inputs
 * (cudnnRNNBackwardData).
 *
 * y/dy are (seqLength, miniBatch, hiddenSizeDir); xshp is the input
 * size used for the dx shape.  cx, dcy may be NULL for modes without
 * a cell state.  Outputs: *dx, *dhx, *dcx (when cx != NULL) and
 * *oreserve, a fresh copy of the forward reserve updated by the
 * backward pass (needed by the weight-gradient pass).  Returns 0 on
 * success, -1 on error with a Python exception set.
 */
int dnn_rnn_gi(cudnnRNNDescriptor_t desc, npy_uint64 xshp,
               PyGpuArrayObject *y, PyGpuArrayObject *dy,
               PyGpuArrayObject *dhy, PyGpuArrayObject *w,
               PyGpuArrayObject *hx, gpudata *reserve, PyGpuArrayObject *dcy,
               PyGpuArrayObject *cx, gpudata **oreserve,
               PyGpuArrayObject **dx, PyGpuArrayObject **dhx,
               PyGpuArrayObject **dcx, cudnnHandle_t _handle) {
  PyGpuContextObject *c = y->context;
  cudnnTensorDescriptor_t ydesc = NULL;
  cudnnTensorDescriptor_t dhydesc = NULL;
  cudnnTensorDescriptor_t dcydesc = NULL;
  cudnnFilterDescriptor_t wdesc = NULL;
  cudnnTensorDescriptor_t hxdesc = NULL;
  cudnnTensorDescriptor_t cxdesc = NULL;
  cudnnTensorDescriptor_t dxdesc = NULL;
  cudnnTensorDescriptor_t dhxdesc = NULL;
  cudnnTensorDescriptor_t dcxdesc = NULL;
  cudnnTensorDescriptor_t *yl = NULL;
  cudnnTensorDescriptor_t *dxl = NULL;
  gpudata *workspace = NULL;
  gpudata *ores = NULL;  /* published to *oreserve only on success */
  size_t worksize, ressize;
  size_t seqLength = PyGpuArray_DIM(y, 0);
  size_t miniBatch = PyGpuArray_DIM(y, 1);
  size_t inputSize = xshp;
  size_t shape[3];
  int dims[3], strs[3];
  cudnnStatus_t err;
  cudnnDataType_t dt;
  int res = -1;
  switch (y->ga.typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Unsupported data type for y");
    return -1;
  }
  /* Entered early to match the single exit in the fail label. */
  cuda_enter(c->ctx);
  err = cudnnCreateTensorDescriptor(&ydesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }
  /* We need to use the last two dimensions for this, this is not a typo */
  dims[0] = PyGpuArray_DIM(y, 1);
  dims[1] = PyGpuArray_DIM(y, 2);
  dims[2] = 1;
  strs[0] = dims[2] * dims[1];
  strs[1] = dims[2];
  strs[2] = 1;
  err = cudnnSetTensorNdDescriptor(ydesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }
  if (c_make_tensorNd(dhy, &dhydesc) != 0)
    goto fail;
  if (dcy != NULL)
    if (c_make_tensorNd(dcy, &dcydesc) != 0)
      goto fail;
  if (c_make_filter(w, &wdesc) != 0)
    goto fail;
  if (c_make_tensorNd(hx, &hxdesc) != 0)
    goto fail;
  if (cx != NULL)
    if (c_make_tensorNd(cx, &cxdesc) != 0)
      goto fail;
  shape[0] = seqLength;
  shape[1] = miniBatch;
  shape[2] = inputSize;
  if (theano_prep_output(dx, 3, shape, y->ga.typecode, GA_C_ORDER, c) != 0)
    goto fail;
  err = cudnnCreateTensorDescriptor(&dxdesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create dxdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }
  /* Again not a typo, we need to use the last two dimensions */
  dims[0] = shape[1];
  dims[1] = shape[2];
  dims[2] = 1;
  strs[0] = dims[2] * dims[1];
  strs[1] = dims[2];
  strs[2] = 1;
  err = cudnnSetTensorNdDescriptor(dxdesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set dxdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }
  if (theano_prep_output(dhx, 3, PyGpuArray_DIMS(hx), hx->ga.typecode,
                         GA_C_ORDER, c) != 0)
    goto fail;
  if (c_make_tensorNd(*dhx, &dhxdesc) != 0)
    goto fail;
  if (cx != NULL) {
    if (theano_prep_output(dcx, 3, PyGpuArray_DIMS(cx), cx->ga.typecode,
                           GA_C_ORDER, c) != 0)
      goto fail;
    if (c_make_tensorNd(*dcx, &dcxdesc) != 0)
      goto fail;
  }
  /* One descriptor per timestep; all steps share the same shape. */
  yl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), seqLength);
  if (yl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }
  for (size_t i = 0; i < seqLength; i++)
    yl[i] = ydesc;
  dxl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), seqLength);
  if (dxl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }
  for (size_t i = 0; i < seqLength; i++)
    dxl[i] = dxdesc;
  err = cudnnGetRNNWorkspaceSize(_handle, desc, (int)seqLength, dxl, &worksize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not get worksize: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }
  workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
  if (workspace == NULL) {
    PyErr_Format(PyExc_RuntimeError, "Could not allocate workspace");
    goto fail;
  }
  err = cudnnGetRNNTrainingReserveSize(_handle, desc, (int)seqLength,
                                       dxl, &ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not get reserve size: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }
  /* Work on a copy of the forward reserve: cudnn updates it in place
     and the weight-gradient pass needs the updated version. */
  ores = gpudata_alloc(c->ctx, ressize, NULL, 0, NULL);
  if (ores == NULL) {
    PyErr_Format(PyExc_RuntimeError, "Could not allocate reserve");
    goto fail;
  }
  if (gpudata_move(ores, 0, reserve, 0, ressize) != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "could not copy reserve");
    goto fail;
  }
  err = cudnnRNNBackwardData(_handle, desc, (int)seqLength,
                             yl, PyGpuArray_DEV_DATA(y),
                             /* y and dy are the same shape */
                             yl, PyGpuArray_DEV_DATA(dy),
                             dhydesc, PyGpuArray_DEV_DATA(dhy),
                             dcydesc, dcy ? PyGpuArray_DEV_DATA(dcy) : NULL,
                             wdesc, PyGpuArray_DEV_DATA(w),
                             hxdesc, PyGpuArray_DEV_DATA(hx),
                             cxdesc, cx ? PyGpuArray_DEV_DATA(cx) : NULL,
                             dxl, PyGpuArray_DEV_DATA(*dx),
                             dhxdesc, PyGpuArray_DEV_DATA(*dhx),
                             /* *dcx is only prepared when cx != NULL
                                (see above), so guard on cx, not dcx */
                             dcxdesc, cx ? PyGpuArray_DEV_DATA(*dcx) : NULL,
                             *(void **)workspace, worksize,
                             *(void **)ores, ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not run RNN grad inputs: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }
  *oreserve = ores;
  ores = NULL;
  res = 0;
 fail:
  if (ydesc != NULL)
    cudnnDestroyTensorDescriptor(ydesc);
  if (dhydesc != NULL)
    cudnnDestroyTensorDescriptor(dhydesc);
  if (dcydesc != NULL)
    cudnnDestroyTensorDescriptor(dcydesc);
  if (wdesc != NULL)
    cudnnDestroyFilterDescriptor(wdesc);
  if (hxdesc != NULL)
    cudnnDestroyTensorDescriptor(hxdesc);
  if (cxdesc != NULL)
    cudnnDestroyTensorDescriptor(cxdesc);
  if (dxdesc != NULL)
    cudnnDestroyTensorDescriptor(dxdesc);
  if (dhxdesc != NULL)
    cudnnDestroyTensorDescriptor(dhxdesc);
  if (dcxdesc != NULL)
    cudnnDestroyTensorDescriptor(dcxdesc);
  free(yl);
  free(dxl);
  if (workspace != NULL)
    gpudata_release(workspace);
  if (ores != NULL)
    gpudata_release(ores);  /* only non-NULL on a failure after alloc */
  cuda_exit(c->ctx);
  return res;
}
#section support_code
/* Computes the RNN gradient with respect to the weights
 * (cudnnRNNBackwardWeights).
 *
 * x is (iters, miniBatch, inputSize), y the forward output and
 * reserve the (updated) reserve space from the backward-data pass.
 * _wsize is the flat size of the weight buffer; *dw receives a 1-D
 * array of that size holding the gradient.  Returns 0 on success, -1
 * on error with a Python exception set.
 */
int dnn_rnn_gw(cudnnRNNDescriptor_t desc, npy_uint64 _wsize,
               PyGpuArrayObject *x, PyGpuArrayObject *hx,
               PyGpuArrayObject *y, gpudata *reserve,
               PyGpuArrayObject **dw, cudnnHandle_t _handle) {
  PyGpuContextObject *c = x->context;
  cudnnTensorDescriptor_t xdesc = NULL;
  cudnnTensorDescriptor_t hxdesc = NULL;
  cudnnTensorDescriptor_t ydesc = NULL;
  cudnnFilterDescriptor_t dwdesc = NULL;
  cudnnTensorDescriptor_t *xl = NULL;
  cudnnTensorDescriptor_t *yl = NULL;
  gpudata *workspace = NULL;
  size_t worksize, ressize;
  size_t iters = PyGpuArray_DIM(x, 0);
  size_t wsize = _wsize;
  int dims[3], strs[3];
  cudnnStatus_t err;
  cudnnDataType_t dt;
  int res = -1;
  switch (x->ga.typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Unsupported data type for x");
    return -1;
  }
  // This is early to match the exit() in the fail label.
  cuda_enter(c->ctx);
  err = cudnnCreateTensorDescriptor(&xdesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create xdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }
  /* We need to use the last two dimensions for this, this is not a typo */
  dims[0] = PyGpuArray_DIM(x, 1);
  dims[1] = PyGpuArray_DIM(x, 2);
  dims[2] = 1;
  strs[0] = dims[2] * dims[1];
  strs[1] = dims[2];
  strs[2] = 1;
  err = cudnnSetTensorNdDescriptor(xdesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set xdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }
  if (c_make_tensorNd(hx, &hxdesc) != 0)
    goto fail;
  err = cudnnCreateTensorDescriptor(&ydesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }
  /* Again not a typo, we need to use the last two dimensions */
  dims[0] = PyGpuArray_DIM(y, 1);
  dims[1] = PyGpuArray_DIM(y, 2);
  dims[2] = 1;
  strs[0] = dims[2] * dims[1];
  strs[1] = dims[2];
  strs[2] = 1;
  err = cudnnSetTensorNdDescriptor(ydesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }
  /* The weight gradient is returned flat, like the weights. */
  if (theano_prep_output(dw, 1, &wsize, x->ga.typecode, GA_C_ORDER, c) != 0)
    goto fail;
  if (c_make_filter(*dw, &dwdesc) != 0)
    goto fail;
  /* One descriptor per timestep; all steps share the same shape. */
  xl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), iters);
  if (xl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }
  for (size_t i = 0; i < iters; i++)
    xl[i] = xdesc;
  yl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), iters);
  if (yl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }
  for (size_t i = 0; i < iters; i++)
    yl[i] = ydesc;
  err = cudnnGetRNNWorkspaceSize(_handle, desc, (int)iters,
                                 xl, &worksize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not get worksize: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }
  workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
  if (workspace == NULL) {
    PyErr_Format(PyExc_RuntimeError, "Could not allocate workspace");
    goto fail;
  }
  /* Recompute the reserve size; the caller-supplied reserve must be
     at least this big (it came from the forward pass). */
  err = cudnnGetRNNTrainingReserveSize(_handle, desc, (int)iters,
                                       xl, &ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not get reserve size: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }
  err = cudnnRNNBackwardWeights(_handle, desc, (int)iters,
                                xl, PyGpuArray_DEV_DATA(x),
                                hxdesc, PyGpuArray_DEV_DATA(hx),
                                yl, PyGpuArray_DEV_DATA(y),
                                *(void **)workspace, worksize,
                                dwdesc, PyGpuArray_DEV_DATA(*dw),
                                *(void **)reserve, ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not run RNN grad weights: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }
  res = 0;
 fail:
  if (xdesc != NULL)
    cudnnDestroyTensorDescriptor(xdesc);
  if (hxdesc != NULL)
    cudnnDestroyTensorDescriptor(hxdesc);
  if (ydesc != NULL)
    cudnnDestroyTensorDescriptor(ydesc);
  if (dwdesc != NULL)
    cudnnDestroyFilterDescriptor(dwdesc);
  free(xl);
  free(yl);
  if (workspace != NULL)
    gpudata_release(workspace);
  cuda_exit(c->ctx);
  return res;
}
#section support_code
/* Queries cudnn for the size (in bytes) of the packed RNN parameter
 * buffer for inputs of shape isize = (miniBatch, inputSize).
 *
 * On success, *oparam_size receives the size.  Returns 0 on success,
 * -1 on error with a Python exception set.
 */
int dnn_rnn_paramsize(cudnnRNNDescriptor_t desc,
                      PyArrayObject *isize,
                      npy_int32 typecode,
                      npy_uint64 *oparam_size,
                      cudnnHandle_t _handle) {
  cudnnTensorDescriptor_t xdesc;
  size_t param_size;
  cudnnStatus_t err;
  cudnnDataType_t dt;
  int shape[3];
  int strides[3];
  if (PyArray_DIM(isize, 0) != 2) {
    PyErr_SetString(PyExc_ValueError, "input_size should be of length two");
    return -1;
  }
  switch (typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_ValueError, "Unsupported data type");
    return -1;
  }
  err = cudnnCreateTensorDescriptor(&xdesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_SetString(PyExc_RuntimeError, "Could not create tensor descriptor");
    return -1;
  }
  /* NOTE(review): assumes isize holds npy_uint64 elements; verify the
     callers always pass a uint64 array. */
  shape[0] = *(npy_uint64 *)PyArray_GETPTR1(isize, 0);
  shape[1] = *(npy_uint64 *)PyArray_GETPTR1(isize, 1);
  shape[2] = 1;
  strides[0] = shape[2] * shape[1];
  strides[1] = shape[2];
  strides[2] = 1;
  err = cudnnSetTensorNdDescriptor(xdesc, dt, 3, shape, strides);
  if (err != CUDNN_STATUS_SUCCESS) {
    /* was leaked on this path */
    cudnnDestroyTensorDescriptor(xdesc);
    PyErr_Format(PyExc_RuntimeError, "Could not set tensor descriptor: %s",
                 cudnnGetErrorString(err));
    return -1;
  }
  err = cudnnGetRNNParamsSize(_handle, desc, xdesc, &param_size, dt);
  if (err != CUDNN_STATUS_SUCCESS) {
    /* was leaked on this path */
    cudnnDestroyTensorDescriptor(xdesc);
    PyErr_SetString(PyExc_RuntimeError, "Could not get parameter size");
    return -1;
  }
  cudnnDestroyTensorDescriptor(xdesc);
  *oparam_size = param_size;
  return 0;
}
Markdown format
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to comment