Commit 77cc5729 authored by Arnaud Bergeron

Add ops and support code for the RNN binding for cudnn.

Parent 4b94a811
from __future__ import absolute_import, print_function, division
import ctypes
import os
import sys
import warnings

import numpy

@@ -7,7 +9,7 @@ from six import integer_types

import theano
from theano import Op, Apply, tensor, config, Variable
from theano.scalar import as_scalar, constant, Log, get_scalar_type
from theano.tensor import as_tensor_variable
from theano.gradient import DisconnectedType, grad_not_implemented
from theano.gof import Optimizer, local_optimizer, COp

@@ -1790,6 +1792,325 @@ class GpuDnnBatchNormGrad(DnnBase):

    def infer_shape(self, node, shape):
        return [shape[0], shape[2], shape[2]]
gpudata_type = CDataType('gpudata *', 'gpudata_release')
dropoutdesc_type = CDataType('cudnnDropoutDescriptor_t',
'cudnnDestroyDropoutDescriptor')
class GpuDnnDropoutOp(DnnBase):
__props__ = ('inplace',)
def __init__(self, inplace=False):
DnnBase.__init__(self, ["dnn_dropout_fwd.c"], "dnn_dropout_fwd")
self.inplace = inplace
if self.inplace:
self.destroy_map = {1: [2]}
def make_node(self, inp, descriptor, state):
ctx_name = infer_context_name(inp)
inp = as_gpuarray_variable(inp, ctx_name)
return Apply(self, [inp, descriptor, state],
[inp.type(), state.type(), gpudata_type()])
def prepare_node(self, node, storage_map, compute_map):
assert self.inplace, "GpuDnnDropoutOp not inplace"
class _DropoutDescriptor(DnnBase):
__props__ = ('context_name',)
def __init__(self, context_name):
DnnBase.__init__(self, ["dnn_dropout_desc.c"], "dnn_dropout_desc")
self.context_name = context_name
def dnn_context(self, node):
return self.context_name
def do_constant_folding(self, node):
return False
def make_node(self, dropout, seed, context_name):
dropout = as_scalar(dropout).astype('float32')
seed = as_scalar(seed).astype('uint64')
assert context_name == self.context_name
# This is a dirty hack to pass the context because params is
# occupied by the cudnn handle
context = gpu_context_type.make_constant(get_context(context_name))
return Apply(self, [dropout, seed, context],
[dropoutdesc_type(),
GpuArrayType('uint8', (False,),
context_name=context_name)()])
def c_code_cache_version_apply(self, node):
# disable the cache since we can't pickle contexts
return None
def _make_dropout_desc(dropout, seed, context_name):
desc, states = theano.function([], _DropoutDescriptor(context_name)(
dropout, seed, context_name))()
return desc, states
def dropout(x, dropout=0.0, seed=4242):
    desc, states = _make_dropout_desc(dropout, seed, x.type.context_name)
    # The op takes (input, descriptor, state) and returns
    # (output, new state, reserve buffer).
    y, ostates, reserve = GpuDnnDropoutOp()(x, desc, states)
    return y, desc, reserve, states
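A minimal sketch of how this helper can be driven, assuming an initialized gpuarray context and cuDNN 5.x (the variable names are illustrative, not part of the binding):

    import theano.tensor as T
    from theano.gpuarray.dnn import dropout

    x = T.ftensor3('x').transfer(None)  # move the input to the GPU
    y, desc, reserve, states = dropout(x, dropout=0.5, seed=1234)
    # `y` is `x` with ~half of its entries zeroed (the rest rescaled);
    # `desc` and `states` hold the cudnn descriptor and RNG state, and
    # `reserve` is the opaque buffer cudnn needs for the backward pass.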
rnndesc_type = CDataType('cudnnRNNDescriptor_t',
'cudnnDestroyRNNDescriptor')
def as_i32(v):
return as_scalar(v).astype('int32')
class _RNNDescriptor(DnnBase):
__props__ = ('context_name',)
def __init__(self, context_name):
DnnBase.__init__(self, ["dnn_rnn_desc.c"], "dnn_rnn_desc")
self.context_name = context_name
def dnn_context(self, node):
return self.context_name
def do_constant_folding(self, node):
return False
def make_node(self, hidden_size, num_layers, ddesc, input_mode,
direction_mode, rnn_mode, dtype):
hidden_size = as_i32(hidden_size)
num_layers = as_i32(num_layers)
assert 5000 < version() < 5200, "Constants only work for cudnn 5, 5.1"
if input_mode == 'linear':
input_mode = as_i32(0)
elif input_mode == 'skip':
input_mode = as_i32(1)
else:
raise ValueError("input_mode")
if direction_mode == 'unidirectional':
direction_mode = as_i32(0)
elif direction_mode == 'bidirectional':
direction_mode = as_i32(1)
else:
raise ValueError("direction_mode")
if rnn_mode == 'rnn_relu':
rnn_mode = as_i32(0)
elif rnn_mode == 'rnn_tanh':
rnn_mode = as_i32(1)
elif rnn_mode == 'lstm':
rnn_mode = as_i32(2)
elif rnn_mode == 'gru':
rnn_mode = as_i32(3)
else:
raise ValueError("rnn_mode")
dtype = as_i32(gpuarray.dtype_to_typecode(dtype))
return Apply(self, [hidden_size, num_layers,
dropoutdesc_type.make_constant(ddesc),
input_mode, direction_mode, rnn_mode, dtype],
[rnndesc_type()])
def _make_rnn_desc(hidden_size, num_layers, ddesc, rnn_mode,
input_mode, direction_mode, dtype, context_name):
desc = theano.function([], _RNNDescriptor(context_name)(
hidden_size, num_layers, ddesc, input_mode, direction_mode,
rnn_mode, dtype))()
return desc
class _RNNParamSize(DnnBase):
__props__ = ('context_name',)
def __init__(self, context_name):
DnnBase.__init__(self, ["dnn_rnn_paramsize.c"],
"dnn_rnn_paramsize")
self.context_name = context_name
def dnn_context(self, node):
return self.context_name
def do_constant_folding(self, node):
return False
def make_node(self, desc, input_size, typecode):
input_size = as_tensor_variable(input_size).astype('uint64')
typecode = as_i32(typecode)
return Apply(self, [rnndesc_type.make_constant(desc), input_size,
typecode],
[get_scalar_type('uint64')()])
def _get_param_size(desc, input_size, dtype, context_name):
typecode = gpuarray.dtype_to_typecode(dtype)
return theano.function([], _RNNParamSize(context_name)(
desc, input_size, typecode))()
class GpuDnnRNNOp(DnnBase):
__props__ = ()
_cop_numi = 5
_cop_numo = 4
def __init__(self, rnn_mode, direction_mode):
DnnBase.__init__(self, ["dnn_rnn_fwd.c"], 'dnn_rnn_fwd')
self.rnn_mode = rnn_mode
if direction_mode == 'bidirectional':
self.num_dirs = 2
elif direction_mode == 'unidirectional':
self.num_dirs = 1
else:
raise ValueError('direction_mode')
def dnn_context(self, node):
return node.outputs[1].type.context_name
def make_node(self, desc, w, x, hx, cx=None):
        if cx is None:
            context_name = infer_context_name(w, x, hx)
        else:
            context_name = infer_context_name(w, x, hx, cx)
w = as_gpuarray_variable(w, context_name)
x = as_gpuarray_variable(x, context_name)
hx = as_gpuarray_variable(hx, context_name)
inputs = [desc, w, x, hx]
assert w.ndim == 1
assert x.ndim == 3 # seqLength, minibatch, inputSize
assert hx.ndim == 3 # numLayers * bidi, minibatch, hiddenSize
if self.rnn_mode == 'lstm':
cx = as_gpuarray_variable(cx, context_name)
assert cx.ndim == 3 # numLayers * bidi, minibatch, hiddenSize
inputs.append(cx)
_3d = GpuArrayType(dtype=x.dtype, broadcastable=(False, False, False),
context_name=context_name)
reserve = gpudata_type()
y = _3d() # seqLength, minibatch, hiddenSize * bidi
hy = _3d() # numLayers * bidi, miniBatch, hiddenSize
outputs = [reserve, y, hy]
if self.rnn_mode == 'lstm':
cy = _3d() # numLayers * bidi, miniBatch, hiddenSize
outputs.append(cy)
return Apply(self, inputs, outputs)
def grad2(self, inputs, outputs, output_grads):
desc, w, x, hx = inputs[:4]
cx = inputs[4] if len(inputs) == 5 else None
reserve, y, hy = outputs[:3]
_, dy, dhy = output_grads[:3]
dcy = output_grads[3] if len(output_grads) == 4 else None
dinputs = GpuDnnRNNGradInputs()(
desc, x, y, dy, dhy, dcy, w, hx, cx, reserve, return_list=True)
reserve2, dx, dhx = dinputs[:3]
dw = GpuDnnRNNGradWeights()(
desc, x, hx, y, reserve2, w)
res = [DisconnectedType()(), dw, dx, dhx]
if cx is not None:
res.append(dinputs[3]) # dcx
return res
def connection_pattern(self, node):
deconn = [[False] * len(node.outputs)]
conn = [[True] * len(node.outputs)] * (len(node.inputs) - 1)
return deconn + conn
class GpuDnnRNNGradInputs(DnnBase):
__props__ = ()
_cop_numi = 10
_cop_numo = 4
def __init__(self):
DnnBase.__init__(self, ['dnn_rnn_gi.c'], 'dnn_rnn_gi')
def dnn_context(self, node):
return node.outputs[1].type.context_name
def make_node(self, desc, x, y, dy, dhy, dcy, w, hx, cx, reserve):
# We trust the callers here
xshp = as_scalar(x.shape[2]).astype('uint64')
inputs = [desc, xshp, y, dy, dhy, w, hx, reserve]
outputs = [reserve.type(), x.type(), hx.type()]
if dcy is not None:
inputs.append(dcy)
inputs.append(cx)
outputs.append(cx.type())
return Apply(self, inputs, outputs)
class GpuDnnRNNGradWeights(DnnBase):
__props__ = ()
def __init__(self):
DnnBase.__init__(self, ['dnn_rnn_gw.c'], 'dnn_rnn_gw')
def make_node(self, desc, x, hx, y, reserve, w):
# We trust the callers here
wsize = as_scalar(w.shape[0]).astype('uint64')
inputs = [desc, wsize, x, hx, y, reserve]
outputs = [w.type()]
return Apply(self, inputs, outputs)
class RNNBlock(object):
def __init__(self, dtype, hidden_size, num_layers, rnn_mode,
input_mode='linear', direction_mode='unidirectional',
dropout=0.0, dropout_seed=4242, context_name=None):
"""
dtype: data type of computation
hidden_size: int
num_layers: int
rnn_mode: {'rnn_relu', 'rnn_tanh', 'lstm', 'gru'}
See cudnn documentation for cudnnRNNMode_t.
input_mode: {'linear', 'skip'}
linear: input will be multiplied by a biased matrix
skip: No operation is performed on the input. The size must match the hidden size.
direction_mode: {'unidirectional', 'bidirectional'}
unidirectional: The network operates recurrently from the
first input to the last.
bidirectional: The operates from first to last then from last to first and concatenates the results at each layer.
"""
ddesc, states = _make_dropout_desc(dropout, dropout_seed, context_name)
self.ddesc = ddesc
self.dstates = states
self.desc = _make_rnn_desc(hidden_size, num_layers,
ddesc, rnn_mode, input_mode,
direction_mode, dtype, context_name)
self.rnn_mode = rnn_mode
self.direction_mode = direction_mode
self.context_name = context_name
self.dtype = dtype
def get_param_size(self, input_size):
bytesize = _get_param_size(self.desc, input_size, self.dtype,
self.context_name)
bytesize = int(bytesize)
assert bytesize % numpy.dtype(self.dtype).itemsize == 0
return bytesize // numpy.dtype(self.dtype).itemsize
def apply(self, w, x, hx, cx=None):
# Don't return the reserve as an output
return GpuDnnRNNOp(self.rnn_mode, self.direction_mode)(
rnndesc_type.make_constant(self.desc),
w, x, hx, cx, return_list=True)[1:]
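A usage sketch for the whole block, assuming cuDNN 5.x and an initialized gpuarray context; the sizes and variable names below are illustrative only:

    import numpy
    import theano
    import theano.tensor as T
    from theano.gpuarray.dnn import RNNBlock
    from theano.gpuarray.type import gpuarray_shared_constructor

    hidden, layers, batch, isize = 128, 1, 32, 64
    rnnb = RNNBlock('float32', hidden, layers, 'gru')

    # get_param_size returns a count in elements (bytes / itemsize);
    # cudnn expects all the weights in one flat buffer of that size.
    psize = rnnb.get_param_size([batch, isize])
    w = gpuarray_shared_constructor(numpy.zeros(psize, dtype='float32'))

    x = T.ftensor3('x')    # (seqLength, miniBatch, inputSize)
    hx = T.ftensor3('hx')  # (numLayers, miniBatch, hiddenSize)
    y, hy = rnnb.apply(w, x, hx)  # a GRU has no cell state, so no cx/cy
    f = theano.function([x, hx], [y, hy])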
def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
                                  epsilon=1e-4):
...
#section support_code
int dnn_dropout_desc(float dropout, unsigned long long seed,
PyGpuContextObject *c,
cudnnDropoutDescriptor_t *odesc,
PyGpuArrayObject **ostates,
cudnnHandle_t _handle) {
PyGpuArrayObject *states;
cudnnDropoutDescriptor_t desc;
size_t states_sz;
cudnnStatus_t err;
cuda_enter(c->ctx);
err = cudnnCreateDropoutDescriptor(&desc);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_SetString(PyExc_RuntimeError, "Can't create dropout descriptor");
cuda_exit(c->ctx);
return -1;
}
/* Can't fail according to docs */
cudnnDropoutGetStatesSize(_handle, &states_sz);
states = pygpu_empty(1, &states_sz, GA_UBYTE, GA_C_ORDER, c, Py_None);
if (states == NULL) {
cudnnDestroyDropoutDescriptor(desc);
cuda_exit(c->ctx);
return -1;
}
err = cudnnSetDropoutDescriptor(desc, _handle, dropout,
PyGpuArray_DEV_DATA(states),
states_sz, seed);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_SetString(PyExc_RuntimeError, "Can't set dropout descriptor");
Py_DECREF((PyObject *)states);
cudnnDestroyDropoutDescriptor(desc);
cuda_exit(c->ctx);
return -1;
}
cuda_exit(c->ctx);
*odesc = desc;
*ostates = states;
return 0;
}
#section support_code
int dnn_dropout_fwd(PyGpuArrayObject *x,
                    cudnnDropoutDescriptor_t desc,
PyGpuArrayObject *state,
PyGpuArrayObject **y,
PyGpuArrayObject **ostate,
gpudata **reserve,
cudnnHandle_t _handle) {
  PyGpuContextObject *c = x->context;
cudnnTensorDescriptor_t xdesc;
cudnnTensorDescriptor_t ydesc;
gpudata *res;
size_t res_sz;
cudnnStatus_t err;
if (c_make_tensorNd(x, &xdesc))
return -1;
if (theano_prep_output(y, x->ga.nd, x->ga.dimensions, x->ga.typecode,
GA_C_ORDER, c)) {
cudnnDestroyTensorDescriptor(xdesc);
return -1;
}
  if (c_make_tensorNd(*y, &ydesc)) {
cudnnDestroyTensorDescriptor(xdesc);
return -1;
}
*ostate = state;
Py_INCREF((PyObject *)state);
/* This can't fail according to the docs */
  err = cudnnDropoutGetReserveSpaceSize(xdesc, &res_sz);
  res = gpudata_alloc(c->ctx, res_sz, NULL, 0, NULL);
  if (res == NULL) {
    cudnnDestroyTensorDescriptor(xdesc);
    cudnnDestroyTensorDescriptor(ydesc);
    PyErr_SetString(PyExc_RuntimeError, "Could not allocate reserve for dropout");
    return -1;
  }
*reserve = res;
cuda_enter(c->ctx);
  err = cudnnDropoutForward(_handle, desc, xdesc, PyGpuArray_DEV_DATA(x),
                            ydesc, PyGpuArray_DEV_DATA(*y), *(void **)res,
                            res_sz);
cudnnDestroyTensorDescriptor(xdesc);
cudnnDestroyTensorDescriptor(ydesc);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not run dropout: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return -1;
}
cuda_exit(c->ctx);
return 0;
}
#section support_code
int dnn_rnn_desc(int hidden_size, int num_layers,
cudnnDropoutDescriptor_t ddesc,
int input_mode, int direction_mode, int rnn_mode,
int dtype, cudnnRNNDescriptor_t *odesc,
cudnnHandle_t _handle) {
cudnnRNNDescriptor_t desc;
cudnnDataType_t data_type;
cudnnStatus_t err;
switch (dtype) {
case GA_FLOAT:
data_type = CUDNN_DATA_FLOAT;
break;
case GA_DOUBLE:
data_type = CUDNN_DATA_DOUBLE;
break;
case GA_HALF:
data_type = CUDNN_DATA_HALF;
break;
default:
PyErr_SetString(PyExc_ValueError, "Unsupported data type");
return -1;
}
err = cudnnCreateRNNDescriptor(&desc);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_SetString(PyExc_RuntimeError, "Can't create RNN descriptor");
return -1;
}
err = cudnnSetRNNDescriptor(desc, hidden_size, num_layers, ddesc,
(cudnnRNNInputMode_t)input_mode,
(cudnnDirectionMode_t)direction_mode,
(cudnnRNNMode_t)rnn_mode, data_type);
if (err != CUDNN_STATUS_SUCCESS) {
cudnnDestroyRNNDescriptor(desc);
PyErr_SetString(PyExc_RuntimeError, "Can't set RNN descriptor");
return -1;
}
*odesc = desc;
return 0;
}
#section support_code
int dnn_rnn_fwd(cudnnRNNDescriptor_t desc,
PyGpuArrayObject *w, PyGpuArrayObject *x,
PyGpuArrayObject *hx, PyGpuArrayObject *cx,
gpudata **reserve, PyGpuArrayObject **y,
PyGpuArrayObject **hy, PyGpuArrayObject **cy,
cudnnHandle_t _handle) {
PyGpuContextObject *c = x->context;
cudnnTensorDescriptor_t xdesc = NULL;
cudnnTensorDescriptor_t hxdesc = NULL;
cudnnTensorDescriptor_t cxdesc = NULL;
cudnnTensorDescriptor_t ydesc = NULL;
cudnnTensorDescriptor_t hydesc = NULL;
cudnnTensorDescriptor_t cydesc = NULL;
cudnnFilterDescriptor_t wdesc = NULL;
cudnnTensorDescriptor_t *xl = NULL;
cudnnTensorDescriptor_t *yl = NULL;
gpudata *workspace = NULL;
size_t worksize, ressize;
size_t seqLength = PyGpuArray_DIM(x, 0);
size_t miniBatch = PyGpuArray_DIM(x, 1);
size_t inputSize = PyGpuArray_DIM(x, 2);
size_t hiddenSizeDir = PyGpuArray_DIM(hx, 2);
size_t shape[3];
int strs[3], dims[3];
cudnnStatus_t err;
cudnnDataType_t dt;
int res = -1;
switch (x->ga.typecode) {
case GA_FLOAT:
dt = CUDNN_DATA_FLOAT;
break;
case GA_DOUBLE:
dt = CUDNN_DATA_DOUBLE;
break;
case GA_HALF:
dt = CUDNN_DATA_HALF;
break;
default:
PyErr_SetString(PyExc_TypeError, "Unsupported data type for x");
return -1;
}
// This is early to match the exit() in the fail label.
cuda_enter(c->ctx);
err = cudnnCreateTensorDescriptor(&xdesc);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not create xdesc: %s",
cudnnGetErrorString(err));
goto fail;
}
dims[0] = PyGpuArray_DIM(x, 1);
dims[1] = PyGpuArray_DIM(x, 2);
dims[2] = 1;
strs[0] = dims[1] * dims[2];
strs[1] = dims[2];
strs[2] = 1;
err = cudnnSetTensorNdDescriptor(xdesc, dt, 3, dims, strs);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not set xdesc: %s",
cudnnGetErrorString(err));
goto fail;
}
if (c_make_tensorNd(hx, &hxdesc) != 0)
goto fail;
if (cx != NULL)
if (c_make_tensorNd(cx, &cxdesc) != 0)
goto fail;
if (c_make_filter(w, &wdesc) != 0)
goto fail;
shape[0] = seqLength;
shape[1] = miniBatch;
shape[2] = hiddenSizeDir;
if (theano_prep_output(y, 3, shape, x->ga.typecode, GA_C_ORDER, c) != 0)
goto fail;
err = cudnnCreateTensorDescriptor(&ydesc);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not create ydesc: %s",
cudnnGetErrorString(err));
goto fail;
}
dims[0] = shape[1];
dims[1] = shape[2];
dims[2] = 1;
strs[0] = dims[2] * dims[1];
strs[1] = dims[2];
strs[2] = 1;
err = cudnnSetTensorNdDescriptor(ydesc, dt, 3, dims, strs);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not set ydesc: %s",
cudnnGetErrorString(err));
goto fail;
}
if (theano_prep_output(hy, 3, PyGpuArray_DIMS(hx),
hx->ga.typecode, GA_C_ORDER, c) != 0)
goto fail;
if (c_make_tensorNd(*hy, &hydesc) != 0)
goto fail;
if (cy != NULL) {
if (theano_prep_output(cy, 3, PyGpuArray_DIMS(cx),
cx->ga.typecode, GA_C_ORDER, c) != 0)
goto fail;
if (c_make_tensorNd(*cy, &cydesc) != 0)
goto fail;
}
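  /* cudnn wants one tensor descriptor per time step; every step has the
     same shape here, so the arrays below reuse a single descriptor. */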
  xl = (cudnnTensorDescriptor_t *)calloc(seqLength, sizeof(cudnnTensorDescriptor_t));
if (xl == NULL) {
PyErr_NoMemory();
goto fail;
}
for (size_t i = 0; i < seqLength; i++)
xl[i] = xdesc;
  yl = (cudnnTensorDescriptor_t *)calloc(seqLength, sizeof(cudnnTensorDescriptor_t));
if (yl == NULL) {
PyErr_NoMemory();
goto fail;
}
for (size_t i = 0; i < seqLength; i++)
yl[i] = ydesc;
err = cudnnGetRNNWorkspaceSize(_handle, desc, (int)seqLength,
xl, &worksize);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not get worksize: %s",
cudnnGetErrorString(err));
goto fail;
}
workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) {
PyErr_Format(PyExc_RuntimeError, "Could not allocate workspace");
goto fail;
}
err = cudnnGetRNNTrainingReserveSize(_handle, desc, (int)seqLength,
xl, &ressize);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not get reserve size: %s",
cudnnGetErrorString(err));
goto fail;
}
*reserve = gpudata_alloc(c->ctx, ressize, NULL, 0, NULL);
if (*reserve == NULL) {
PyErr_Format(PyExc_RuntimeError, "Could not allocate reserve");
goto fail;
}
err = cudnnRNNForwardTraining(_handle, desc, (int)seqLength,
xl, PyGpuArray_DEV_DATA(x),
hxdesc, PyGpuArray_DEV_DATA(hx),
cxdesc, cx ? PyGpuArray_DEV_DATA(cx) : NULL,
wdesc, PyGpuArray_DEV_DATA(w),
yl, PyGpuArray_DEV_DATA(*y),
hydesc, PyGpuArray_DEV_DATA(*hy),
cydesc, cy ? PyGpuArray_DEV_DATA(*cy) : NULL,
*(void **)workspace, worksize,
*(void **)(*reserve), ressize);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could run RNN: %s",
cudnnGetErrorString(err));
goto fail;
}
res = 0;
fail:
if (xdesc != NULL)
cudnnDestroyTensorDescriptor(xdesc);
if (hxdesc != NULL)
cudnnDestroyTensorDescriptor(hxdesc);
if (cxdesc != NULL)
cudnnDestroyTensorDescriptor(cxdesc);
if (wdesc != NULL)
cudnnDestroyFilterDescriptor(wdesc);
if (ydesc != NULL)
cudnnDestroyTensorDescriptor(ydesc);
if (hydesc != NULL)
cudnnDestroyTensorDescriptor(hydesc);
if (cydesc != NULL)
cudnnDestroyTensorDescriptor(cydesc);
free(xl);
free(yl);
if (workspace != NULL)
gpudata_release(workspace);
cuda_exit(c->ctx);
return res;
}
#section support_code
int dnn_rnn_gi(cudnnRNNDescriptor_t desc, npy_uint64 xshp,
PyGpuArrayObject *y, PyGpuArrayObject *dy,
PyGpuArrayObject *dhy, PyGpuArrayObject *w,
PyGpuArrayObject *hx, gpudata *reserve, PyGpuArrayObject *dcy,
PyGpuArrayObject *cx, gpudata **oreserve,
PyGpuArrayObject **dx, PyGpuArrayObject **dhx,
PyGpuArrayObject **dcx, cudnnHandle_t _handle) {
PyGpuContextObject *c = y->context;
cudnnTensorDescriptor_t ydesc = NULL;
cudnnTensorDescriptor_t dhydesc = NULL;
cudnnTensorDescriptor_t dcydesc = NULL;
cudnnFilterDescriptor_t wdesc = NULL;
cudnnTensorDescriptor_t hxdesc = NULL;
cudnnTensorDescriptor_t cxdesc = NULL;
cudnnTensorDescriptor_t dxdesc = NULL;
cudnnTensorDescriptor_t dhxdesc = NULL;
cudnnTensorDescriptor_t dcxdesc = NULL;
cudnnTensorDescriptor_t *yl = NULL;
cudnnTensorDescriptor_t *dxl = NULL;
gpudata *workspace = NULL;
size_t worksize, ressize;
size_t seqLength = PyGpuArray_DIM(y, 0);
size_t miniBatch = PyGpuArray_DIM(y, 1);
size_t inputSize = xshp;
size_t shape[3];
int dims[3], strs[3];
cudnnStatus_t err;
cudnnDataType_t dt;
int res = -1;
switch (y->ga.typecode) {
case GA_FLOAT:
dt = CUDNN_DATA_FLOAT;
break;
case GA_DOUBLE:
dt = CUDNN_DATA_DOUBLE;
break;
case GA_HALF:
dt = CUDNN_DATA_HALF;
break;
default:
PyErr_SetString(PyExc_TypeError, "Unsupported data type for y");
return -1;
}
cuda_enter(c->ctx);
err = cudnnCreateTensorDescriptor(&ydesc);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not create ydesc: %s",
cudnnGetErrorString(err));
goto fail;
}
/* We need to use the last two dimensions for this, this is not a typo */
dims[0] = PyGpuArray_DIM(y, 1);
dims[1] = PyGpuArray_DIM(y, 2);
dims[2] = 1;
strs[0] = dims[2] * dims[1];
strs[1] = dims[2];
strs[2] = 1;
err = cudnnSetTensorNdDescriptor(ydesc, dt, 3, dims, strs);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not set ydesc: %s",
cudnnGetErrorString(err));
goto fail;
}
if (c_make_tensorNd(dhy, &dhydesc) != 0)
goto fail;
if (dcy != NULL)
if (c_make_tensorNd(dcy, &dcydesc) != 0)
goto fail;
if (c_make_filter(w, &wdesc) != 0)
goto fail;
if (c_make_tensorNd(hx, &hxdesc) != 0)
goto fail;
if (cx != NULL)
if (c_make_tensorNd(cx, &cxdesc) != 0)
goto fail;
shape[0] = seqLength;
shape[1] = miniBatch;
shape[2] = inputSize;
if (theano_prep_output(dx, 3, shape, y->ga.typecode, GA_C_ORDER, c) != 0)
goto fail;
err = cudnnCreateTensorDescriptor(&dxdesc);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not create dxdesc: %s",
cudnnGetErrorString(err));
goto fail;
}
/* Again not a typo, we need to use the last two dimensions */
dims[0] = shape[1];
dims[1] = shape[2];
dims[2] = 1;
strs[0] = dims[2] * dims[1];
strs[1] = dims[2];
strs[2] = 1;
err = cudnnSetTensorNdDescriptor(dxdesc, dt, 3, dims, strs);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not set dxdesc: %s",
cudnnGetErrorString(err));
goto fail;
}
if (theano_prep_output(dhx, 3, PyGpuArray_DIMS(hx), hx->ga.typecode,
GA_C_ORDER, c) != 0)
goto fail;
if (c_make_tensorNd(*dhx, &dhxdesc) != 0)
goto fail;
if (cx != NULL) {
if (theano_prep_output(dcx, 3, PyGpuArray_DIMS(cx), cx->ga.typecode,
GA_C_ORDER, c) != 0)
goto fail;
if (c_make_tensorNd(*dcx, &dcxdesc) != 0)
goto fail;
}
  yl = (cudnnTensorDescriptor_t *)calloc(seqLength, sizeof(cudnnTensorDescriptor_t));
if (yl == NULL) {
PyErr_NoMemory();
goto fail;
}
for (size_t i = 0; i < seqLength; i++)
yl[i] = ydesc;
  dxl = (cudnnTensorDescriptor_t *)calloc(seqLength, sizeof(cudnnTensorDescriptor_t));
if (dxl == NULL) {
PyErr_NoMemory();
goto fail;
}
for (size_t i = 0; i < seqLength; i++)
dxl[i] = dxdesc;
err = cudnnGetRNNWorkspaceSize(_handle, desc, (int)seqLength, dxl, &worksize);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not get worksize: %s",
cudnnGetErrorString(err));
goto fail;
}
workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) {
PyErr_Format(PyExc_RuntimeError, "Could not allocate workspace");
goto fail;
}
err = cudnnGetRNNTrainingReserveSize(_handle, desc, (int)seqLength,
dxl, &ressize);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not get reserve size: %s",
cudnnGetErrorString(err));
goto fail;
}
*oreserve = gpudata_alloc(c->ctx, ressize, NULL, 0, NULL);
if (*oreserve == NULL) {
PyErr_Format(PyExc_RuntimeError, "Could not allocate reserve");
goto fail;
}
if (gpudata_move(*oreserve, 0, reserve, 0, ressize) != GA_NO_ERROR) {
PyErr_SetString(PyExc_RuntimeError, "could not copy reserve");
goto fail;
}
err = cudnnRNNBackwardData(_handle, desc, (int)seqLength,
yl, PyGpuArray_DEV_DATA(y),
/* y and dy are the same shape */
yl, PyGpuArray_DEV_DATA(dy),
dhydesc, PyGpuArray_DEV_DATA(dhy),
dcydesc, dcy ? PyGpuArray_DEV_DATA(dcy) : NULL,
wdesc, PyGpuArray_DEV_DATA(w),
hxdesc, PyGpuArray_DEV_DATA(hx),
cxdesc, cx ? PyGpuArray_DEV_DATA(cx) : NULL,
dxl, PyGpuArray_DEV_DATA(*dx),
dhxdesc, PyGpuArray_DEV_DATA(*dhx),
dcxdesc, dcx ? PyGpuArray_DEV_DATA(*dcx) : NULL,
*(void **)workspace, worksize,
*(void **)(*oreserve), ressize);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could run RNN grad inputs: %s",
cudnnGetErrorString(err));
goto fail;
}
res = 0;
fail:
if (ydesc != NULL)
cudnnDestroyTensorDescriptor(ydesc);
if (dhydesc != NULL)
cudnnDestroyTensorDescriptor(dhydesc);
if (dcydesc != NULL)
cudnnDestroyTensorDescriptor(dcydesc);
if (wdesc != NULL)
cudnnDestroyFilterDescriptor(wdesc);
if (hxdesc != NULL)
cudnnDestroyTensorDescriptor(hxdesc);
if (cxdesc != NULL)
cudnnDestroyTensorDescriptor(cxdesc);
if (dxdesc != NULL)
cudnnDestroyTensorDescriptor(dxdesc);
if (dhxdesc != NULL)
cudnnDestroyTensorDescriptor(dhxdesc);
if (dcxdesc != NULL)
cudnnDestroyTensorDescriptor(dcxdesc);
free(yl);
free(dxl);
if (workspace != NULL)
gpudata_release(workspace);
cuda_exit(c->ctx);
return res;
}
#section support_code
int dnn_rnn_gw(cudnnRNNDescriptor_t desc, npy_uint64 _wsize,
PyGpuArrayObject *x, PyGpuArrayObject *hx,
PyGpuArrayObject *y, gpudata *reserve,
PyGpuArrayObject **dw, cudnnHandle_t _handle) {
PyGpuContextObject *c = x->context;
cudnnTensorDescriptor_t xdesc = NULL;
cudnnTensorDescriptor_t hxdesc = NULL;
cudnnTensorDescriptor_t ydesc = NULL;
cudnnFilterDescriptor_t dwdesc = NULL;
cudnnTensorDescriptor_t *xl = NULL;
cudnnTensorDescriptor_t *yl = NULL;
gpudata *workspace = NULL;
size_t worksize, ressize;
size_t iters = PyGpuArray_DIM(x, 0);
size_t wsize = _wsize;
int dims[3], strs[3];
cudnnStatus_t err;
cudnnDataType_t dt;
int res = -1;
switch (x->ga.typecode) {
case GA_FLOAT:
dt = CUDNN_DATA_FLOAT;
break;
case GA_DOUBLE:
dt = CUDNN_DATA_DOUBLE;
break;
case GA_HALF:
dt = CUDNN_DATA_HALF;
break;
default:
PyErr_SetString(PyExc_TypeError, "Unsupported data type for x");
return -1;
}
// This is early to match the exit() in the fail label.
cuda_enter(c->ctx);
err = cudnnCreateTensorDescriptor(&xdesc);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not create xdesc: %s",
cudnnGetErrorString(err));
goto fail;
}
/* We need to use the last two dimensions for this, this is not a typo */
dims[0] = PyGpuArray_DIM(x, 1);
dims[1] = PyGpuArray_DIM(x, 2);
dims[2] = 1;
strs[0] = dims[2] * dims[1];
strs[1] = dims[2];
strs[2] = 1;
err = cudnnSetTensorNdDescriptor(xdesc, dt, 3, dims, strs);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not set xdesc: %s",
cudnnGetErrorString(err));
goto fail;
}
if (c_make_tensorNd(hx, &hxdesc) != 0)
goto fail;
err = cudnnCreateTensorDescriptor(&ydesc);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not create ydesc: %s",
cudnnGetErrorString(err));
goto fail;
}
/* Again not a typo, we need to use the last two dimensions */
dims[0] = PyGpuArray_DIM(y, 1);
dims[1] = PyGpuArray_DIM(y, 2);
dims[2] = 1;
strs[0] = dims[2] * dims[1];
strs[1] = dims[2];
strs[2] = 1;
err = cudnnSetTensorNdDescriptor(ydesc, dt, 3, dims, strs);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not set ydesc: %s",
cudnnGetErrorString(err));
goto fail;
}
if (theano_prep_output(dw, 1, &wsize, x->ga.typecode, GA_C_ORDER, c) != 0)
goto fail;
if (c_make_filter(*dw, &dwdesc) != 0)
goto fail;
  xl = (cudnnTensorDescriptor_t *)calloc(iters, sizeof(cudnnTensorDescriptor_t));
if (xl == NULL) {
PyErr_NoMemory();
goto fail;
}
for (size_t i = 0; i < iters; i++)
xl[i] = xdesc;
  yl = (cudnnTensorDescriptor_t *)calloc(iters, sizeof(cudnnTensorDescriptor_t));
if (yl == NULL) {
PyErr_NoMemory();
goto fail;
}
for (size_t i = 0; i < iters; i++)
yl[i] = ydesc;
err = cudnnGetRNNWorkspaceSize(_handle, desc, (int)iters,
xl, &worksize);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not get worksize: %s",
cudnnGetErrorString(err));
goto fail;
}
workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) {
PyErr_Format(PyExc_RuntimeError, "Could not allocate workspace");
goto fail;
}
err = cudnnGetRNNTrainingReserveSize(_handle, desc, (int)iters,
xl, &ressize);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could not get reserve size: %s",
cudnnGetErrorString(err));
goto fail;
}
err = cudnnRNNBackwardWeights(_handle, desc, (int)iters,
xl, PyGpuArray_DEV_DATA(x),
hxdesc, PyGpuArray_DEV_DATA(hx),
yl, PyGpuArray_DEV_DATA(y),
*(void **)workspace, worksize,
dwdesc, PyGpuArray_DEV_DATA(*dw),
*(void **)reserve, ressize);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"Could run RNN grad weights: %s",
cudnnGetErrorString(err));
goto fail;
}
res = 0;
fail:
if (xdesc != NULL)
cudnnDestroyTensorDescriptor(xdesc);
if (hxdesc != NULL)
cudnnDestroyTensorDescriptor(hxdesc);
if (ydesc != NULL)
cudnnDestroyTensorDescriptor(ydesc);
if (dwdesc != NULL)
cudnnDestroyFilterDescriptor(dwdesc);
free(xl);
free(yl);
if (workspace != NULL)
gpudata_release(workspace);
cuda_exit(c->ctx);
return res;
}
#section support_code
int dnn_rnn_paramsize(cudnnRNNDescriptor_t desc,
PyArrayObject *isize,
npy_int32 typecode,
npy_uint64 *oparam_size,
cudnnHandle_t _handle) {
cudnnTensorDescriptor_t xdesc;
size_t param_size;
cudnnStatus_t err;
cudnnDataType_t dt;
int shape[3];
int strides[3];
if (PyArray_DIM(isize, 0) != 2) {
PyErr_SetString(PyExc_ValueError, "input_size should be of length two");
return -1;
}
switch (typecode) {
case GA_FLOAT:
dt = CUDNN_DATA_FLOAT;
break;
case GA_DOUBLE:
dt = CUDNN_DATA_DOUBLE;
break;
case GA_HALF:
dt = CUDNN_DATA_HALF;
break;
default:
PyErr_SetString(PyExc_ValueError, "Unsupported data type");
return -1;
}
err = cudnnCreateTensorDescriptor(&xdesc);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_SetString(PyExc_RuntimeError, "Could not create tensor descriptor");
return -1;
}
shape[0] = *(npy_uint64 *)PyArray_GETPTR1(isize, 0);
shape[1] = *(npy_uint64 *)PyArray_GETPTR1(isize, 1);
shape[2] = 1;
strides[0] = shape[2] * shape[1];
strides[1] = shape[2];
strides[2] = 1;
  err = cudnnSetTensorNdDescriptor(xdesc, dt, 3, shape, strides);
  if (err != CUDNN_STATUS_SUCCESS) {
    cudnnDestroyTensorDescriptor(xdesc);
    PyErr_Format(PyExc_RuntimeError, "Could not set tensor descriptor: %s",
                 cudnnGetErrorString(err));
    return -1;
  }
  err = cudnnGetRNNParamsSize(_handle, desc, xdesc, &param_size, dt);
  if (err != CUDNN_STATUS_SUCCESS) {
    cudnnDestroyTensorDescriptor(xdesc);
    PyErr_SetString(PyExc_RuntimeError, "Could not get parameter size");
    return -1;
  }
cudnnDestroyTensorDescriptor(xdesc);
*oparam_size = param_size;
return 0;
}