Commit 3007bf79 authored by Pascal Lamblin, committed by GitHub

Merge pull request #4915 from abergeron/dnn_rnn2

Cudnn RNN bindings.
......@@ -14,11 +14,9 @@ import theano.tensor as T
def test_NanGuardMode():
"""
Tests if NanGuardMode is working by feeding in numpy.inf and numpy.nans
intentionally. A working implementation should be able to capture all
the abnormalties.
"""
# Tests if NanGuardMode is working by feeding in numpy.inf and numpy.nans
# intentionally. A working implementation should be able to capture all
# the abnormalties.
x = T.matrix()
w = theano.shared(numpy.random.randn(5, 7).astype(theano.config.floatX))
y = T.dot(x, w)
......
......@@ -10,7 +10,7 @@ import sys
import warnings
from functools import wraps
from six import StringIO, PY3
from six import StringIO, PY3, iteritems
import theano
from theano.compat import configparser as ConfigParser
......@@ -91,37 +91,44 @@ theano_raw_cfg = ConfigParser.RawConfigParser()
theano_raw_cfg.read(config_files)
def change_flags(**kwargs):
class change_flags(object):
"""
Use this as a decorator to change the value of Theano config variable.
Use this as a decorator or context manager to change the value of
Theano config variables.
Useful during tests.
"""
def change_flags_exec(f):
def __init__(self, **kwargs):
    """Record the config variables named in ``kwargs`` and their new values.

    Each keyword must match exactly one entry of
    ``theano.configparser._config_var_list`` by its ``fullname``.
    """
    conf_map = {}
    for name in kwargs:
        matches = [var for var in theano.configparser._config_var_list
                   if var.fullname == name]
        # Exactly one registered config variable must carry this name.
        assert len(matches) == 1
        conf_map[name] = matches[0]
    self.confs = conf_map
    self.new_vals = kwargs
def __call__(self, f):
    """Decorator form: wrap ``f`` so it runs with the flags applied.

    The new config values are installed before ``f`` executes and the
    previous ones restored afterwards, even if ``f`` raises.  This
    removes the leftover pre-refactor closure that duplicated the
    save/set/restore logic inline; the decorator now shares one code
    path with the context-manager protocol.
    """
    @wraps(f)
    def res(*args, **kwargs):
        # Delegate to __enter__/__exit__ so decorator and `with`
        # usage behave identically.
        with self:
            return f(*args, **kwargs)
    return res
def __enter__(self):
    """Save current values of the tracked config variables, then install
    the requested ones.  On any failure, roll back and re-raise."""
    self.old_vals = dict(
        (name, var.__get__(True, None))
        for name, var in iteritems(self.confs))
    try:
        for name, var in iteritems(self.confs):
            var.__set__(None, self.new_vals[name])
    except:
        # Undo whatever was already applied before propagating.
        self.__exit__()
        raise
def __exit__(self, *args):
    """Restore every config variable to the value saved by ``__enter__``."""
    for name, old in iteritems(self.old_vals):
        self.confs[name].__set__(None, old)
def fetch_val_for_key(key, delete_key=False):
......
......@@ -696,6 +696,9 @@ class PureOp(object):
# Python implementation #
#########################
def L_op(self, inputs, outputs, output_grads):
    """Default L_op: delegate to ``grad``, discarding ``outputs``.

    Ops whose gradient can reuse the already-computed forward
    ``outputs`` override this method; the base implementation simply
    ignores them and calls ``self.grad(inputs, output_grads)``.
    """
    return self.grad(inputs, output_grads)
def R_op(self, inputs, eval_points):
"""
This method is primarily used by tensor.Rop
......
......@@ -14,6 +14,7 @@ import theano
from theano.gof import utils
from theano.gof.utils import MethodNotDefined, object2
from theano.gof import graph
from theano.configparser import change_flags
########
# Type #
......@@ -638,6 +639,8 @@ class CDataType(Type):
have a `void` return and take a single pointer argument.
"""
__props__ = ('ctype', 'freefunc', 'headers', 'header_dirs',
'libraries', 'lib_dirs', 'extra_support_code')
def __init__(self, ctype, freefunc=None, headers=None, header_dirs=None,
libraries=None, lib_dirs=None, extra_support_code=""):
......@@ -647,42 +650,51 @@ class CDataType(Type):
assert isinstance(freefunc, string_types)
self.freefunc = freefunc
if headers is None:
headers = []
self.headers = headers
headers = ()
self.headers = tuple(headers)
if header_dirs is None:
header_dirs = []
self.header_dirs = header_dirs
header_dirs = ()
self.header_dirs = tuple(header_dirs)
if libraries is None:
libraries = []
self.libraries = libraries
libraries = ()
self.libraries = tuple(libraries)
if lib_dirs is None:
lib_dirs = []
self.lib_dirs = lib_dirs
lib_dirs = ()
self.lib_dirs = tuple(lib_dirs)
self.extra_support_code = extra_support_code
self._fn = None
def __eq__(self, other):
    # Two CDataType instances compare equal when they wrap the same
    # C type name and the same free function; the remaining props
    # (headers, libraries, ...) are deliberately not part of equality.
    if type(self) != type(other):
        return False
    return (self.ctype, self.freefunc) == (other.ctype, other.freefunc)
def __hash__(self):
    # Must stay consistent with __eq__, which compares only the class,
    # ctype and freefunc.
    return hash((type(self), self.ctype, self.freefunc))
def filter(self, data, strict=False, allow_downcast=None):
    """Validate ``data`` for this type: only None or a PyCapsule passes.

    ``strict`` and ``allow_downcast`` are accepted for interface
    compatibility but have no effect here.

    Raises
    ------
    TypeError
        If ``data`` is neither None nor an instance of ``_cdata_type``.
    """
    if data is not None and not isinstance(data, _cdata_type):
        raise TypeError("expected None or a PyCapsule")
    return data
def _get_func(self):
"""
Return a function that makes a value from an integer.
The integer value is assumed to be a valid pointer for the
type and no check is done to ensure that.
"""
from theano.scalar import get_scalar_type
if self._fn is None:
v = get_scalar_type('int64')()
self._fn = theano.function([v], _make_cdata(self)(v), profile=False)
with change_flags(compute_test_value='off'):
v = get_scalar_type('int64')()
self._fn = theano.function([v], _make_cdata(self)(v),
profile=False)
return self._fn
def make_value(self, ptr):
    """
    Make a value of this type.

    Parameters
    ----------
    ptr : int
        Integer representation of a valid pointer value

    Notes
    -----
    No validation is performed: ``ptr`` is trusted to be a valid
    pointer for this C type (see ``_get_func``).
    """
    return self._get_func()(ptr)
def c_declare(self, name, sub, check_input=True):
......
......@@ -26,7 +26,7 @@ except ImportError:
# This is for documentation not to depend on the availability of pygpu
from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
GpuArraySharedVariable, gpuarray_shared_constructor,
reg_context, get_context, ContextNotDefined)
reg_context, get_context, ContextNotDefined, _get_props)
from .basic_ops import as_gpuarray_variable
from . import fft, dnn, opt, nerv, extra_ops, multinomial
......@@ -89,19 +89,24 @@ def init_dev(dev, name=None):
(name, dev, context.devname),
file=sys.stderr)
pygpu_activated = True
ctx_props = _get_props(name)
ctx_props['dev'] = dev
if dev.startswith('cuda'):
try:
cudnn_version = dnn.version()
# 5200 should not print warning with cudnn 5.1 final.
if cudnn_version >= 5200:
warnings.warn("Your cuDNN version is more recent than Theano."
" If you see problems, try updating Theano or"
" downgrading cuDNN to version 5.1.")
if config.print_active_device:
print("Using cuDNN version %d on context %s" %
(cudnn_version, name), file=sys.stderr)
except Exception:
pass
if 'cudnn_version' not in ctx_props:
try:
ctx_props['cudnn_version'] = dnn.version()
# 5200 should not print warning with cudnn 5.1 final.
if ctx_props['cudnn_version'] >= 5200:
warnings.warn("Your cuDNN version is more recent than "
"Theano. If you encounter problems, try "
"updating Theano or downgrading cuDNN to "
"version 5.1.")
if config.print_active_device:
print("Using cuDNN version %d on context %s" %
(ctx_props['cudnn_version'], name), file=sys.stderr)
ctx_props['cudnn_handle'] = dnn._make_handle(context)
except Exception:
pass
# This maps things like 'cuda0' to the context object on that device.
init_dev.devmap = {}
......
Diff collapsed.
......@@ -149,41 +149,3 @@ static int c_make_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t *desc) {
#section init_code

setup_ext_cuda();

#section support_code_struct

/* Per-apply cuDNN state: the GPU context (a reference is kept so it is
   still alive in the destructor) and a cuDNN handle bound to that
   context's stream. */
PyGpuContextObject *ctx;
cudnnHandle_t APPLY_SPECIFIC(_handle);

#section init_code_struct

{
  // We need to keep a reference here to have it available in the destructor.
  ctx = PARAMS;
  Py_INCREF(ctx);

  cuda_enter(PARAMS->ctx);

  cudnnStatus_t err;
  /* Start from NULL so cleanup is safe even if cudnnCreate fails. */
  APPLY_SPECIFIC(_handle) = NULL;

  if ((err = cudnnCreate(&APPLY_SPECIFIC(_handle))) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "could not create cuDNN handle: %s",
                 cudnnGetErrorString(err));
    cuda_exit(PARAMS->ctx);
    FAIL;
  }

  /* Bind the handle to the context's stream so cuDNN work is ordered
     with the rest of the GPU operations on this context. */
  if ((err = cudnnSetStream(APPLY_SPECIFIC(_handle),
                            cuda_get_stream(PARAMS->ctx))) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "Could not set cudnn stream: %s",
                 cudnnGetErrorString(err));
    cuda_exit(PARAMS->ctx);
    FAIL;
  }
  cuda_exit(PARAMS->ctx);
}

#section cleanup_code_struct

/* Destroy the handle and drop the context reference taken in init. */
cuda_enter(ctx->ctx);
cudnnDestroy(APPLY_SPECIFIC(_handle));
cuda_exit(ctx->ctx);
Py_DECREF((PyObject *)ctx);
......@@ -3,7 +3,9 @@
int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
PyGpuArrayObject *bias, npy_float64 epsilon,
PyGpuArrayObject **outp, PyGpuArrayObject **x_mean,
PyGpuArrayObject **x_invstd, PyGpuContextObject *c) {
PyGpuArrayObject **x_invstd, cudnnHandle_t _handle) {
PyGpuContextObject *c = inp->context;
if (c_set_tensorNd(inp, bn_input) != 0)
return 1;
if (c_set_tensorNd(scale, bn_params) != 0)
......@@ -37,7 +39,7 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
beta = (void *)&fbeta;
}
cudnnStatus_t err = cudnnBatchNormalizationForwardTraining(
APPLY_SPECIFIC(_handle),
_handle,
MODE,
alpha,
beta,
......
......@@ -24,7 +24,9 @@ int dnn_batchnorm_grad(PyGpuArrayObject *inp, PyGpuArrayObject *doutp,
PyGpuArrayObject *scale, PyGpuArrayObject *x_mean,
PyGpuArrayObject *x_invstd, npy_float64 epsilon,
PyGpuArrayObject **dinp, PyGpuArrayObject **dscale,
PyGpuArrayObject **dbias, PyGpuContextObject *c) {
PyGpuArrayObject **dbias, cudnnHandle_t _handle) {
PyGpuContextObject *c = inp->context;
if (c_set_tensorNd(inp, bn_input) != 0)
return 1;
if (c_set_tensorNd(doutp, bn_doutput) != 0)
......@@ -66,7 +68,7 @@ int dnn_batchnorm_grad(PyGpuArrayObject *inp, PyGpuArrayObject *doutp,
betaParam = (void *)&fbeta;
}
cudnnStatus_t err = cudnnBatchNormalizationBackward(
APPLY_SPECIFIC(_handle),
_handle,
MODE,
alphaData,
betaData,
......
......@@ -3,7 +3,9 @@
int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
PyGpuArrayObject *bias, PyGpuArrayObject *est_mean,
PyGpuArrayObject *est_var, npy_float64 epsilon,
PyGpuArrayObject **outp, PyGpuContextObject *c) {
PyGpuArrayObject **outp, cudnnHandle_t _handle) {
PyGpuContextObject *c = inp->context;
if (c_set_tensorNd(inp, bn_input) != 0)
return 1;
if (c_set_tensorNd(scale, bn_params) != 0)
......@@ -33,7 +35,7 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
beta = (void *)&fbeta;
}
cudnnStatus_t err = cudnnBatchNormalizationForwardInference(
APPLY_SPECIFIC(_handle),
_handle,
MODE,
alpha,
beta,
......
#section support_code

/* Create a cuDNN dropout descriptor plus its device-side state buffer.
 *
 * dropout : probability of zeroing an element
 * seed    : RNG seed for the dropout states
 * c       : GPU context used for allocation and enter/exit
 * odesc   : out, the new dropout descriptor (owned by the caller)
 * ostates : out, new GA_UBYTE array holding the RNG states
 * _handle : cuDNN handle used to size and seed the states
 *
 * Returns 0 on success, -1 (with a Python error set) on failure.
 */
int dnn_dropout_desc(float dropout, unsigned long long seed,
                     PyGpuContextObject *c,
                     cudnnDropoutDescriptor_t *odesc,
                     PyGpuArrayObject **ostates,
                     cudnnHandle_t _handle) {
  PyGpuArrayObject *states;
  cudnnDropoutDescriptor_t desc;
  size_t states_sz;
  cudnnStatus_t err;

  cuda_enter(c->ctx);
  err = cudnnCreateDropoutDescriptor(&desc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_SetString(PyExc_RuntimeError, "Can't create dropout descriptor");
    cuda_exit(c->ctx);
    return -1;
  }

  /* Can't fail according to docs */
  cudnnDropoutGetStatesSize(_handle, &states_sz);

  /* The states live in a 1-D byte array so they are garbage-collected
     with normal refcounting. */
  states = pygpu_empty(1, &states_sz, GA_UBYTE, GA_C_ORDER, c, Py_None);
  if (states == NULL) {
    cudnnDestroyDropoutDescriptor(desc);
    cuda_exit(c->ctx);
    return -1;
  }

  err = cudnnSetDropoutDescriptor(desc, _handle, dropout,
                                  PyGpuArray_DEV_DATA(states),
                                  states_sz, seed);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_SetString(PyExc_RuntimeError, "Can't set dropout descriptor");
    Py_DECREF((PyObject *)states);
    cudnnDestroyDropoutDescriptor(desc);
    cuda_exit(c->ctx);
    return -1;
  }
  cuda_exit(c->ctx);
  /* Only publish the outputs once everything succeeded. */
  *odesc = desc;
  *ostates = states;
  return 0;
}
#section support_code

/* Run cuDNN dropout forward on `x`, producing `y`.
 *
 * x       : input array
 * desc    : pointer to a configured dropout descriptor
 * state   : RNG state array (returned again through *ostate, increfed)
 * y       : out, dropout output (same shape/type as x)
 * ostate  : out, alias of `state`
 * reserve : out, newly allocated reserve space needed by the backward pass
 * _handle : cuDNN handle
 *
 * Returns 0 on success, -1 (with a Python error set) on failure.
 */
int dnn_dropout_fwd(PyGpuArrayObject *x,
                    cudnnDropoutDescriptor_t *desc,
                    PyGpuArrayObject *state,
                    PyGpuArrayObject **y,
                    PyGpuArrayObject **ostate,
                    gpudata **reserve,
                    cudnnHandle_t _handle) {
  /* Fixed: was `PyGpuArrayContext`, an undeclared type; every other
     helper in this file uses PyGpuContextObject. */
  PyGpuContextObject *c = x->context;
  cudnnTensorDescriptor_t xdesc;
  cudnnTensorDescriptor_t ydesc;
  gpudata *res;
  size_t res_sz;
  cudnnStatus_t err;

  if (c_make_tensorNd(x, &xdesc))
    return -1;

  if (theano_prep_output(y, x->ga.nd, x->ga.dimensions, x->ga.typecode,
                         GA_C_ORDER, c)) {
    cudnnDestroyTensorDescriptor(xdesc);
    return -1;
  }

  /* Fixed: the descriptor must be built from the array *y points to,
     not from the output pointer itself. */
  if (c_make_tensorNd(*y, &ydesc)) {
    cudnnDestroyTensorDescriptor(xdesc);
    return -1;
  }

  *ostate = state;
  Py_INCREF((PyObject *)state);

  /* This can't fail according to the docs.  Fixed: the API takes the
     input tensor descriptor, not the dropout descriptor. */
  err = cudnnDropoutGetReserveSpaceSize(xdesc, &res_sz);

  /* Fixed: `res_zs` was an undeclared-identifier typo for `res_sz`,
     and the failure path fell through and used the NULL buffer. */
  res = gpudata_alloc(c->ctx, res_sz, NULL, 0, NULL);
  if (res == NULL) {
    cudnnDestroyTensorDescriptor(xdesc);
    cudnnDestroyTensorDescriptor(ydesc);
    Py_DECREF((PyObject *)state);  /* undo the *ostate reference */
    PyErr_SetString(PyExc_RuntimeError, "Could not allocate reserve for dropout");
    return -1;
  }
  *reserve = res;

  cuda_enter(c->ctx);
  /* Fixed: pass the descriptor value (*desc) and the output array (*y),
     not the pointers to them. */
  err = cudnnDropoutForward(_handle, *desc, xdesc, PyGpuArray_DEV_DATA(x),
                            ydesc, PyGpuArray_DEV_DATA(*y), *(void **)res,
                            res_sz);
  cudnnDestroyTensorDescriptor(xdesc);
  cudnnDestroyTensorDescriptor(ydesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not run dropout: %s",
                 cudnnGetErrorString(err));
    cuda_exit(c->ctx);
    return -1;
  }
  cuda_exit(c->ctx);
  return 0;
}
......@@ -26,11 +26,12 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cudnnConvolutionDescriptor_t desc,
double alpha, double beta,
PyGpuArrayObject **output,
PyGpuContextObject *c) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
float af = alpha, bf = beta;
cudnnHandle_t _handle) {
PyGpuContextObject *c = input->context;
void *alpha_p;
void *beta_p;
float af = alpha, bf = beta;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
PyErr_SetString(PyExc_ValueError,
......@@ -92,7 +93,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
int count;
cudnnConvolutionFwdAlgoPerf_t choice;
err = cudnnFindConvolutionForwardAlgorithm(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
desc, APPLY_SPECIFIC(output), 1, &count, &choice);
if (err != CUDNN_STATUS_SUCCESS) {
......@@ -115,7 +116,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
}
err = cudnnGetConvolutionForwardAlgorithm(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
desc, APPLY_SPECIFIC(output),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
......@@ -198,7 +199,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
{
size_t worksize;
gpudata *workspace;
err = cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle),
err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns),
desc,
......@@ -211,7 +212,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
// TODO: Print a warning
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
err = cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle),
err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns),
desc,
......@@ -248,7 +249,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnConvolutionForward(
APPLY_SPECIFIC(_handle),
_handle,
alpha_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
......
......@@ -25,11 +25,12 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
PyGpuArrayObject *im,
cudnnConvolutionDescriptor_t desc,
double alpha, double beta, PyGpuArrayObject **input,
PyGpuContextObject *c) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
float af = alpha, bf = beta;
cudnnHandle_t _handle) {
PyGpuContextObject *c = kerns->context;
void *alpha_p;
void *beta_p;
float af = alpha, bf = beta;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1]) {
PyErr_SetString(PyExc_ValueError, "images and kernel must have the same "
......@@ -93,7 +94,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cudnnConvolutionBwdDataAlgoPerf_t choice;
err = cudnnFindConvolutionBackwardDataAlgorithm(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
_handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(input), 1, &count, &choice);
if (err != CUDNN_STATUS_SUCCESS) {
......@@ -116,7 +117,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
}
err = cudnnGetConvolutionBackwardDataAlgorithm(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
_handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(input),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
......@@ -193,7 +194,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
gpudata *workspace;
err = cudnnGetConvolutionBackwardDataWorkspaceSize(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
_handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(input), algo, &worksize);
if (err != CUDNN_STATUS_SUCCESS) {
......@@ -218,7 +219,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cuda_wait((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnConvolutionBackwardData(
APPLY_SPECIFIC(_handle),
_handle,
alpha_p,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
......
......@@ -25,11 +25,12 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
PyGpuArrayObject *km,
cudnnConvolutionDescriptor_t desc,
double alpha, double beta, PyGpuArrayObject **kerns,
PyGpuContextObject *c) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
float af = alpha, bf = beta;
cudnnHandle_t _handle) {
PyGpuContextObject *c = input->context;
void *alpha_p;
void *beta_p;
float af = alpha, bf = beta;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1]) {
PyErr_SetString(PyExc_ValueError,
......@@ -93,7 +94,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cudnnConvolutionBwdFilterAlgoPerf_t choice;
err = cudnnFindConvolutionBackwardFilterAlgorithm(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(kerns), 1, &count, &choice);
if (err != CUDNN_STATUS_SUCCESS) {
......@@ -117,7 +118,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
}
err = cudnnGetConvolutionBackwardFilterAlgorithm(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(kerns),
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
......@@ -181,7 +182,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
gpudata *workspace;
err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(kerns), algo, &worksize);
if (err != CUDNN_STATUS_SUCCESS) {
......@@ -205,7 +206,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnConvolutionBackwardFilter(
APPLY_SPECIFIC(_handle),
_handle,
alpha_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
......
......@@ -42,9 +42,10 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
PyArrayObject *stride,
PyArrayObject *pad,
PyGpuArrayObject **out,
PyGpuContextObject *c) {
cudnnStatus_t err;
cudnnHandle_t _handle) {
PyGpuContextObject *c = img->context;
size_t dims[5];
cudnnStatus_t err;
if (!GpuArray_IS_C_CONTIGUOUS(&img->ga)) {
PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
......@@ -122,7 +123,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
cuda_wait((*out)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnPoolingForward(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(pool),
_handle, APPLY_SPECIFIC(pool),
alpha,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(img),
beta,
......
......@@ -64,7 +64,8 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
PyArrayObject *stride,
PyArrayObject *pad,
PyGpuArrayObject **inp_grad,
PyGpuContextObject *c) {
cudnnHandle_t _handle) {
PyGpuContextObject *c = inp->context;
cudnnStatus_t err;
if (!GpuArray_IS_C_CONTIGUOUS(&inp->ga)) {
......@@ -153,7 +154,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
cuda_wait((*inp_grad)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnPoolingBackward(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(pool),
_handle, APPLY_SPECIFIC(pool),
alpha,
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(out),
APPLY_SPECIFIC(output_grad), PyGpuArray_DEV_DATA(out_grad),
......
#section support_code

/* Create and configure a cuDNN RNN descriptor.
 *
 * hidden_size    : units per layer
 * num_layers     : number of stacked layers
 * ddesc          : dropout descriptor applied between layers
 * input_mode     : cast to cudnnRNNInputMode_t
 * direction_mode : cast to cudnnDirectionMode_t
 * rnn_mode       : cast to cudnnRNNMode_t (relu/tanh/lstm/gru)
 * dtype          : gpuarray typecode (GA_FLOAT/GA_DOUBLE/GA_HALF)
 * odesc          : out, the configured descriptor (caller owns it)
 * _handle        : unused here, kept for interface uniformity with the
 *                  other dnn_* helpers in this file
 *
 * Returns 0 on success, -1 (with a Python error set) on failure.
 */
int dnn_rnn_desc(int hidden_size, int num_layers,
                 cudnnDropoutDescriptor_t ddesc,
                 int input_mode, int direction_mode, int rnn_mode,
                 int dtype, cudnnRNNDescriptor_t *odesc,
                 cudnnHandle_t _handle) {
  cudnnRNNDescriptor_t desc;
  cudnnDataType_t data_type;
  cudnnStatus_t err;

  /* Map the gpuarray typecode to the cuDNN enum. */
  switch (dtype) {
  case GA_FLOAT:
    data_type = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    data_type = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    data_type = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_ValueError, "Unsupported data type");
    return -1;
  }

  err = cudnnCreateRNNDescriptor(&desc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_SetString(PyExc_RuntimeError, "Can't create RNN descriptor");
    return -1;
  }

  err = cudnnSetRNNDescriptor(desc, hidden_size, num_layers, ddesc,
                              (cudnnRNNInputMode_t)input_mode,
                              (cudnnDirectionMode_t)direction_mode,
                              (cudnnRNNMode_t)rnn_mode, data_type);
  if (err != CUDNN_STATUS_SUCCESS) {
    cudnnDestroyRNNDescriptor(desc);
    PyErr_SetString(PyExc_RuntimeError, "Can't set RNN descriptor");
    return -1;
  }
  /* Only publish the descriptor on full success. */
  *odesc = desc;
  return 0;
}
#section support_code

/* Run cuDNN RNN forward training.
 *
 * desc    : configured RNN descriptor
 * w       : flattened weights
 * x       : input, shape (seqLength, miniBatch, inputSize)
 * hx      : initial hidden state, last dim = hiddenSize * numDirections
 * cx      : initial cell state (may be NULL for non-LSTM modes)
 * reserve : out, training reserve space (needed by the backward passes)
 * y       : out, output sequence (seqLength, miniBatch, hiddenSizeDir)
 * hy      : out, final hidden state (same shape as hx)
 * cy      : out, final cell state (may be NULL, used only with cx)
 * _handle : cuDNN handle
 *
 * Returns 0 on success, -1 (with a Python error set) on failure.
 * All temporary descriptors are released through the `fail` label.
 */
int dnn_rnn_fwd(cudnnRNNDescriptor_t desc,
                PyGpuArrayObject *w, PyGpuArrayObject *x,
                PyGpuArrayObject *hx, PyGpuArrayObject *cx,
                gpudata **reserve, PyGpuArrayObject **y,
                PyGpuArrayObject **hy, PyGpuArrayObject **cy,
                cudnnHandle_t _handle) {
  PyGpuContextObject *c = x->context;
  /* NULL-initialized so the fail label can destroy only what was made. */
  cudnnTensorDescriptor_t xdesc = NULL;
  cudnnTensorDescriptor_t hxdesc = NULL;
  cudnnTensorDescriptor_t cxdesc = NULL;
  cudnnTensorDescriptor_t ydesc = NULL;
  cudnnTensorDescriptor_t hydesc = NULL;
  cudnnTensorDescriptor_t cydesc = NULL;
  cudnnFilterDescriptor_t wdesc = NULL;
  /* Per-timestep descriptor arrays; every entry aliases xdesc/ydesc. */
  cudnnTensorDescriptor_t *xl = NULL;
  cudnnTensorDescriptor_t *yl = NULL;
  gpudata *workspace = NULL;
  size_t worksize, ressize;
  size_t seqLength = PyGpuArray_DIM(x, 0);
  size_t miniBatch = PyGpuArray_DIM(x, 1);
  size_t inputSize = PyGpuArray_DIM(x, 2);
  size_t hiddenSizeDir = PyGpuArray_DIM(hx, 2);
  size_t shape[3];
  int strs[3], dims[3];
  cudnnStatus_t err;
  cudnnDataType_t dt;
  int res = -1;

  /* Map the gpuarray typecode to the cuDNN enum. */
  switch (x->ga.typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Unsupported data type for x");
    return -1;
  }

  // This is early to match the exit() in the fail label.
  cuda_enter(c->ctx);

  err = cudnnCreateTensorDescriptor(&xdesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create xdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  /* Per-timestep x descriptor: 3-D (batch, input, 1), C-contiguous. */
  dims[0] = PyGpuArray_DIM(x, 1);
  dims[1] = PyGpuArray_DIM(x, 2);
  dims[2] = 1;

  strs[0] = dims[1] * dims[2];
  strs[1] = dims[2];
  strs[2] = 1;

  err = cudnnSetTensorNdDescriptor(xdesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set xdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  if (c_make_tensorNd(hx, &hxdesc) != 0)
    goto fail;

  if (cx != NULL)
    if (c_make_tensorNd(cx, &cxdesc) != 0)
      goto fail;

  if (c_make_filter(w, &wdesc) != 0)
    goto fail;

  /* Output sequence has the same seq/batch dims, hiddenSizeDir features. */
  shape[0] = seqLength;
  shape[1] = miniBatch;
  shape[2] = hiddenSizeDir;
  if (theano_prep_output(y, 3, shape, x->ga.typecode, GA_C_ORDER, c) != 0)
    goto fail;

  err = cudnnCreateTensorDescriptor(&ydesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  /* Per-timestep y descriptor: 3-D (batch, hiddenSizeDir, 1). */
  dims[0] = shape[1];
  dims[1] = shape[2];
  dims[2] = 1;

  strs[0] = dims[2] * dims[1];
  strs[1] = dims[2];
  strs[2] = 1;

  err = cudnnSetTensorNdDescriptor(ydesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  if (theano_prep_output(hy, 3, PyGpuArray_DIMS(hx),
                         hx->ga.typecode, GA_C_ORDER, c) != 0)
    goto fail;

  if (c_make_tensorNd(*hy, &hydesc) != 0)
    goto fail;

  if (cy != NULL) {
    /* cy mirrors cx; only produced when a cell state was given. */
    if (theano_prep_output(cy, 3, PyGpuArray_DIMS(cx),
                           cx->ga.typecode, GA_C_ORDER, c) != 0)
      goto fail;

    if (c_make_tensorNd(*cy, &cydesc) != 0)
      goto fail;
  }

  /* The cuDNN sequence API wants one descriptor per timestep; all
     timesteps share the same layout, so alias a single descriptor. */
  xl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), seqLength);
  if (xl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }

  for (size_t i = 0; i < seqLength; i++)
    xl[i] = xdesc;

  yl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), seqLength);
  if (yl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }

  for (size_t i = 0; i < seqLength; i++)
    yl[i] = ydesc;

  err = cudnnGetRNNWorkspaceSize(_handle, desc, (int)seqLength,
                                 xl, &worksize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not get worksize: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
  if (workspace == NULL) {
    PyErr_Format(PyExc_RuntimeError, "Could not allocate workspace");
    goto fail;
  }

  err = cudnnGetRNNTrainingReserveSize(_handle, desc, (int)seqLength,
                                       xl, &ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not get reserve size: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  /* The reserve outlives this call: the backward passes reuse it. */
  *reserve = gpudata_alloc(c->ctx, ressize, NULL, 0, NULL);
  if (*reserve == NULL) {
    PyErr_Format(PyExc_RuntimeError, "Could not allocate reserve");
    goto fail;
  }

  err = cudnnRNNForwardTraining(_handle, desc, (int)seqLength,
                                xl, PyGpuArray_DEV_DATA(x),
                                hxdesc, PyGpuArray_DEV_DATA(hx),
                                cxdesc, cx ? PyGpuArray_DEV_DATA(cx) : NULL,
                                wdesc, PyGpuArray_DEV_DATA(w),
                                yl, PyGpuArray_DEV_DATA(*y),
                                hydesc, PyGpuArray_DEV_DATA(*hy),
                                cydesc, cy ? PyGpuArray_DEV_DATA(*cy) : NULL,
                                *(void **)workspace, worksize,
                                *(void **)(*reserve), ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could run RNN: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }
  res = 0;
 fail:
  /* Shared cleanup; each descriptor is destroyed exactly once (xl/yl
     only alias xdesc/ydesc, so freeing the arrays is enough). */
  if (xdesc != NULL)
    cudnnDestroyTensorDescriptor(xdesc);
  if (hxdesc != NULL)
    cudnnDestroyTensorDescriptor(hxdesc);
  if (cxdesc != NULL)
    cudnnDestroyTensorDescriptor(cxdesc);
  if (wdesc != NULL)
    cudnnDestroyFilterDescriptor(wdesc);
  if (ydesc != NULL)
    cudnnDestroyTensorDescriptor(ydesc);
  if (hydesc != NULL)
    cudnnDestroyTensorDescriptor(hydesc);
  if (cydesc != NULL)
    cudnnDestroyTensorDescriptor(cydesc);
  free(xl);
  free(yl);
  if (workspace != NULL)
    gpudata_release(workspace);
  cuda_exit(c->ctx);
  return res;
}
#section support_code

/* Run cuDNN RNN backward-data (gradients w.r.t. inputs and states).
 *
 * desc     : configured RNN descriptor
 * xshp     : input feature size (last dim of the forward x)
 * y        : forward output sequence
 * dy       : gradient of y (same shape as y)
 * w        : flattened weights
 * hx       : initial hidden state from the forward pass
 * reserve  : reserve space from the forward pass (copied, not consumed)
 * cx       : initial cell state (may be NULL)
 * dhy      : gradient of final hidden state (may be NULL)
 * dcy      : gradient of final cell state (may be NULL)
 * oreserve : out, copy of the reserve updated by this pass
 * dx       : out, gradient w.r.t. x
 * dhx      : out, gradient w.r.t. hx
 * dcx      : out, gradient w.r.t. cx (only when cx is given)
 * _handle  : cuDNN handle
 *
 * Returns 0 on success, -1 (with a Python error set) on failure.
 * All temporary descriptors are released through the `fail` label.
 */
int dnn_rnn_gi(cudnnRNNDescriptor_t desc, npy_uint64 xshp,
               PyGpuArrayObject *y, PyGpuArrayObject *dy,
               PyGpuArrayObject *w, PyGpuArrayObject *hx,
               gpudata *reserve, PyGpuArrayObject *cx,
               PyGpuArrayObject *dhy, PyGpuArrayObject *dcy,
               gpudata **oreserve, PyGpuArrayObject **dx,
               PyGpuArrayObject **dhx, PyGpuArrayObject **dcx,
               cudnnHandle_t _handle) {
  PyGpuContextObject *c = y->context;
  /* NULL-initialized so the fail label can destroy only what was made. */
  cudnnTensorDescriptor_t ydesc = NULL;
  cudnnTensorDescriptor_t dhydesc = NULL;
  cudnnTensorDescriptor_t dcydesc = NULL;
  cudnnFilterDescriptor_t wdesc = NULL;
  cudnnTensorDescriptor_t hxdesc = NULL;
  cudnnTensorDescriptor_t cxdesc = NULL;
  cudnnTensorDescriptor_t dxdesc = NULL;
  cudnnTensorDescriptor_t dhxdesc = NULL;
  cudnnTensorDescriptor_t dcxdesc = NULL;
  /* Per-timestep descriptor arrays; entries alias ydesc/dxdesc. */
  cudnnTensorDescriptor_t *yl = NULL;
  cudnnTensorDescriptor_t *dxl = NULL;
  gpudata *workspace = NULL;
  size_t worksize, ressize;
  size_t seqLength = PyGpuArray_DIM(y, 0);
  size_t miniBatch = PyGpuArray_DIM(y, 1);
  size_t inputSize = xshp;
  size_t shape[3];
  int dims[3], strs[3];
  cudnnStatus_t err;
  cudnnDataType_t dt;
  int res = -1;

  /* Map the gpuarray typecode to the cuDNN enum. */
  switch (y->ga.typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Unsupported data type for y");
    return -1;
  }

  cuda_enter(c->ctx);

  err = cudnnCreateTensorDescriptor(&ydesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  /* We need to use the last two dimensions for this, this is not a typo */
  dims[0] = PyGpuArray_DIM(y, 1);
  dims[1] = PyGpuArray_DIM(y, 2);
  dims[2] = 1;

  strs[0] = dims[2] * dims[1];
  strs[1] = dims[2];
  strs[2] = 1;

  err = cudnnSetTensorNdDescriptor(ydesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  if (dhy != NULL)
    if (c_make_tensorNd(dhy, &dhydesc) != 0)
      goto fail;

  if (dcy != NULL)
    if (c_make_tensorNd(dcy, &dcydesc) != 0)
      goto fail;

  if (c_make_filter(w, &wdesc) != 0)
    goto fail;

  if (c_make_tensorNd(hx, &hxdesc) != 0)
    goto fail;

  if (cx != NULL)
    if (c_make_tensorNd(cx, &cxdesc) != 0)
      goto fail;

  /* dx has the forward input's shape, rebuilt from y dims and xshp. */
  shape[0] = seqLength;
  shape[1] = miniBatch;
  shape[2] = inputSize;
  if (theano_prep_output(dx, 3, shape, y->ga.typecode, GA_C_ORDER, c) != 0)
    goto fail;

  err = cudnnCreateTensorDescriptor(&dxdesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create dxdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  /* Again not a typo, we need to use the last two dimensions */
  dims[0] = shape[1];
  dims[1] = shape[2];
  dims[2] = 1;

  strs[0] = dims[2] * dims[1];
  strs[1] = dims[2];
  strs[2] = 1;

  err = cudnnSetTensorNdDescriptor(dxdesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set dxdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  if (theano_prep_output(dhx, 3, PyGpuArray_DIMS(hx), hx->ga.typecode,
                         GA_C_ORDER, c) != 0)
    goto fail;

  if (c_make_tensorNd(*dhx, &dhxdesc) != 0)
    goto fail;

  if (cx != NULL) {
    /* dcx is produced only when a cell state was provided. */
    if (theano_prep_output(dcx, 3, PyGpuArray_DIMS(cx), cx->ga.typecode,
                           GA_C_ORDER, c) != 0)
      goto fail;

    if (c_make_tensorNd(*dcx, &dcxdesc) != 0)
      goto fail;
  }

  /* One descriptor per timestep; all alias a single shared layout. */
  yl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), seqLength);
  if (yl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }

  for (size_t i = 0; i < seqLength; i++)
    yl[i] = ydesc;

  dxl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), seqLength);
  if (dxl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }

  for (size_t i = 0; i < seqLength; i++)
    dxl[i] = dxdesc;

  err = cudnnGetRNNWorkspaceSize(_handle, desc, (int)seqLength, dxl, &worksize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not get worksize: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
  if (workspace == NULL) {
    PyErr_Format(PyExc_RuntimeError, "Could not allocate workspace");
    goto fail;
  }

  err = cudnnGetRNNTrainingReserveSize(_handle, desc, (int)seqLength,
                                       dxl, &ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not get reserve size: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  /* Work on a copy of the forward reserve so the original stays valid
     for other gradient computations. */
  *oreserve = gpudata_alloc(c->ctx, ressize, NULL, 0, NULL);
  if (*oreserve == NULL) {
    PyErr_Format(PyExc_RuntimeError, "Could not allocate reserve");
    goto fail;
  }

  if (gpudata_move(*oreserve, 0, reserve, 0, ressize) != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "could not copy reserve");
    goto fail;
  }

  err = cudnnRNNBackwardData(_handle, desc, (int)seqLength,
                             yl, PyGpuArray_DEV_DATA(y),
                             /* y and dy are the same shape */
                             yl, PyGpuArray_DEV_DATA(dy),
                             dhydesc, dhy ? PyGpuArray_DEV_DATA(dhy) : NULL,
                             dcydesc, dcy ? PyGpuArray_DEV_DATA(dcy) : NULL,
                             wdesc, PyGpuArray_DEV_DATA(w),
                             hxdesc, PyGpuArray_DEV_DATA(hx),
                             cxdesc, cx ? PyGpuArray_DEV_DATA(cx) : NULL,
                             dxl, PyGpuArray_DEV_DATA(*dx),
                             dhxdesc, PyGpuArray_DEV_DATA(*dhx),
                             dcxdesc, dcx ? PyGpuArray_DEV_DATA(*dcx) : NULL,
                             *(void **)workspace, worksize,
                             *(void **)(*oreserve), ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could run RNN grad inputs: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }
  res = 0;
 fail:
  /* Shared cleanup; yl/dxl only alias ydesc/dxdesc, so freeing the
     arrays (without destroying each entry) is correct. */
  if (ydesc != NULL)
    cudnnDestroyTensorDescriptor(ydesc);
  if (dhydesc != NULL)
    cudnnDestroyTensorDescriptor(dhydesc);
  if (dcydesc != NULL)
    cudnnDestroyTensorDescriptor(dcydesc);
  if (wdesc != NULL)
    cudnnDestroyFilterDescriptor(wdesc);
  if (hxdesc != NULL)
    cudnnDestroyTensorDescriptor(hxdesc);
  if (cxdesc != NULL)
    cudnnDestroyTensorDescriptor(cxdesc);
  if (dxdesc != NULL)
    cudnnDestroyTensorDescriptor(dxdesc);
  if (dhxdesc != NULL)
    cudnnDestroyTensorDescriptor(dhxdesc);
  if (dcxdesc != NULL)
    cudnnDestroyTensorDescriptor(dcxdesc);
  free(yl);
  free(dxl);
  if (workspace != NULL)
    gpudata_release(workspace);
  cuda_exit(c->ctx);
  return res;
}
#section support_code
/* Compute the gradient of the RNN weights via cudnnRNNBackwardWeights.
 *
 * desc     : configured cuDNN RNN descriptor
 * _wsize   : number of elements of the flattened weight buffer
 * x        : input sequence; dim 0 is the sequence length (iters), the
 *            per-timestep tensor descriptor is built from dims 1 and 2
 * hx       : initial hidden state
 * y        : forward-pass output; same per-timestep layout trick as x
 * reserve  : training reserve space produced by the forward pass
 * dw       : output; zero-initialised flattened weight-gradient buffer
 * _handle  : cuDNN handle
 *
 * Returns 0 on success, -1 on failure with a Python exception set.
 * All cleanup funnels through the `fail` label, so resources must be
 * initialised to NULL before the first `goto fail` can occur.
 */
int dnn_rnn_gw(cudnnRNNDescriptor_t desc, npy_uint64 _wsize,
               PyGpuArrayObject *x, PyGpuArrayObject *hx,
               PyGpuArrayObject *y, gpudata *reserve,
               PyGpuArrayObject **dw, cudnnHandle_t _handle) {
  PyGpuContextObject *c = x->context;
  cudnnTensorDescriptor_t xdesc = NULL;
  cudnnTensorDescriptor_t hxdesc = NULL;
  cudnnTensorDescriptor_t ydesc = NULL;
  cudnnFilterDescriptor_t dwdesc = NULL;
  /* Arrays of per-timestep descriptors; every entry aliases the single
     xdesc/ydesc, so only the single descriptor is ever destroyed. */
  cudnnTensorDescriptor_t *xl = NULL;
  cudnnTensorDescriptor_t *yl = NULL;
  gpudata *workspace = NULL;
  size_t worksize, ressize;
  size_t iters = PyGpuArray_DIM(x, 0);  /* sequence length */
  size_t wsize = _wsize;
  int dims[3], strs[3];
  cudnnStatus_t err;
  cudnnDataType_t dt;
  int res = -1;

  switch (x->ga.typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Unsupported data type for x");
    return -1;
  }

  // This is early to match the exit() in the fail label.
  cuda_enter(c->ctx);

  err = cudnnCreateTensorDescriptor(&xdesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create xdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }
  /* We need to use the last two dimensions for this, this is not a typo */
  /* cuDNN expects one 3D (batch, feature, 1) descriptor per timestep;
     the leading sequence dimension is conveyed by `iters` instead. */
  dims[0] = PyGpuArray_DIM(x, 1);
  dims[1] = PyGpuArray_DIM(x, 2);
  dims[2] = 1;
  strs[0] = dims[2] * dims[1];  /* fully-packed C-order strides */
  strs[1] = dims[2];
  strs[2] = 1;
  err = cudnnSetTensorNdDescriptor(xdesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set xdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  if (c_make_tensorNd(hx, &hxdesc) != 0)
    goto fail;

  err = cudnnCreateTensorDescriptor(&ydesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }
  /* Again not a typo, we need to use the last two dimensions */
  dims[0] = PyGpuArray_DIM(y, 1);
  dims[1] = PyGpuArray_DIM(y, 2);
  dims[2] = 1;
  strs[0] = dims[2] * dims[1];
  strs[1] = dims[2];
  strs[2] = 1;
  err = cudnnSetTensorNdDescriptor(ydesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  /* Allocate the flat weight-gradient output and zero it, since
     cudnnRNNBackwardWeights accumulates into dw. */
  if (theano_prep_output(dw, 1, &wsize, x->ga.typecode, GA_C_ORDER, c) != 0)
    goto fail;
  /* NOTE(review): return value of GpuArray_memset is ignored; a failed
     memset would leave dw uninitialised. Consider checking it. */
  GpuArray_memset(&(*dw)->ga, 0);

  if (c_make_filter(*dw, &dwdesc) != 0)
    goto fail;

  /* NOTE(review): calloc arguments are (size, nmemb) instead of the
     conventional (nmemb, size); the product is the same so this is
     harmless, but swapped from the documented order. */
  xl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), iters);
  if (xl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }
  /* Every timestep shares the same descriptor (fixed batch/feature size). */
  for (size_t i = 0; i < iters; i++)
    xl[i] = xdesc;

  yl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), iters);
  if (yl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }
  for (size_t i = 0; i < iters; i++)
    yl[i] = ydesc;

  err = cudnnGetRNNWorkspaceSize(_handle, desc, (int)iters,
                                 xl, &worksize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not get worksize: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }
  workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
  if (workspace == NULL) {
    PyErr_Format(PyExc_RuntimeError, "Could not allocate workspace");
    goto fail;
  }

  /* ressize is queried (not allocated) here; `reserve` was allocated by
     the forward pass and must be at least this large. */
  err = cudnnGetRNNTrainingReserveSize(_handle, desc, (int)iters,
                                       xl, &ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not get reserve size: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  /* NOTE(review): `*(void **)workspace` reads the raw device pointer out
     of the opaque gpudata struct — presumably relying on the CUDA
     backend's layout; verify against libgpuarray internals. */
  err = cudnnRNNBackwardWeights(_handle, desc, (int)iters,
                                xl, PyGpuArray_DEV_DATA(x),
                                hxdesc, PyGpuArray_DEV_DATA(hx),
                                yl, PyGpuArray_DEV_DATA(y),
                                *(void **)workspace, worksize,
                                dwdesc, PyGpuArray_DEV_DATA(*dw),
                                *(void **)reserve, ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could run RNN grad weights: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }
  res = 0;
 fail:
  /* Shared cleanup path for both success and failure. */
  if (xdesc != NULL)
    cudnnDestroyTensorDescriptor(xdesc);
  if (hxdesc != NULL)
    cudnnDestroyTensorDescriptor(hxdesc);
  if (ydesc != NULL)
    cudnnDestroyTensorDescriptor(ydesc);
  if (dwdesc != NULL)
    cudnnDestroyFilterDescriptor(dwdesc);
  free(xl);
  free(yl);
  if (workspace != NULL)
    gpudata_release(workspace);
  cuda_exit(c->ctx);
  return res;
}
#section support_code
/* Query cuDNN for the total size (in bytes) of the RNN parameter buffer.
 *
 * desc        : configured cuDNN RNN descriptor
 * isize       : length-2 numpy array of npy_uint64 (batch, input_dim)
 * typecode    : gpuarray typecode of the data (float/double/half)
 * oparam_size : output; receives the parameter buffer size
 * _handle     : cuDNN handle
 *
 * Returns 0 on success, -1 on failure with a Python exception set.
 *
 * Fix vs previous version: the temporary tensor descriptor was leaked on
 * the two error paths after cudnnCreateTensorDescriptor succeeded; it is
 * now destroyed on every exit path.
 */
int dnn_rnn_paramsize(cudnnRNNDescriptor_t desc,
                      PyArrayObject *isize,
                      npy_int32 typecode,
                      npy_uint64 *oparam_size,
                      cudnnHandle_t _handle) {
  cudnnTensorDescriptor_t xdesc;
  size_t param_size;
  cudnnStatus_t err;
  cudnnDataType_t dt;
  int shape[3];
  int strides[3];

  if (PyArray_DIM(isize, 0) != 2) {
    PyErr_SetString(PyExc_ValueError, "input_size should be of length two");
    return -1;
  }

  switch (typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_ValueError, "Unsupported data type");
    return -1;
  }

  err = cudnnCreateTensorDescriptor(&xdesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_SetString(PyExc_RuntimeError, "Could not create tensor descriptor");
    return -1;
  }

  /* cuDNN wants one 3D (batch, feature, 1) descriptor; isize holds
     npy_uint64 values that are narrowed to int here. */
  shape[0] = *(npy_uint64 *)PyArray_GETPTR1(isize, 0);
  shape[1] = *(npy_uint64 *)PyArray_GETPTR1(isize, 1);
  shape[2] = 1;
  strides[0] = shape[2] * shape[1];  /* fully-packed C-order strides */
  strides[1] = shape[2];
  strides[2] = 1;
  err = cudnnSetTensorNdDescriptor(xdesc, dt, 3, shape, strides);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "Could not set tensor descriptor: %s",
                 cudnnGetErrorString(err));
    cudnnDestroyTensorDescriptor(xdesc);  /* was leaked before */
    return -1;
  }

  err = cudnnGetRNNParamsSize(_handle, desc, xdesc, &param_size, dt);
  /* The descriptor is no longer needed whatever the outcome. */
  cudnnDestroyTensorDescriptor(xdesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    /* Include the cuDNN error string, consistent with the other paths. */
    PyErr_Format(PyExc_RuntimeError, "Could not get parameter size: %s",
                 cudnnGetErrorString(err));
    return -1;
  }
  *oparam_size = param_size;
  return 0;
}
......@@ -35,7 +35,8 @@ if (APPLY_SPECIFIC(output) != NULL)
int APPLY_SPECIFIC(softmax)(PyGpuArrayObject *x,
PyGpuArrayObject **out,
PyGpuContextObject *c) {
cudnnHandle_t _handle) {
PyGpuContextObject *c = x->context;
cudnnStatus_t err;
if (c_set_tensorNd(x, APPLY_SPECIFIC(input)) != 0)
......@@ -77,7 +78,7 @@ int APPLY_SPECIFIC(softmax)(PyGpuArrayObject *x,
cuda_wait((*out)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnSoftmaxForward(
APPLY_SPECIFIC(_handle),
_handle,
SOFTMAX_ALGO,
SOFTMAX_MODE,
alpha,
......
......@@ -46,7 +46,8 @@ if (APPLY_SPECIFIC(dx) != NULL)
int APPLY_SPECIFIC(softmax_grad)(PyGpuArrayObject *dy,
PyGpuArrayObject *sm,
PyGpuArrayObject **dx,
PyGpuContextObject *c) {
cudnnHandle_t _handle) {
PyGpuContextObject *c = dy->context;
cudnnStatus_t err;
if (c_set_tensorNd(dy, APPLY_SPECIFIC(dy)) != 0)
......@@ -91,7 +92,7 @@ int APPLY_SPECIFIC(softmax_grad)(PyGpuArrayObject *dy,
cuda_wait((*dx)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnSoftmaxBackward(
APPLY_SPECIFIC(_handle),
_handle,
SOFTMAX_ALGO,
SOFTMAX_MODE,
alpha,
......
from __future__ import absolute_import, print_function, division
import theano
import theano.tensor as T
import numpy
class Model(object):
    """Container that aggregates layers, their parameters and extra updates."""

    def __init__(self, name=""):
        self.name = name
        self.layers = []
        self.params = []
        self.other_updates = {}

    def add_layer(self, layer):
        """Register ``layer``, collecting its params and optional updates."""
        self.layers.append(layer)
        self.params.extend(layer.params)
        # Layers may optionally expose (variable, update) pairs.
        if hasattr(layer, 'other_updates'):
            for pair in layer.other_updates:
                self.other_updates[pair[0]] = pair[1]

    def get_params(self):
        """Return the flat list of all registered parameters."""
        return self.params
def uniform(stdev, size):
    """Sample a uniform array with the given standard deviation and shape.

    The half-width is stdev * sqrt(3), which makes the standard deviation
    of the uniform distribution equal to ``stdev``.
    """
    bound = stdev * numpy.sqrt(3)
    sample = numpy.random.uniform(low=-bound, high=bound, size=size)
    return sample.astype(theano.config.floatX)
def linear_transform_weights(input_dim, output_dim,
                             param_list=None, name=""):
    """Create a Theano shared weight matrix of shape (input_dim, output_dim).

    The matrix is He-style initialised (stdev = sqrt(2 / input_dim)) and
    appended to ``param_list``, which must be provided.
    """
    init = uniform(numpy.sqrt(2.0 / input_dim), (input_dim, output_dim))
    W = theano.shared(init, name=name)
    assert(param_list is not None)
    param_list.append(W)
    return W
def bias_weights(length, param_list=None, name=""):
"theano shared variable for bias unit, given length"
bias_initialization = numpy.zeros(length).astype(theano.config.floatX)
bias = theano.shared(
bias_initialization,
name=name
)
if param_list is not None:
param_list.append(bias)
return bias
class Layer(object):
    """Generic layer template from which all layers should inherit."""

    def __init__(self, name=""):
        self.name = name
        self.params = []

    def get_params(self):
        """Return the parameters owned by this layer."""
        return self.params
class GRU(Layer):
    """Gated Recurrent Unit layer unrolled with theano.scan.

    Gate wiring (see ``step`` below): (W_i, R_i) drive the update gate,
    (W_r, R_r) the reset gate, and (W_h, R_h) the candidate state.
    Weight creation order matters: it fixes both the numpy RNG draw
    sequence and the ordering of ``self.params``.
    """

    def __init__(self, input_dim, output_dim, input_layer, s0=None, name=""):
        '''Layers information'''
        # NOTE(review): input_layer.output() is presumably a 3D
        # (time, batch, input_dim) expression scanned over time -- confirm
        # against callers.
        self.name = name
        self.input_dim = input_dim
        self.hidden_dim = output_dim
        self.output_dim = output_dim
        self.input_layer = input_layer
        self.X = input_layer.output()
        self.s0 = s0  # initial hidden state fed to scan's outputs_info
        self.params = []
        '''Layers weights'''
        '''self.params is passed so that any paramters could be appended to it'''
        # Input-to-hidden weights (W_*) with biases (b_w*).
        self.W_r = linear_transform_weights(input_dim, output_dim, param_list=self.params, name=name + ".W_r")
        self.b_wr = bias_weights((output_dim,), param_list=self.params, name=name + ".b_wr")
        self.W_i = linear_transform_weights(input_dim, output_dim, param_list=self.params, name=name + ".W_i")
        self.b_wi = bias_weights((output_dim,), param_list=self.params, name=name + ".b_wi")
        self.W_h = linear_transform_weights(input_dim, output_dim, param_list=self.params, name=name + ".W_h")
        self.b_wh = bias_weights((output_dim,), param_list=self.params, name=name + ".b_wh")
        # Hidden-to-hidden (recurrent) weights (R_*) with biases (b_r*).
        self.R_r = linear_transform_weights(output_dim, output_dim, param_list=self.params, name=name + ".R_r")
        self.b_rr = bias_weights((output_dim,), param_list=self.params, name=name + ".b_rr")
        self.R_i = linear_transform_weights(output_dim, output_dim, param_list=self.params, name=name + ".R_i")
        # NOTE(review): named "b_ru" but it is the recurrent bias of the
        # update gate (paired with R_i in step()).
        self.b_ru = bias_weights((output_dim,), param_list=self.params, name=name + ".b_ru")
        self.R_h = linear_transform_weights(output_dim, output_dim, param_list=self.params, name=name + ".R_h")
        self.b_rh = bias_weights((output_dim,), param_list=self.params, name=name + ".b_rh")

        '''step through processed input to create output'''
        def step(inp, s_prev):
            # Update gate: how much of the previous state to keep.
            i_t = T.nnet.sigmoid(
                T.dot(inp, self.W_i) + T.dot(s_prev, self.R_i) + self.b_wi + self.b_ru)
            # Reset gate: how much of the previous state feeds the candidate.
            r_t = T.nnet.sigmoid(
                T.dot(inp, self.W_r) + T.dot(s_prev, self.R_r) + self.b_wr + self.b_rr)
            # Candidate state, with the recurrent term gated by r_t.
            h_hat_t = T.tanh(
                T.dot(inp, self.W_h) + (r_t * (T.dot(s_prev, self.R_h) + self.b_rh)) + self.b_wh)
            # Convex blend of candidate and previous state.
            s_curr = ((1.0 - i_t) * h_hat_t) + (i_t * s_prev)
            return s_curr

        outputs_info = self.s0
        states, updates = theano.scan(
            fn=step,
            sequences=[self.X],
            outputs_info=outputs_info
        )
        # Full sequence of hidden states, one per timestep.
        self.Y = states

    def output(self):
        """Return the sequence of hidden states produced by the scan."""
        return self.Y
class LSTM(Layer):
    """Long Short-Term Memory layer unrolled with theano.scan.

    Standard LSTM with input (i), forget (f), output (o) gates and a cell
    candidate (c).  Weight creation order matters: it fixes both the numpy
    RNG draw sequence and the ordering of ``self.params``.
    """

    def __init__(self, input_dim, output_dim, input_layer, s0=None, c0=None,
                 name=""):
        '''Layers information'''
        # NOTE(review): input_layer.output() is presumably a 3D
        # (time, batch, input_dim) expression scanned over time -- confirm
        # against callers.
        self.name = name
        self.input_dim = input_dim
        self.hidden_dim = output_dim
        self.output_dim = output_dim
        self.input_layer = input_layer
        self.X = input_layer.output()
        self.s0 = s0  # initial hidden state for scan's outputs_info
        self.c0 = c0  # initial cell state for scan's outputs_info
        self.params = []
        '''Layers weights'''
        '''self.params is passed so that any paramters could be appended to it'''
        # Input-to-hidden weights (W_*) with biases (b_w*).
        self.W_i = linear_transform_weights(input_dim, output_dim, param_list=self.params, name=name + ".W_i")
        self.b_wi = bias_weights((output_dim,), param_list=self.params, name=name + ".b_wi")
        self.W_f = linear_transform_weights(input_dim, output_dim, param_list=self.params, name=name + ".W_f")
        self.b_wf = bias_weights((output_dim,), param_list=self.params, name=name + ".b_wf")
        self.W_c = linear_transform_weights(input_dim, output_dim, param_list=self.params, name=name + ".W_c")
        self.b_wc = bias_weights((output_dim,), param_list=self.params, name=name + ".b_wc")
        self.W_o = linear_transform_weights(input_dim, output_dim, param_list=self.params, name=name + ".W_o")
        self.b_wo = bias_weights((output_dim,), param_list=self.params, name=name + ".b_wo")
        # Hidden-to-hidden (recurrent) weights (R_*) with biases (b_r*).
        self.R_i = linear_transform_weights(output_dim, output_dim, param_list=self.params, name=name + ".R_i")
        self.b_ri = bias_weights((output_dim,), param_list=self.params, name=name + ".b_ri")
        self.R_f = linear_transform_weights(output_dim, output_dim, param_list=self.params, name=name + ".R_f")
        self.b_rf = bias_weights((output_dim,), param_list=self.params, name=name + ".b_rf")
        self.R_c = linear_transform_weights(output_dim, output_dim, param_list=self.params, name=name + ".R_c")
        self.b_rc = bias_weights((output_dim,), param_list=self.params, name=name + ".b_rc")
        self.R_o = linear_transform_weights(output_dim, output_dim, param_list=self.params, name=name + ".R_o")
        self.b_ro = bias_weights((output_dim,), param_list=self.params, name=name + ".b_ro")

        '''step through processed input to create output'''
        def step(x_t, h_tm1, c_tm1):
            # Input gate.
            i_t = T.nnet.sigmoid(
                T.dot(x_t, self.W_i) + T.dot(h_tm1, self.R_i) + self.b_wi + self.b_ri)
            # Forget gate.
            f_t = T.nnet.sigmoid(
                T.dot(x_t, self.W_f) + T.dot(h_tm1, self.R_f) + self.b_wf + self.b_rf)
            # Output gate (bias terms summed in a different order than the
            # other gates; addition commutes, so behaviour is the same).
            o_t = T.nnet.sigmoid(
                T.dot(x_t, self.W_o) + T.dot(h_tm1, self.R_o) + self.b_ro + self.b_wo)
            # Cell candidate.
            c_hat_t = T.tanh(
                T.dot(x_t, self.W_c) + T.dot(h_tm1, self.R_c) + self.b_wc + self.b_rc)
            # New cell state and gated hidden output.
            c_t = f_t * c_tm1 + i_t * c_hat_t
            h_t = o_t * T.tanh(c_t)
            return h_t, c_t

        outputs_info = [self.s0, self.c0]
        states, updates = theano.scan(
            fn=step,
            sequences=[self.X],
            outputs_info=outputs_info
        )
        # scan returns [hidden states, cell states] matching outputs_info.
        self.Y = states[0]
        self.C = states[1]

    def output(self):
        """Return the sequence of hidden states (not the cell states)."""
        return self.Y
class FC(Layer):
    """Fully-connected (affine) layer: output = X . W + b."""

    def __init__(self, input_dim, output_dim, input_layer, name=""):
        self.name = name
        self.params = []
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.input_layer = input_layer
        self.X = self.input_layer.output()
        # Weight matrix and bias are both registered in self.params.
        self.W = linear_transform_weights(
            input_dim, output_dim, param_list=self.params, name=name + ".W")
        self.b = bias_weights(
            (output_dim,), param_list=self.params, name=name + ".b")

    def output(self):
        """Return the affine transform of the wrapped input expression."""
        return T.dot(self.X, self.W) + self.b
class WrapperLayer(Layer):
    """Adapter exposing an arbitrary expression through the Layer API."""

    def __init__(self, X, name=""):
        self.name = name
        self.X = X
        self.params = []

    def output(self):
        """Return the wrapped expression unchanged."""
        return self.X
......@@ -68,6 +68,7 @@ def reg_context(name, ctx):
if not isinstance(ctx, gpuarray.GpuContext):
raise TypeError("context is not GpuContext")
_context_reg[name] = ctx
_props_map[ctx] = dict()
def get_context(name):
......@@ -96,6 +97,26 @@ def list_contexts():
"""
return _context_reg.keys()
# Mappings of properties to contexts. Please never use this if you
# can avoid it.
# This is basically a way to store "global" variables that depend on
# the context.
_props_map = {}
def _get_props(name):
ctx = get_context(name)
return _props_map[ctx]
def get_prop(name, k):
return _get_props(name)[k]
def set_prop(name, k, v):
_get_props(name)[k] = v
# Private method
def _name_for_ctx(ctx):
......
......@@ -1102,7 +1102,8 @@ def _populate_grad_dict(var_to_app_to_idx,
str(o_shape) + " on an output of shape " +
str(g_shape))
input_grads = node.op.grad(inputs, new_output_grads)
input_grads = node.op.L_op(inputs, node.outputs,
new_output_grads)
if input_grads is None:
raise TypeError("%s.grad returned NoneType, "
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论