Commit 3007bf79, authored by Pascal Lamblin, committed by GitHub

Merge pull request #4915 from abergeron/dnn_rnn2

Cudnn RNN bindings.
......@@ -14,11 +14,9 @@ import theano.tensor as T
def test_NanGuardMode():
"""
Tests if NanGuardMode is working by feeding in numpy.inf and numpy.nans
intentionally. A working implementation should be able to capture all
the abnormalties.
"""
# Tests if NanGuardMode is working by feeding in numpy.inf and numpy.nans
# intentionally. A working implementation should be able to capture all
# the abnormalties.
x = T.matrix()
w = theano.shared(numpy.random.randn(5, 7).astype(theano.config.floatX))
y = T.dot(x, w)
......
......@@ -10,7 +10,7 @@ import sys
import warnings
from functools import wraps
from six import StringIO, PY3
from six import StringIO, PY3, iteritems
import theano
from theano.compat import configparser as ConfigParser
......@@ -91,37 +91,44 @@ theano_raw_cfg = ConfigParser.RawConfigParser()
theano_raw_cfg.read(config_files)
class change_flags(object):
    """
    Use this as a decorator or context manager to change the value of
    Theano config variables.

    Useful during tests.
    """

    def __init__(self, **kwargs):
        # Resolve each flag name to its config descriptor up front so a
        # typo in a flag name fails at construction time, not later.
        confs = dict()
        for k in kwargs:
            l = [v for v in theano.configparser._config_var_list
                 if v.fullname == k]
            assert len(l) == 1
            confs[k] = l[0]
        self.confs = confs
        self.new_vals = kwargs

    def __call__(self, f):
        @wraps(f)
        def res(*args, **kwargs):
            with self:
                return f(*args, **kwargs)
        return res

    def __enter__(self):
        self.old_vals = {}
        for k, v in iteritems(self.confs):
            self.old_vals[k] = v.__get__(True, None)
        try:
            for k, v in iteritems(self.confs):
                v.__set__(None, self.new_vals[k])
        except:
            # Roll back the flags already changed, then re-raise whatever
            # happened (bare except on purpose: restore on any exception).
            self.__exit__()
            raise

    def __exit__(self, *args):
        for k, v in iteritems(self.confs):
            v.__set__(None, self.old_vals[k])
def fetch_val_for_key(key, delete_key=False):
......
......@@ -696,6 +696,9 @@ class PureOp(object):
# Python implementation #
#########################
def L_op(self, inputs, outputs, output_grads):
    # Default L_op: delegate to self.grad(), ignoring the `outputs`
    # argument (which lets overriders reuse forward results).
    return self.grad(inputs, output_grads)
def R_op(self, inputs, eval_points):
"""
This method is primarily used by tensor.Rop
......
......@@ -14,6 +14,7 @@ import theano
from theano.gof import utils
from theano.gof.utils import MethodNotDefined, object2
from theano.gof import graph
from theano.configparser import change_flags
########
# Type #
......@@ -638,6 +639,8 @@ class CDataType(Type):
have a `void` return and take a single pointer argument.
"""
__props__ = ('ctype', 'freefunc', 'headers', 'header_dirs',
'libraries', 'lib_dirs', 'extra_support_code')
def __init__(self, ctype, freefunc=None, headers=None, header_dirs=None,
libraries=None, lib_dirs=None, extra_support_code=""):
......@@ -647,42 +650,51 @@ class CDataType(Type):
assert isinstance(freefunc, string_types)
self.freefunc = freefunc
if headers is None:
headers = []
self.headers = headers
headers = ()
self.headers = tuple(headers)
if header_dirs is None:
header_dirs = []
self.header_dirs = header_dirs
header_dirs = ()
self.header_dirs = tuple(header_dirs)
if libraries is None:
libraries = []
self.libraries = libraries
libraries = ()
self.libraries = tuple(libraries)
if lib_dirs is None:
lib_dirs = []
self.lib_dirs = lib_dirs
lib_dirs = ()
self.lib_dirs = tuple(lib_dirs)
self.extra_support_code = extra_support_code
self._fn = None
def __eq__(self, other):
return (type(self) == type(other) and
self.ctype == other.ctype and
self.freefunc == other.freefunc)
def __hash__(self):
return hash((type(self), self.ctype, self.freefunc))
def filter(self, data, strict=False, allow_downcast=None):
if data is not None and not isinstance(data, _cdata_type):
raise TypeError("expected None or a PyCapsule")
return data
def _get_func(self):
"""
Return a function that makes a value from an integer.
The integer value is assumed to be a valid pointer for the
type and no check is done to ensure that.
"""
from theano.scalar import get_scalar_type
if self._fn is None:
v = get_scalar_type('int64')()
self._fn = theano.function([v], _make_cdata(self)(v), profile=False)
with change_flags(compute_test_value='off'):
v = get_scalar_type('int64')()
self._fn = theano.function([v], _make_cdata(self)(v),
profile=False)
return self._fn
def make_value(self, ptr):
"""
Make a value of this type.
Parameters
----------
ptr : int
Integer representation of a valid pointer value
"""
return self._get_func()(ptr)
def c_declare(self, name, sub, check_input=True):
......
......@@ -26,7 +26,7 @@ except ImportError:
# This is for documentation not to depend on the availability of pygpu
from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
GpuArraySharedVariable, gpuarray_shared_constructor,
reg_context, get_context, ContextNotDefined)
reg_context, get_context, ContextNotDefined, _get_props)
from .basic_ops import as_gpuarray_variable
from . import fft, dnn, opt, nerv, extra_ops, multinomial
......@@ -89,19 +89,24 @@ def init_dev(dev, name=None):
(name, dev, context.devname),
file=sys.stderr)
pygpu_activated = True
ctx_props = _get_props(name)
ctx_props['dev'] = dev
if dev.startswith('cuda'):
try:
cudnn_version = dnn.version()
# 5200 should not print warning with cudnn 5.1 final.
if cudnn_version >= 5200:
warnings.warn("Your cuDNN version is more recent than Theano."
" If you see problems, try updating Theano or"
" downgrading cuDNN to version 5.1.")
if config.print_active_device:
print("Using cuDNN version %d on context %s" %
(cudnn_version, name), file=sys.stderr)
except Exception:
pass
if 'cudnn_version' not in ctx_props:
try:
ctx_props['cudnn_version'] = dnn.version()
# 5200 should not print warning with cudnn 5.1 final.
if ctx_props['cudnn_version'] >= 5200:
warnings.warn("Your cuDNN version is more recent than "
"Theano. If you encounter problems, try "
"updating Theano or downgrading cuDNN to "
"version 5.1.")
if config.print_active_device:
print("Using cuDNN version %d on context %s" %
(ctx_props['cudnn_version'], name), file=sys.stderr)
ctx_props['cudnn_handle'] = dnn._make_handle(context)
except Exception:
pass
# This maps things like 'cuda0' to the context object on that device.
init_dev.devmap = {}
......
from __future__ import absolute_import, print_function, division
import ctypes
import os
import sys
import warnings
import numpy
......@@ -7,7 +9,7 @@ from six import integer_types
import theano
from theano import Op, Apply, tensor, config, Variable
from theano.scalar import as_scalar, constant, Log
from theano.scalar import as_scalar, constant, Log, get_scalar_type
from theano.tensor import as_tensor_variable
from theano.gradient import DisconnectedType, grad_not_implemented
from theano.gof import Optimizer, local_optimizer, COp
......@@ -26,7 +28,8 @@ from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
from theano.tensor.signal.pool import (
Pool, MaxPoolGrad, AveragePoolGrad)
from . import pygpu
from .type import get_context, gpu_context_type, list_contexts
from .type import (get_context, gpu_context_type, list_contexts,
get_prop, set_prop, GpuArraySharedVariable)
from .basic_ops import (as_gpuarray_variable, infer_context_name,
gpu_contiguous, gpu_alloc_empty,
empty_like, GpuArrayType)
......@@ -42,6 +45,45 @@ from .opt_util import alpha_merge, output_merge, inplace_allocempty, pad_dims, u
from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_BWD_FILTER
try:
from pygpu import gpuarray
except ImportError:
pass
def _dnn_lib():
    """Load the cuDNN shared library through ctypes and cache the handle.

    Raises RuntimeError if no cudnn library can be found. The ctypes
    signatures for cudnnCreate/cudnnDestroy are declared on first load.
    """
    if _dnn_lib.handle is None:
        import ctypes.util

        lib_name = ctypes.util.find_library('cudnn')
        if lib_name is None and sys.platform == 'win32':
            # Update these names when new versions of cudnn are supported.
            for name in ['cudnn64_5.dll', 'cudnn64_4.dll']:
                lib_name = ctypes.util.find_library(name)
                if lib_name:
                    break
        if lib_name is None:
            raise RuntimeError('Could not find cudnn library (looked for v4 and v5[.1])')
        _dnn_lib.handle = ctypes.cdll.LoadLibrary(lib_name)
        cudnn = _dnn_lib.handle
        cudnn.cudnnCreate.argtypes = [ctypes.POINTER(ctypes.c_void_p)]
        cudnn.cudnnCreate.restype = ctypes.c_int
        cudnn.cudnnDestroy.argtypes = [ctypes.c_void_p]
        cudnn.cudnnDestroy.restype = ctypes.c_int
    return _dnn_lib.handle

# Cache slot for the loaded library (function attribute, set on first call).
_dnn_lib.handle = None
def _make_handle(ctx):
    """Create a cuDNN handle bound to gpu context `ctx` via ctypes.

    Returns the raw handle as a ctypes.c_void_p; raises RuntimeError if
    cudnnCreate reports an error.
    """
    cudnn = _dnn_lib()
    handle = ctypes.c_void_p()
    with ctx:
        err = cudnn.cudnnCreate(ctypes.byref(handle))
        if err != 0:
            raise RuntimeError("error creating cudnn handle")
    return handle
def raise_no_cudnn(msg="cuDNN is required for convolution and pooling"):
    """Signal that a cuDNN-only feature was requested without cuDNN.

    Always raises RuntimeError carrying `msg`; never returns.
    """
    raise RuntimeError(msg)
......@@ -144,6 +186,12 @@ def dnn_available(context_name):
dnn_available.msg = None
# Params type shared by the cuDNN ops: an opaque cudnnHandle_t that is
# released with cudnnDestroy, compiled against the configured cuDNN paths.
handle_type = CDataType('cudnnHandle_t', 'cudnnDestroy',
                        headers=['cudnn.h'],
                        header_dirs=[config.dnn.include_path],
                        libraries=['cudnn'],
                        lib_dirs=[config.dnn.library_path])
class DnnBase(COp):
......@@ -154,10 +202,20 @@ class DnnBase(COp):
# dnn does not know about broadcasting, so we do not need to assert
# the input broadcasting pattern.
check_broadcast = False
params_type = gpu_context_type
params_type = handle_type
def dnn_context(self, node):
return node.outputs[0].type.context_name
def get_params(self, node):
return node.outputs[0].type.context
try:
return get_prop(self.dnn_context(node), 'cudnn_handle_param')
except KeyError:
pass
ptr = get_prop(self.dnn_context(node), 'cudnn_handle').value
res = handle_type.make_value(ptr)
set_prop(self.dnn_context(node), 'cudnn_handle_param', res)
return res
def __init__(self, files=None, c_func=None):
if files is None:
......@@ -165,9 +223,10 @@ class DnnBase(COp):
COp.__init__(self, ["dnn_base.c"] + files, c_func)
def c_headers(self):
return ['cudnn.h', 'cudnn_helper.h', 'gpuarray_helper.h',
'gpuarray/types.h', 'gpuarray/array.h', 'gpuarray/util.h',
'gpuarray/ext_cuda.h', 'gpuarray_api.h', 'numpy_compat.h']
return ['gpuarray/types.h', 'gpuarray/array.h', 'gpuarray/kernel.h',
'gpuarray/util.h', 'gpuarray/ext_cuda.h', 'gpuarray_api.h',
'numpy_compat.h', 'cudnn.h', 'cudnn_helper.h',
'gpuarray_helper.h']
def c_header_dirs(self):
return [os.path.dirname(__file__), pygpu.get_include(),
......@@ -183,7 +242,7 @@ class DnnBase(COp):
return ['-Wl,-rpath,' + config.dnn.library_path]
def c_code_cache_version(self):
return (super(DnnBase, self).c_code_cache_version(), version())
return (super(DnnBase, self).c_code_cache_version(), version(), 1)
class DnnVersion(Op):
......@@ -1734,6 +1793,599 @@ class GpuDnnBatchNormGrad(DnnBase):
def infer_shape(self, node, shape):
return [shape[0], shape[2], shape[2]]
# Opaque C types used by the dropout/RNN ops below: a raw gpudata buffer
# (cuDNN "reserve" space) and a cuDNN dropout descriptor.
gpudata_type = CDataType('gpudata *', 'gpudata_release')
dropoutdesc_type = CDataType('cudnnDropoutDescriptor_t',
                             'cudnnDestroyDropoutDescriptor')
class GpuDnnDropoutOp(DnnBase):
    """Apply cuDNN dropout to its input.

    Inputs are (input, dropout descriptor, RNG states); outputs are
    (dropped-out output, states, reserve space).
    """

    __props__ = ('inplace',)

    def __init__(self, inplace=False):
        DnnBase.__init__(self, ["dnn_dropout_fwd.c"], "dnn_dropout_fwd")
        self.inplace = inplace
        if self.inplace:
            # Output 1 (states) is computed in-place on input 2 (states).
            self.destroy_map = {1: [2]}

    def make_node(self, inp, descriptor, state):
        ctx_name = infer_context_name(inp)
        inp = as_gpuarray_variable(inp, ctx_name)
        return Apply(self, [inp, descriptor, state],
                     [inp.type(), state.type(), gpudata_type()])

    def prepare_node(self, node, storage_map, compute_map):
        # NOTE(review): this requires the op to be inplace before it can
        # be compiled; non-inplace use appears unsupported — confirm.
        assert self.inplace, "GpuDnnDropoutOp not inplace"
class _DropoutDescriptor(DnnBase):
    """Internal op that builds a cuDNN dropout descriptor and its
    device-side RNG states buffer."""

    __props__ = ('context_name',)

    def __init__(self, context_name):
        DnnBase.__init__(self, ["dnn_dropout_desc.c"], "dnn_dropout_desc")
        self.context_name = context_name

    def dnn_context(self, node):
        return self.context_name

    def do_constant_folding(self, node):
        # The descriptor must be created at runtime on the device.
        return False

    def make_node(self, dropout, seed, context_name):
        dropout = as_scalar(dropout).astype('float32')
        seed = as_scalar(seed).astype('uint64')
        assert context_name == self.context_name
        # This is a dirty hack to pass the context because params is
        # occupied by the cudnn handle
        context = gpu_context_type.make_constant(get_context(context_name))
        return Apply(self, [dropout, seed, context],
                     [dropoutdesc_type(),
                      GpuArrayType('uint8', (False,),
                                   context_name=context_name)()])

    def c_code_cache_version_apply(self, node):
        # disable the cache since we can't pickle contexts
        return None
def _make_dropout_desc(dropout, seed, context_name):
    """Build and return (dropout descriptor, RNG states) by compiling and
    running a tiny unoptimized graph around _DropoutDescriptor."""
    op = _DropoutDescriptor(context_name)
    graph_outputs = op(dropout, seed, context_name)
    fn = theano.function([], graph_outputs,
                         theano.Mode(optimizer=None),
                         profile=False)
    desc, states = fn()
    return desc, states
def dropout(x, dropout=0.0, seed=4242):
    """Apply cuDNN dropout to `x`.

    Returns (y, desc, reserve, states): the dropped-out output, the
    dropout descriptor, the cuDNN reserve space and the RNG states.
    """
    desc, states = _make_dropout_desc(dropout, seed, x.type.context_name)
    # GpuDnnDropoutOp.make_node takes (input, descriptor, states) and
    # returns (output, states, reserve); the previous call passed only two
    # inputs and unpacked two outputs, which cannot match make_node.
    y, ostates, reserve = GpuDnnDropoutOp()(x, desc, states)
    # NOTE(review): GpuDnnDropoutOp.prepare_node asserts inplace=True,
    # while the default constructed op is inplace=False — confirm how the
    # inplace variant is meant to be introduced.
    return y, desc, reserve, states
# Opaque cuDNN RNN descriptor, destroyed with cudnnDestroyRNNDescriptor.
rnndesc_type = CDataType('cudnnRNNDescriptor_t',
                         'cudnnDestroyRNNDescriptor')
def as_i32(v):
    # Wrap `v` as a scalar variable cast to int32 (the type used for the
    # cuDNN enum/count arguments below).
    return as_scalar(v).astype('int32')
class _RNNDescriptor(DnnBase):
    """Internal op that builds a cuDNN RNN descriptor."""

    __props__ = ('context_name',)

    def __init__(self, context_name):
        if version() < 5005:
            raise RuntimeError("cudnn RNN require cudnn v5 final or higher.")
        DnnBase.__init__(self, ["dnn_rnn_desc.c"], "dnn_rnn_desc")
        self.context_name = context_name

    def dnn_context(self, node):
        return self.context_name

    def do_constant_folding(self, node):
        # The descriptor must be created at runtime on the device.
        return False

    def make_node(self, hidden_size, num_layers, ddesc, input_mode,
                  direction_mode, rnn_mode, dtype):
        hidden_size = as_i32(hidden_size)
        num_layers = as_i32(num_layers)
        # The integer constants below hard-code the cudnn 5/5.1 values of
        # cudnnRNNInputMode_t, cudnnDirectionMode_t and cudnnRNNMode_t.
        assert 5000 < version() < 5200, "Constants only work for cudnn 5, 5.1"
        if input_mode == 'linear':
            input_mode = as_i32(0)
        elif input_mode == 'skip':
            input_mode = as_i32(1)
        else:
            raise ValueError("input_mode")
        if direction_mode == 'unidirectional':
            direction_mode = as_i32(0)
        elif direction_mode == 'bidirectional':
            direction_mode = as_i32(1)
        else:
            raise ValueError("direction_mode")
        if rnn_mode == 'rnn_relu':
            rnn_mode = as_i32(0)
        elif rnn_mode == 'rnn_tanh':
            rnn_mode = as_i32(1)
        elif rnn_mode == 'lstm':
            rnn_mode = as_i32(2)
        elif rnn_mode == 'gru':
            rnn_mode = as_i32(3)
        else:
            raise ValueError("rnn_mode")
        dtype = as_i32(gpuarray.dtype_to_typecode(dtype))
        return Apply(self, [hidden_size, num_layers,
                            dropoutdesc_type.make_constant(ddesc),
                            input_mode, direction_mode, rnn_mode, dtype],
                     [rnndesc_type()])
def _make_rnn_desc(hidden_size, num_layers, ddesc, rnn_mode,
                   input_mode, direction_mode, dtype, context_name):
    """Build and return a cuDNN RNN descriptor by compiling and running a
    tiny unoptimized graph around _RNNDescriptor."""
    graph_output = _RNNDescriptor(context_name)(hidden_size, num_layers,
                                                ddesc, input_mode,
                                                direction_mode, rnn_mode,
                                                dtype)
    fn = theano.function([], graph_output,
                         theano.Mode(optimizer=None),
                         profile=False)
    return fn()
class _RNNParamSize(DnnBase):
    """Internal op that queries the byte size of the parameter buffer of
    a cuDNN RNN described by `desc`."""

    __props__ = ('context_name',)

    def __init__(self, context_name):
        DnnBase.__init__(self, ["dnn_rnn_paramsize.c"],
                         "dnn_rnn_paramsize")
        self.context_name = context_name

    def dnn_context(self, node):
        return self.context_name

    def do_constant_folding(self, node):
        # The size depends on the runtime descriptor; never fold.
        return False

    def make_node(self, desc, input_size, typecode):
        input_size = as_tensor_variable(input_size).astype('uint64')
        typecode = as_i32(typecode)
        return Apply(self, [rnndesc_type.make_constant(desc), input_size,
                            typecode],
                     [get_scalar_type('uint64')()])
def _get_param_size(desc, input_size, dtype, context_name):
    """Return the size in bytes of the cuDNN RNN parameter buffer for the
    given descriptor, input size and dtype."""
    tc = gpuarray.dtype_to_typecode(dtype)
    graph_output = _RNNParamSize(context_name)(desc, input_size, tc)
    fn = theano.function([], graph_output,
                         theano.Mode(optimizer=None),
                         profile=False)
    return fn()
class _RNNSplitParams(DnnBase):
    """Internal op that splits the flat cuDNN RNN weight vector into
    per-gate (matrix, bias) GPU views.

    For one layer it returns alternating 2d-matrix / 1d-bias views into
    `w`, in cuDNN's storage order: 4 pairs for plain RNN (input +
    recurrent), 16 for LSTM, 12 for GRU (see make_node).
    """

    __props__ = ('rnn_mode',)

    def __init__(self, rnn_mode):
        DnnBase.__init__(self)
        self.rnn_mode = rnn_mode

    def make_node(self, w, desc, layer, isize, typecode):
        w = as_gpuarray_variable(w, infer_context_name(w))
        assert w.ndim == 1
        layer = as_scalar(layer).astype('int32')
        isize = as_tensor_variable(isize).astype('uint64')
        assert isize.ndim == 1
        typecode = as_scalar(typecode).astype('int32')
        _1d = GpuArrayType(w.type.dtype, [False],
                           context_name=w.type.context_name)
        _2d = GpuArrayType(w.type.dtype, [False, False],
                           context_name=w.type.context_name)
        outputs = []
        if self.rnn_mode == 'rnn_relu' or self.rnn_mode == 'rnn_tanh':
            outputs.extend([_2d(), _1d()])  # input
            outputs.extend([_2d(), _1d()])  # recurrent
        elif self.rnn_mode == 'lstm':
            outputs.extend([_2d(), _1d()])  # input input
            outputs.extend([_2d(), _1d()])  # input forget
            outputs.extend([_2d(), _1d()])  # input newmem
            outputs.extend([_2d(), _1d()])  # input output
            outputs.extend([_2d(), _1d()])  # recur input
            outputs.extend([_2d(), _1d()])  # recur forget
            outputs.extend([_2d(), _1d()])  # recur newmem
            outputs.extend([_2d(), _1d()])  # recur output
        elif self.rnn_mode == 'gru':
            outputs.extend([_2d(), _1d()])  # input reset
            outputs.extend([_2d(), _1d()])  # input update
            outputs.extend([_2d(), _1d()])  # input newmem
            outputs.extend([_2d(), _1d()])  # recur reset
            outputs.extend([_2d(), _1d()])  # recur update
            outputs.extend([_2d(), _1d()])  # recur newmem

        return Apply(self, [w, layer, rnndesc_type.make_constant(desc),
                            isize, typecode], outputs)

    def c_code(self, node, name, inputs, outputs, sub):
        # Common substitutions for the C templates; `handle` is the cuDNN
        # handle provided through the op params.
        kw = dict(fail=sub['fail'], w=inputs[0], layer=inputs[1],
                  desc=inputs[2], isize=inputs[3], typecode=inputs[4],
                  handle=sub['params'])
        code = """
  cudnnTensorDescriptor_t xdesc;
  cudnnFilterDescriptor_t wdesc;
  cudnnFilterDescriptor_t odesc;
  size_t nshp[2];
  void *w;
  void *o;
  ptrdiff_t off;
  size_t bshp;
  cudnnStatus_t err;
  cudnnDataType_t dt;
  cudnnTensorFormat_t tf;
  int nd;
  int dims[3];
  int strs[3];

  if (PyArray_DIM(%(isize)s, 0) != 2) {
    PyErr_SetString(PyExc_ValueError, "input_size should be of length two");
    %(fail)s;
  }

  switch (%(typecode)s) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_ValueError, "Unsupported data type");
    %(fail)s;
  }

  err = cudnnCreateTensorDescriptor(&xdesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_SetString(PyExc_RuntimeError, "Could not create xdesc");
    %(fail)s;
  }

  dims[0] = *(npy_uint64 *)PyArray_GETPTR1(%(isize)s, 0);
  dims[1] = *(npy_uint64 *)PyArray_GETPTR1(%(isize)s, 1);
  dims[2] = 1;

  strs[0] = dims[2] * dims[1];
  strs[1] = dims[2];
  strs[2] = 1;

  err = cudnnSetTensorNdDescriptor(xdesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    cudnnDestroyTensorDescriptor(xdesc);
    PyErr_Format(PyExc_RuntimeError, "Could not set xdesc: %%s",
                 cudnnGetErrorString(err));
    %(fail)s;
  }

  if (c_make_filter(%(w)s, &wdesc)) {
    cudnnDestroyTensorDescriptor(xdesc);
    %(fail)s
  }

  err = cudnnCreateFilterDescriptor(&odesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_SetString(PyExc_RuntimeError, "could not create odesc");
    cudnnDestroyTensorDescriptor(xdesc);
    cudnnDestroyFilterDescriptor(wdesc);
    %(fail)s
  }

  w = PyGpuArray_DEV_DATA(%(w)s);
  nshp[0] = PyGpuArray_DIM(%(w)s, 0);
  nshp[1] = 1;
        """ % kw

        def get_params(id, m, b):
            # Emit the C snippet extracting the bias view `b` and matrix
            # view `m` for cuDNN linear-layer index `id`.
            kw2 = kw.copy()
            kw2['id'] = id
            kw2['m'] = m
            kw2['b'] = b
            return """
  err = cudnnGetRNNLinLayerBiasParams(%(handle)s, %(desc)s, %(layer)s, xdesc, wdesc, w, %(id)s, odesc, &o);
  if (err != CUDNN_STATUS_SUCCESS) {
    cudnnDestroyTensorDescriptor(xdesc);
    cudnnDestroyFilterDescriptor(wdesc);
    cudnnDestroyFilterDescriptor(odesc);
    PyErr_SetString(PyExc_RuntimeError, "can't fetch bias for id %(id)s");
    %(fail)s
  }
  off = (intptr_t)o - (intptr_t)w;
  assert(off >= 0 && "bias");

  err = cudnnGetFilterNdDescriptor(odesc, 3, &dt, &tf, &nd, dims);
  if (err != CUDNN_STATUS_SUCCESS) {
    cudnnDestroyTensorDescriptor(xdesc);
    cudnnDestroyFilterDescriptor(wdesc);
    cudnnDestroyFilterDescriptor(odesc);
    PyErr_SetString(PyExc_RuntimeError, "could not get bias shape for id %(id)s");
    %(fail)s;
  }
  // We assume that the typecode matches
  assert(dims[2] == 1);
  assert(dims[1] == 1);
  %(b)s = pygpu_view(%(w)s, Py_None);
  %(b)s->ga.offset = off;
  %(b)s->ga.dimensions[0] = dims[0];
  bshp = dims[0];

  err = cudnnGetRNNLinLayerMatrixParams(%(handle)s, %(desc)s, %(layer)s, xdesc, wdesc, w, %(id)s, odesc, &o);
  if (err != CUDNN_STATUS_SUCCESS) {
    cudnnDestroyTensorDescriptor(xdesc);
    cudnnDestroyFilterDescriptor(wdesc);
    cudnnDestroyFilterDescriptor(odesc);
    PyErr_SetString(PyExc_RuntimeError, "can't fetch matrix for id %(id)s");
    %(fail)s
  }
  off = (intptr_t)o - (intptr_t)w;
  assert(off >= 0 && "matrix");

  // This is 3d because of cudnn limitations.
  err = cudnnGetFilterNdDescriptor(odesc, 3, &dt, &tf, &nd, dims);
  if (err != CUDNN_STATUS_SUCCESS) {
    cudnnDestroyTensorDescriptor(xdesc);
    cudnnDestroyFilterDescriptor(wdesc);
    cudnnDestroyFilterDescriptor(odesc);
    PyErr_SetString(PyExc_RuntimeError, "could not get matrix shape for id %(id)s");
    %(fail)s;
  }
  assert(dims[1] == 1);
  assert(dims[2] == 1);
  // We assume that the typecode matches
  %(m)s = pygpu_reshape(%(w)s, 2, nshp, GA_F_ORDER, 1, -1);
  %(m)s->ga.offset = off;
  assert(dims[0] %% bshp == 0);
  %(m)s->ga.dimensions[0] = dims[0] / bshp;
  %(m)s->ga.dimensions[1] = bshp;
  %(m)s->ga.strides[1] = %(m)s->ga.dimensions[0] * gpuarray_get_elsize(%(m)s->ga.typecode);
            """ % kw2

        # One (matrix, bias) extraction snippet per output pair.
        for i in range(len(outputs) // 2):
            code += get_params(i, outputs[2 * i], outputs[(2 * i) + 1])

        code += """
  cudnnDestroyTensorDescriptor(xdesc);
  cudnnDestroyFilterDescriptor(wdesc);
  cudnnDestroyFilterDescriptor(odesc);
        """
        return code

    def c_code_cache_version(self):
        return (2,)
def _split_rnn_params(w, desc, layer, input_size, dtype, rnn_mode):
    """Return GPU views of `w` split into the per-gate matrices and
    biases of one RNN layer, as computed by _RNNSplitParams."""
    tc = gpuarray.dtype_to_typecode(dtype)
    split = _RNNSplitParams(rnn_mode)(w, desc, layer, input_size, tc)
    wrapped = [theano.Out(o, borrow=True) for o in split]
    fn = theano.function([], wrapped,
                         theano.Mode(optimizer=None),
                         profile=False)
    return fn()
class GpuDnnRNNOp(DnnBase):
    """Forward pass of a cuDNN RNN.

    Inputs: (rnn descriptor, flat weights w, input x, initial hidden hx
    [, initial cell cx for LSTM]).  Outputs: (reserve space, y, hy
    [, cy for LSTM]).
    """

    __props__ = ()
    _cop_num_inputs = 5
    _cop_num_outputs = 4

    def __init__(self, rnn_mode, direction_mode):
        DnnBase.__init__(self, ["dnn_rnn_fwd.c"], 'dnn_rnn_fwd')
        self.rnn_mode = rnn_mode
        if direction_mode == 'bidirectional':
            self.num_dirs = 2
        elif direction_mode == 'unidirectional':
            self.num_dirs = 1
        else:
            raise ValueError('direction_mode is invalid (got %s)' % (direction_mode,))

    def dnn_context(self, node):
        # Output 0 is the reserve (a CData), so take the context from y.
        return node.outputs[1].type.context_name

    def make_node(self, desc, w, x, hx, cx=None):
        if cx is None:
            context_name = infer_context_name(w, x, hx)
        else:
            context_name = infer_context_name(w, x, hx, cx)

        w = as_gpuarray_variable(w, context_name)
        x = as_gpuarray_variable(x, context_name)
        hx = as_gpuarray_variable(hx, context_name)
        inputs = [desc, w, x, hx]
        assert w.ndim == 1
        assert x.ndim == 3  # seqLength, minibatch, inputSize
        assert hx.ndim == 3  # numLayers, minibatch, hiddenSize * bidi

        if self.rnn_mode == 'lstm':
            cx = as_gpuarray_variable(cx, context_name)
            assert cx.ndim == 3  # numLayers, minibatch, hiddenSize * bidi
            inputs.append(cx)

        _3d = GpuArrayType(dtype=x.dtype, broadcastable=(False, False, False),
                           context_name=context_name)
        reserve = gpudata_type()
        y = _3d()  # seqLength, minibatch, hiddenSize * bidi
        hy = _3d()  # numLayers, miniBatch, hiddenSize * bidi
        outputs = [reserve, y, hy]

        if self.rnn_mode == 'lstm':
            cy = _3d()  # numLayers, miniBatch, hiddenSize * bidi
            outputs.append(cy)

        return Apply(self, inputs, outputs)

    def L_op(self, inputs, outputs, output_grads):
        # Gradient wrt (w, x, hx[, cx]); the descriptor gets a
        # DisconnectedType gradient.
        desc, w, x, hx = inputs[:4]
        cx = inputs[4] if len(inputs) == 5 else None
        reserve, y, hy = outputs[:3]
        _, dy, dhy = output_grads[:3]
        dcy = output_grads[3] if len(output_grads) == 4 else None
        # Since the op return two outputs which contain essentially
        # the same information, the user will most likely only use one
        # of them. This leads to the situation that the other is
        # considered "disconnected" by theano in the gradient.
        # However we know that this isn't really the case so we fix it
        # here.

        # If all the ys are disconnected, then you get a boring
        # gradient instead of an error.  But in that case you
        # shouldn't call this method anyway.
        if isinstance(dy.type, DisconnectedType):
            dy = as_gpuarray_variable(y.zeros_like(),
                                      context_name=y.type.context_name)
        if isinstance(dhy.type, DisconnectedType):
            dhy = None
        if dcy and isinstance(dcy.type, DisconnectedType):
            dcy = None
        dinputs = GpuDnnRNNGradInputs(rnn_mode=self.rnn_mode,
                                      grad_h=(dhy is not None),
                                      grad_c=(dcy is not None))(
            desc, x, y, dy, dhy, dcy, w, hx, cx, reserve, return_list=True)
        reserve2, dx, dhx = dinputs[:3]
        dw = GpuDnnRNNGradWeights()(
            desc, x, hx, y, reserve2, w)
        res = [DisconnectedType()(), dw, dx, dhx]
        if cx is not None:
            res.append(dinputs[3])  # dcx
        return res

    def connection_pattern(self, node):
        # The descriptor (input 0) is disconnected from every output;
        # all other inputs are connected to all outputs.
        deconn = [[False] * len(node.outputs)]
        conn = [[True] * len(node.outputs)] * (len(node.inputs) - 1)
        return deconn + conn
class GpuDnnRNNGradInputs(DnnBase):
    """Backward pass of a cuDNN RNN with respect to its inputs
    (x, hx and, for LSTM, cx)."""

    __props__ = ('rnn_mode', 'grad_c', 'grad_h')
    _cop_num_inputs = 10
    _cop_num_outputs = 4

    def __init__(self, rnn_mode, grad_h, grad_c):
        DnnBase.__init__(self, ['dnn_rnn_gi.c'], 'dnn_rnn_gi')
        self.rnn_mode = rnn_mode
        self.grad_h = grad_h
        self.grad_c = grad_c
        if self.grad_c:
            # A cell-state gradient only exists for LSTM.
            assert self.rnn_mode == 'lstm'

    def dnn_context(self, node):
        # Output 0 is the reserve (a CData), so take the context from dx.
        return node.outputs[1].type.context_name

    def make_node(self, desc, x, y, dy, dhy, dcy, w, hx, cx, reserve):
        # We trust the callers here
        xshp = as_scalar(x.shape[2]).astype('uint64')
        inputs = [desc, xshp, y, dy, w, hx, reserve]
        outputs = [reserve.type(), x.type(), hx.type()]
        if self.rnn_mode == 'lstm':
            inputs.append(cx)
            outputs.append(cx.type())
        if self.grad_h:
            inputs.append(dhy)
        if self.grad_c:
            inputs.append(dcy)
        return Apply(self, inputs, outputs)

    # We have special requirements so this is hooking into COp
    def format_c_function_args(self, inp, out):
        # Pad the optional arguments (cx, dhy, dcy) with NULL so the C
        # function always receives the same argument list.
        rinp = inp[:7]
        others = inp[7:]
        if self.rnn_mode == 'lstm':
            rinp.append(others.pop(0))
        else:
            rinp.append('NULL')
        if self.grad_h:
            rinp.append(others.pop(0))
        else:
            rinp.append('NULL')
        if self.grad_c:
            rinp.append(others.pop(0))
        else:
            rinp.append('NULL')
        assert len(others) == 0
        return COp.format_c_function_args(self, rinp, out)
class GpuDnnRNNGradWeights(DnnBase):
    """Backward pass of a cuDNN RNN with respect to its flat weights."""

    __props__ = ()

    def __init__(self):
        DnnBase.__init__(self, ['dnn_rnn_gw.c'], 'dnn_rnn_gw')

    def make_node(self, desc, x, hx, y, reserve, w):
        # We trust the callers here
        # Only the flat size of w is needed on the C side, not w itself.
        wsize = as_scalar(w.shape[0]).astype('uint64')
        return Apply(self,
                     [desc, wsize, x, hx, y, reserve],
                     [w.type()])
class RNNBlock(object):
    """User-facing helper wrapping the cuDNN RNN descriptor and ops."""

    def __init__(self, dtype, hidden_size, num_layers, rnn_mode,
                 input_mode='linear', direction_mode='unidirectional',
                 context_name=None):
        """
        dtype: data type of computation
        hidden_size: int
        num_layers: int
        rnn_mode: {'rnn_relu', 'rnn_tanh', 'lstm', 'gru'}
            See cudnn documentation for cudnnRNNMode_t.
        input_mode: {'linear', 'skip'}
            linear: input will be multiplied by a biased matrix
            skip: No operation is performed on the input.  The size must match the hidden size.
        direction_mode: {'unidirectional', 'bidirectional'}
            unidirectional: The network operates recurrently from the
                first input to the last.
            bidirectional: The network operates from first to last then from last to first and concatenates the results at each layer.
        """
        # This is not supported for any value other than 0, so don't change it
        ddesc, states = _make_dropout_desc(0, 4242, context_name)
        self.ddesc = ddesc
        self.dstates = states
        self.desc = _make_rnn_desc(hidden_size, num_layers,
                                   ddesc, rnn_mode, input_mode,
                                   direction_mode, dtype, context_name)
        self.rnn_mode = rnn_mode
        self.direction_mode = direction_mode
        self.context_name = context_name
        self.dtype = dtype

    def get_param_size(self, input_size):
        # Number of dtype elements (not bytes) needed for the weights.
        bytesize = _get_param_size(self.desc, input_size, self.dtype,
                                   self.context_name)
        bytesize = int(bytesize)
        # The buffer must hold a whole number of dtype elements.
        assert bytesize % numpy.dtype(self.dtype).itemsize == 0
        return bytesize // numpy.dtype(self.dtype).itemsize

    def split_params(self, w, layer, input_size):
        # Per-gate views into the shared flat weight vector `w`.
        if not isinstance(w, GpuArraySharedVariable):
            raise TypeError("split_params only works on gpuarray shared variables")
        return _split_rnn_params(w, self.desc, layer, input_size, self.dtype, self.rnn_mode)

    def apply(self, w, x, hx, cx=None):
        # Build the forward graph and return (y, hy[, cy]).
        # Don't return the reserve as an output
        return GpuDnnRNNOp(self.rnn_mode, self.direction_mode)(
            rnndesc_type.make_constant(self.desc),
            w, x, hx, cx, return_list=True)[1:]
def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
epsilon=1e-4):
......
......@@ -149,41 +149,3 @@ static int c_make_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t *desc) {
#section init_code
setup_ext_cuda();
#section support_code_struct
PyGpuContextObject *ctx;
cudnnHandle_t APPLY_SPECIFIC(_handle);
#section init_code_struct
{
// We need to keep a reference here to have it available in the destructor.
ctx = PARAMS;
Py_INCREF(ctx);
cuda_enter(PARAMS->ctx);
cudnnStatus_t err;
APPLY_SPECIFIC(_handle) = NULL;
if ((err = cudnnCreate(&APPLY_SPECIFIC(_handle))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not create cuDNN handle: %s",
cudnnGetErrorString(err));
cuda_exit(PARAMS->ctx);
FAIL;
}
if ((err = cudnnSetStream(APPLY_SPECIFIC(_handle),
cuda_get_stream(PARAMS->ctx))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "Could not set cudnn stream: %s",
cudnnGetErrorString(err));
cuda_exit(PARAMS->ctx);
FAIL;
}
cuda_exit(PARAMS->ctx);
}
#section cleanup_code_struct
cuda_enter(ctx->ctx);
cudnnDestroy(APPLY_SPECIFIC(_handle));
cuda_exit(ctx->ctx);
Py_DECREF((PyObject *)ctx);
......@@ -3,7 +3,9 @@
int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
PyGpuArrayObject *bias, npy_float64 epsilon,
PyGpuArrayObject **outp, PyGpuArrayObject **x_mean,
PyGpuArrayObject **x_invstd, PyGpuContextObject *c) {
PyGpuArrayObject **x_invstd, cudnnHandle_t _handle) {
PyGpuContextObject *c = inp->context;
if (c_set_tensorNd(inp, bn_input) != 0)
return 1;
if (c_set_tensorNd(scale, bn_params) != 0)
......@@ -37,7 +39,7 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
beta = (void *)&fbeta;
}
cudnnStatus_t err = cudnnBatchNormalizationForwardTraining(
APPLY_SPECIFIC(_handle),
_handle,
MODE,
alpha,
beta,
......
......@@ -24,7 +24,9 @@ int dnn_batchnorm_grad(PyGpuArrayObject *inp, PyGpuArrayObject *doutp,
PyGpuArrayObject *scale, PyGpuArrayObject *x_mean,
PyGpuArrayObject *x_invstd, npy_float64 epsilon,
PyGpuArrayObject **dinp, PyGpuArrayObject **dscale,
PyGpuArrayObject **dbias, PyGpuContextObject *c) {
PyGpuArrayObject **dbias, cudnnHandle_t _handle) {
PyGpuContextObject *c = inp->context;
if (c_set_tensorNd(inp, bn_input) != 0)
return 1;
if (c_set_tensorNd(doutp, bn_doutput) != 0)
......@@ -66,7 +68,7 @@ int dnn_batchnorm_grad(PyGpuArrayObject *inp, PyGpuArrayObject *doutp,
betaParam = (void *)&fbeta;
}
cudnnStatus_t err = cudnnBatchNormalizationBackward(
APPLY_SPECIFIC(_handle),
_handle,
MODE,
alphaData,
betaData,
......
......@@ -3,7 +3,9 @@
int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
PyGpuArrayObject *bias, PyGpuArrayObject *est_mean,
PyGpuArrayObject *est_var, npy_float64 epsilon,
PyGpuArrayObject **outp, PyGpuContextObject *c) {
PyGpuArrayObject **outp, cudnnHandle_t _handle) {
PyGpuContextObject *c = inp->context;
if (c_set_tensorNd(inp, bn_input) != 0)
return 1;
if (c_set_tensorNd(scale, bn_params) != 0)
......@@ -33,7 +35,7 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
beta = (void *)&fbeta;
}
cudnnStatus_t err = cudnnBatchNormalizationForwardInference(
APPLY_SPECIFIC(_handle),
_handle,
MODE,
alpha,
beta,
......
#section support_code

/* Create a cuDNN dropout descriptor and its device-side RNG state buffer.
 * On success *odesc owns the new descriptor and *ostates owns a fresh
 * uint8 array of the size cuDNN requires.
 * Returns 0 on success, -1 with a Python error set on failure. */
int dnn_dropout_desc(float dropout, unsigned long long seed,
                     PyGpuContextObject *c,
                     cudnnDropoutDescriptor_t *odesc,
                     PyGpuArrayObject **ostates,
                     cudnnHandle_t _handle) {
  PyGpuArrayObject *states;
  cudnnDropoutDescriptor_t desc;
  size_t states_sz;
  cudnnStatus_t err;

  cuda_enter(c->ctx);
  err = cudnnCreateDropoutDescriptor(&desc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_SetString(PyExc_RuntimeError, "Can't create dropout descriptor");
    cuda_exit(c->ctx);
    return -1;
  }

  /* Can't fail according to docs */
  cudnnDropoutGetStatesSize(_handle, &states_sz);

  states = pygpu_empty(1, &states_sz, GA_UBYTE, GA_C_ORDER, c, Py_None);
  if (states == NULL) {
    cudnnDestroyDropoutDescriptor(desc);
    cuda_exit(c->ctx);
    return -1;
  }

  err = cudnnSetDropoutDescriptor(desc, _handle, dropout,
                                  PyGpuArray_DEV_DATA(states),
                                  states_sz, seed);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_SetString(PyExc_RuntimeError, "Can't set dropout descriptor");
    Py_DECREF((PyObject *)states);
    cudnnDestroyDropoutDescriptor(desc);
    cuda_exit(c->ctx);
    return -1;
  }
  cuda_exit(c->ctx);

  *odesc = desc;
  *ostates = states;
  return 0;
}
#section support_code
/* Apply cuDNN dropout (forward, training) to x.
 *
 * x       : input array.
 * desc    : pointer to a previously configured dropout descriptor.
 * state   : RNG state array created by dnn_dropout_desc.
 * y       : out; receives the dropped-out output (same shape as x).
 * ostate  : out; receives a new reference to *state*.
 * reserve : out; receives the reserve buffer needed by the backward pass.
 * _handle : cuDNN handle to run the operation on.
 *
 * Returns 0 on success, -1 on failure with a Python exception set.
 *
 * Fixes relative to the previous version:
 *  - `res_zs` was an undeclared identifier (typo for `res_sz`).
 *  - a failed reserve allocation fell through and used a NULL buffer
 *    (missing `return -1`).
 *  - the output descriptor was built from the output *pointer* instead
 *    of the prepared array (`*y`), and the same for DEV_DATA.
 *  - cudnnDropoutGetReserveSpaceSize takes the input tensor descriptor,
 *    not the dropout descriptor, and cudnnDropoutForward takes the
 *    descriptor value, not a pointer to it.
 *  - the context type is PyGpuContextObject, as everywhere else here.
 */
int dnn_dropout_fwd(PyGpuArrayObject *x,
                    cudnnDropoutDescriptor_t *desc,
                    PyGpuArrayObject *state,
                    PyGpuArrayObject **y,
                    PyGpuArrayObject **ostate,
                    gpudata **reserve,
                    cudnnHandle_t _handle) {
  PyGpuContextObject *c = x->context;
  cudnnTensorDescriptor_t xdesc;
  cudnnTensorDescriptor_t ydesc;
  gpudata *res;
  size_t res_sz;
  cudnnStatus_t err;

  if (c_make_tensorNd(x, &xdesc))
    return -1;

  if (theano_prep_output(y, x->ga.nd, x->ga.dimensions, x->ga.typecode,
                         GA_C_ORDER, c)) {
    cudnnDestroyTensorDescriptor(xdesc);
    return -1;
  }

  if (c_make_tensorNd(*y, &ydesc)) {
    cudnnDestroyTensorDescriptor(xdesc);
    return -1;
  }

  /* This can't fail according to the docs */
  err = cudnnDropoutGetReserveSpaceSize(xdesc, &res_sz);

  res = gpudata_alloc(c->ctx, res_sz, NULL, 0, NULL);
  if (res == NULL) {
    cudnnDestroyTensorDescriptor(xdesc);
    cudnnDestroyTensorDescriptor(ydesc);
    PyErr_SetString(PyExc_RuntimeError, "Could not allocate reserve for dropout");
    return -1;
  }
  /* Only publish the outputs once the allocation succeeded, so the
     caller never sees a half-initialized result. */
  *reserve = res;
  *ostate = state;
  Py_INCREF((PyObject *)state);

  cuda_enter(c->ctx);
  err = cudnnDropoutForward(_handle, *desc, xdesc, PyGpuArray_DEV_DATA(x),
                            ydesc, PyGpuArray_DEV_DATA(*y), *(void **)res,
                            res_sz);
  cudnnDestroyTensorDescriptor(xdesc);
  cudnnDestroyTensorDescriptor(ydesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not run dropout: %s",
                 cudnnGetErrorString(err));
    cuda_exit(c->ctx);
    return -1;
  }
  cuda_exit(c->ctx);
  return 0;
}
......@@ -26,11 +26,12 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cudnnConvolutionDescriptor_t desc,
double alpha, double beta,
PyGpuArrayObject **output,
PyGpuContextObject *c) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
float af = alpha, bf = beta;
cudnnHandle_t _handle) {
PyGpuContextObject *c = input->context;
void *alpha_p;
void *beta_p;
float af = alpha, bf = beta;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
PyErr_SetString(PyExc_ValueError,
......@@ -92,7 +93,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
int count;
cudnnConvolutionFwdAlgoPerf_t choice;
err = cudnnFindConvolutionForwardAlgorithm(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
desc, APPLY_SPECIFIC(output), 1, &count, &choice);
if (err != CUDNN_STATUS_SUCCESS) {
......@@ -115,7 +116,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
}
err = cudnnGetConvolutionForwardAlgorithm(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
desc, APPLY_SPECIFIC(output),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
......@@ -198,7 +199,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
{
size_t worksize;
gpudata *workspace;
err = cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle),
err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns),
desc,
......@@ -211,7 +212,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
// TODO: Print a warning
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
err = cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle),
err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns),
desc,
......@@ -248,7 +249,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnConvolutionForward(
APPLY_SPECIFIC(_handle),
_handle,
alpha_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
......
......@@ -25,11 +25,12 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
PyGpuArrayObject *im,
cudnnConvolutionDescriptor_t desc,
double alpha, double beta, PyGpuArrayObject **input,
PyGpuContextObject *c) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
float af = alpha, bf = beta;
cudnnHandle_t _handle) {
PyGpuContextObject *c = kerns->context;
void *alpha_p;
void *beta_p;
float af = alpha, bf = beta;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1]) {
PyErr_SetString(PyExc_ValueError, "images and kernel must have the same "
......@@ -93,7 +94,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cudnnConvolutionBwdDataAlgoPerf_t choice;
err = cudnnFindConvolutionBackwardDataAlgorithm(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
_handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(input), 1, &count, &choice);
if (err != CUDNN_STATUS_SUCCESS) {
......@@ -116,7 +117,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
}
err = cudnnGetConvolutionBackwardDataAlgorithm(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
_handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(input),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
......@@ -193,7 +194,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
gpudata *workspace;
err = cudnnGetConvolutionBackwardDataWorkspaceSize(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
_handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(input), algo, &worksize);
if (err != CUDNN_STATUS_SUCCESS) {
......@@ -218,7 +219,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cuda_wait((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnConvolutionBackwardData(
APPLY_SPECIFIC(_handle),
_handle,
alpha_p,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
......
......@@ -25,11 +25,12 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
PyGpuArrayObject *km,
cudnnConvolutionDescriptor_t desc,
double alpha, double beta, PyGpuArrayObject **kerns,
PyGpuContextObject *c) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
float af = alpha, bf = beta;
cudnnHandle_t _handle) {
PyGpuContextObject *c = input->context;
void *alpha_p;
void *beta_p;
float af = alpha, bf = beta;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1]) {
PyErr_SetString(PyExc_ValueError,
......@@ -93,7 +94,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cudnnConvolutionBwdFilterAlgoPerf_t choice;
err = cudnnFindConvolutionBackwardFilterAlgorithm(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(kerns), 1, &count, &choice);
if (err != CUDNN_STATUS_SUCCESS) {
......@@ -117,7 +118,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
}
err = cudnnGetConvolutionBackwardFilterAlgorithm(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(kerns),
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
......@@ -181,7 +182,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
gpudata *workspace;
err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(kerns), algo, &worksize);
if (err != CUDNN_STATUS_SUCCESS) {
......@@ -205,7 +206,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnConvolutionBackwardFilter(
APPLY_SPECIFIC(_handle),
_handle,
alpha_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
......
......@@ -42,9 +42,10 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
PyArrayObject *stride,
PyArrayObject *pad,
PyGpuArrayObject **out,
PyGpuContextObject *c) {
cudnnStatus_t err;
cudnnHandle_t _handle) {
PyGpuContextObject *c = img->context;
size_t dims[5];
cudnnStatus_t err;
if (!GpuArray_IS_C_CONTIGUOUS(&img->ga)) {
PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
......@@ -122,7 +123,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
cuda_wait((*out)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnPoolingForward(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(pool),
_handle, APPLY_SPECIFIC(pool),
alpha,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(img),
beta,
......
......@@ -64,7 +64,8 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
PyArrayObject *stride,
PyArrayObject *pad,
PyGpuArrayObject **inp_grad,
PyGpuContextObject *c) {
cudnnHandle_t _handle) {
PyGpuContextObject *c = inp->context;
cudnnStatus_t err;
if (!GpuArray_IS_C_CONTIGUOUS(&inp->ga)) {
......@@ -153,7 +154,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
cuda_wait((*inp_grad)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnPoolingBackward(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(pool),
_handle, APPLY_SPECIFIC(pool),
alpha,
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(out),
APPLY_SPECIFIC(output_grad), PyGpuArray_DEV_DATA(out_grad),
......
#section support_code
/* Build and configure a cuDNN RNN descriptor.
 *
 * hidden_size, num_layers : RNN geometry.
 * ddesc                   : dropout descriptor applied between layers.
 * input_mode, direction_mode, rnn_mode : raw cuDNN enum values.
 * dtype                   : gpuarray typecode of the data.
 * odesc                   : out; receives the descriptor on success.
 * _handle                 : unused here, kept for interface uniformity.
 *
 * Returns 0 on success, -1 on failure with a Python exception set.
 */
int dnn_rnn_desc(int hidden_size, int num_layers,
                 cudnnDropoutDescriptor_t ddesc,
                 int input_mode, int direction_mode, int rnn_mode,
                 int dtype, cudnnRNNDescriptor_t *odesc,
                 cudnnHandle_t _handle) {
  cudnnDataType_t data_type;
  cudnnRNNDescriptor_t desc;
  cudnnStatus_t err;

  /* Map the gpuarray typecode onto the cuDNN data type enum. */
  if (dtype == GA_FLOAT) {
    data_type = CUDNN_DATA_FLOAT;
  } else if (dtype == GA_DOUBLE) {
    data_type = CUDNN_DATA_DOUBLE;
  } else if (dtype == GA_HALF) {
    data_type = CUDNN_DATA_HALF;
  } else {
    PyErr_SetString(PyExc_ValueError, "Unsupported data type");
    return -1;
  }

  err = cudnnCreateRNNDescriptor(&desc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_SetString(PyExc_RuntimeError, "Can't create RNN descriptor");
    return -1;
  }

  err = cudnnSetRNNDescriptor(desc, hidden_size, num_layers, ddesc,
                              (cudnnRNNInputMode_t)input_mode,
                              (cudnnDirectionMode_t)direction_mode,
                              (cudnnRNNMode_t)rnn_mode, data_type);
  if (err != CUDNN_STATUS_SUCCESS) {
    /* Don't leak the descriptor if configuration fails. */
    cudnnDestroyRNNDescriptor(desc);
    PyErr_SetString(PyExc_RuntimeError, "Can't set RNN descriptor");
    return -1;
  }

  *odesc = desc;
  return 0;
}
#section support_code
/* Run the cuDNN RNN forward pass in training mode.
 *
 * desc       : configured RNN descriptor.
 * w          : flat weight buffer.
 * x          : input, laid out (seqLength, miniBatch, inputSize).
 * hx         : initial hidden state, 3-d; last dim is hidden*directions.
 * cx         : initial cell state, or NULL for non-LSTM modes.
 * reserve    : out; reserve buffer kept alive for the backward pass
 *              (ownership goes to the caller on success).
 * y, hy, cy  : out; output sequence, final hidden and (if cx) cell state.
 * _handle    : cuDNN handle.
 *
 * Returns 0 on success, -1 on failure with a Python exception set.
 * NOTE(review): on failure after a successful reserve allocation,
 * *reserve is left for the caller to release — confirm against callers.
 */
int dnn_rnn_fwd(cudnnRNNDescriptor_t desc,
                PyGpuArrayObject *w, PyGpuArrayObject *x,
                PyGpuArrayObject *hx, PyGpuArrayObject *cx,
                gpudata **reserve, PyGpuArrayObject **y,
                PyGpuArrayObject **hy, PyGpuArrayObject **cy,
                cudnnHandle_t _handle) {
  PyGpuContextObject *c = x->context;
  cudnnTensorDescriptor_t xdesc = NULL;
  cudnnTensorDescriptor_t hxdesc = NULL;
  cudnnTensorDescriptor_t cxdesc = NULL;
  cudnnTensorDescriptor_t ydesc = NULL;
  cudnnTensorDescriptor_t hydesc = NULL;
  cudnnTensorDescriptor_t cydesc = NULL;
  cudnnFilterDescriptor_t wdesc = NULL;
  cudnnTensorDescriptor_t *xl = NULL;
  cudnnTensorDescriptor_t *yl = NULL;
  gpudata *workspace = NULL;
  size_t worksize, ressize;
  size_t seqLength = PyGpuArray_DIM(x, 0);
  size_t miniBatch = PyGpuArray_DIM(x, 1);
  size_t inputSize = PyGpuArray_DIM(x, 2);
  size_t hiddenSizeDir = PyGpuArray_DIM(hx, 2);
  size_t shape[3];
  int strs[3], dims[3];
  cudnnStatus_t err;
  cudnnDataType_t dt;
  int res = -1;

  /* Translate the gpuarray typecode into a cuDNN data type. */
  switch (x->ga.typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Unsupported data type for x");
    return -1;
  }

  // This is early to match the exit() in the fail label.
  cuda_enter(c->ctx);

  err = cudnnCreateTensorDescriptor(&xdesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create xdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  /* Per-timestep descriptor: (batch, inputSize, 1), C-contiguous. */
  dims[0] = PyGpuArray_DIM(x, 1);
  dims[1] = PyGpuArray_DIM(x, 2);
  dims[2] = 1;

  strs[0] = dims[1] * dims[2];
  strs[1] = dims[2];
  strs[2] = 1;

  err = cudnnSetTensorNdDescriptor(xdesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set xdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  if (c_make_tensorNd(hx, &hxdesc) != 0)
    goto fail;

  /* cx (and its descriptor) is optional: only LSTM modes use it. */
  if (cx != NULL)
    if (c_make_tensorNd(cx, &cxdesc) != 0)
      goto fail;

  if (c_make_filter(w, &wdesc) != 0)
    goto fail;

  shape[0] = seqLength;
  shape[1] = miniBatch;
  shape[2] = hiddenSizeDir;
  if (theano_prep_output(y, 3, shape, x->ga.typecode, GA_C_ORDER, c) != 0)
    goto fail;

  err = cudnnCreateTensorDescriptor(&ydesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  /* Per-timestep output descriptor: (batch, hidden*dirs, 1). */
  dims[0] = shape[1];
  dims[1] = shape[2];
  dims[2] = 1;

  strs[0] = dims[2] * dims[1];
  strs[1] = dims[2];
  strs[2] = 1;

  err = cudnnSetTensorNdDescriptor(ydesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  if (theano_prep_output(hy, 3, PyGpuArray_DIMS(hx),
                         hx->ga.typecode, GA_C_ORDER, c) != 0)
    goto fail;

  if (c_make_tensorNd(*hy, &hydesc) != 0)
    goto fail;

  if (cy != NULL) {
    if (theano_prep_output(cy, 3, PyGpuArray_DIMS(cx),
                           cx->ga.typecode, GA_C_ORDER, c) != 0)
      goto fail;

    if (c_make_tensorNd(*cy, &cydesc) != 0)
      goto fail;
  }

  /* cuDNN wants one descriptor per timestep; every timestep has the
     same layout, so the array just repeats the same descriptor. */
  xl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), seqLength);
  if (xl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }

  for (size_t i = 0; i < seqLength; i++)
    xl[i] = xdesc;

  yl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), seqLength);
  if (yl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }

  for (size_t i = 0; i < seqLength; i++)
    yl[i] = ydesc;

  err = cudnnGetRNNWorkspaceSize(_handle, desc, (int)seqLength,
                                 xl, &worksize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not get worksize: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
  if (workspace == NULL) {
    PyErr_Format(PyExc_RuntimeError, "Could not allocate workspace");
    goto fail;
  }

  err = cudnnGetRNNTrainingReserveSize(_handle, desc, (int)seqLength,
                                       xl, &ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not get reserve size: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  /* The reserve outlives this call: the backward pass needs it. */
  *reserve = gpudata_alloc(c->ctx, ressize, NULL, 0, NULL);
  if (*reserve == NULL) {
    PyErr_Format(PyExc_RuntimeError, "Could not allocate reserve");
    goto fail;
  }

  err = cudnnRNNForwardTraining(_handle, desc, (int)seqLength,
                                xl, PyGpuArray_DEV_DATA(x),
                                hxdesc, PyGpuArray_DEV_DATA(hx),
                                cxdesc, cx ? PyGpuArray_DEV_DATA(cx) : NULL,
                                wdesc, PyGpuArray_DEV_DATA(w),
                                yl, PyGpuArray_DEV_DATA(*y),
                                hydesc, PyGpuArray_DEV_DATA(*hy),
                                cydesc, cy ? PyGpuArray_DEV_DATA(*cy) : NULL,
                                *(void **)workspace, worksize,
                                *(void **)(*reserve), ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could run RNN: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  res = 0;

 fail:
  /* Shared cleanup for success and failure paths; every descriptor is
     either NULL or valid at this point. */
  if (xdesc != NULL)
    cudnnDestroyTensorDescriptor(xdesc);
  if (hxdesc != NULL)
    cudnnDestroyTensorDescriptor(hxdesc);
  if (cxdesc != NULL)
    cudnnDestroyTensorDescriptor(cxdesc);
  if (wdesc != NULL)
    cudnnDestroyFilterDescriptor(wdesc);
  if (ydesc != NULL)
    cudnnDestroyTensorDescriptor(ydesc);
  if (hydesc != NULL)
    cudnnDestroyTensorDescriptor(hydesc);
  if (cydesc != NULL)
    cudnnDestroyTensorDescriptor(cydesc);
  free(xl);
  free(yl);
  if (workspace != NULL)
    gpudata_release(workspace);
  cuda_exit(c->ctx);
  return res;
}
#section support_code
/* cuDNN RNN backward-data pass: gradients wrt inputs and initial states.
 *
 * desc     : configured RNN descriptor (must match the forward pass).
 * xshp     : the input's innermost size (inputSize) of the forward x.
 * y, dy    : forward output and its gradient (same shape).
 * w        : flat weight buffer.
 * hx, cx   : initial hidden / (optional) cell states from the forward.
 * reserve  : reserve buffer produced by the forward pass (read-only here;
 *            a private copy is made into *oreserve).
 * dhy, dcy : optional gradients wrt the final hidden / cell states.
 * oreserve : out; the copied reserve, updated by cuDNN, for the weight
 *            gradient pass.
 * dx, dhx, dcx : out; gradients wrt x, hx and (if cx) cx.
 * _handle  : cuDNN handle.
 *
 * Returns 0 on success, -1 on failure with a Python exception set.
 */
int dnn_rnn_gi(cudnnRNNDescriptor_t desc, npy_uint64 xshp,
               PyGpuArrayObject *y, PyGpuArrayObject *dy,
               PyGpuArrayObject *w, PyGpuArrayObject *hx,
               gpudata *reserve, PyGpuArrayObject *cx,
               PyGpuArrayObject *dhy, PyGpuArrayObject *dcy,
               gpudata **oreserve, PyGpuArrayObject **dx,
               PyGpuArrayObject **dhx, PyGpuArrayObject **dcx,
               cudnnHandle_t _handle) {
  PyGpuContextObject *c = y->context;
  cudnnTensorDescriptor_t ydesc = NULL;
  cudnnTensorDescriptor_t dhydesc = NULL;
  cudnnTensorDescriptor_t dcydesc = NULL;
  cudnnFilterDescriptor_t wdesc = NULL;
  cudnnTensorDescriptor_t hxdesc = NULL;
  cudnnTensorDescriptor_t cxdesc = NULL;
  cudnnTensorDescriptor_t dxdesc = NULL;
  cudnnTensorDescriptor_t dhxdesc = NULL;
  cudnnTensorDescriptor_t dcxdesc = NULL;
  cudnnTensorDescriptor_t *yl = NULL;
  cudnnTensorDescriptor_t *dxl = NULL;
  gpudata *workspace = NULL;
  size_t worksize, ressize;
  size_t seqLength = PyGpuArray_DIM(y, 0);
  size_t miniBatch = PyGpuArray_DIM(y, 1);
  size_t inputSize = xshp;
  size_t shape[3];
  int dims[3], strs[3];
  cudnnStatus_t err;
  cudnnDataType_t dt;
  int res = -1;

  /* Translate the gpuarray typecode into a cuDNN data type. */
  switch (y->ga.typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Unsupported data type for y");
    return -1;
  }

  cuda_enter(c->ctx);

  err = cudnnCreateTensorDescriptor(&ydesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  /* We need to use the last two dimensions for this, this is not a typo */
  dims[0] = PyGpuArray_DIM(y, 1);
  dims[1] = PyGpuArray_DIM(y, 2);
  dims[2] = 1;

  strs[0] = dims[2] * dims[1];
  strs[1] = dims[2];
  strs[2] = 1;

  err = cudnnSetTensorNdDescriptor(ydesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  /* Final-state gradients are optional; NULL descriptors mean "absent". */
  if (dhy != NULL)
    if (c_make_tensorNd(dhy, &dhydesc) != 0)
      goto fail;

  if (dcy != NULL)
    if (c_make_tensorNd(dcy, &dcydesc) != 0)
      goto fail;

  if (c_make_filter(w, &wdesc) != 0)
    goto fail;

  if (c_make_tensorNd(hx, &hxdesc) != 0)
    goto fail;

  if (cx != NULL)
    if (c_make_tensorNd(cx, &cxdesc) != 0)
      goto fail;

  /* dx has the shape of the forward input: (seq, batch, inputSize). */
  shape[0] = seqLength;
  shape[1] = miniBatch;
  shape[2] = inputSize;
  if (theano_prep_output(dx, 3, shape, y->ga.typecode, GA_C_ORDER, c) != 0)
    goto fail;

  err = cudnnCreateTensorDescriptor(&dxdesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create dxdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  /* Again not a typo, we need to use the last two dimensions */
  dims[0] = shape[1];
  dims[1] = shape[2];
  dims[2] = 1;

  strs[0] = dims[2] * dims[1];
  strs[1] = dims[2];
  strs[2] = 1;

  err = cudnnSetTensorNdDescriptor(dxdesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set dxdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  if (theano_prep_output(dhx, 3, PyGpuArray_DIMS(hx), hx->ga.typecode,
                         GA_C_ORDER, c) != 0)
    goto fail;

  if (c_make_tensorNd(*dhx, &dhxdesc) != 0)
    goto fail;

  if (cx != NULL) {
    if (theano_prep_output(dcx, 3, PyGpuArray_DIMS(cx), cx->ga.typecode,
                           GA_C_ORDER, c) != 0)
      goto fail;

    if (c_make_tensorNd(*dcx, &dcxdesc) != 0)
      goto fail;
  }

  /* One descriptor per timestep; all timesteps share the same layout. */
  yl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), seqLength);
  if (yl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }

  for (size_t i = 0; i < seqLength; i++)
    yl[i] = ydesc;

  dxl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), seqLength);
  if (dxl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }

  for (size_t i = 0; i < seqLength; i++)
    dxl[i] = dxdesc;

  err = cudnnGetRNNWorkspaceSize(_handle, desc, (int)seqLength, dxl, &worksize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not get worksize: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
  if (workspace == NULL) {
    PyErr_Format(PyExc_RuntimeError, "Could not allocate workspace");
    goto fail;
  }

  err = cudnnGetRNNTrainingReserveSize(_handle, desc, (int)seqLength,
                                       dxl, &ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not get reserve size: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  /* cuDNN mutates the reserve during backward-data, so work on a copy
     and hand that copy to the caller for the weight-gradient pass. */
  *oreserve = gpudata_alloc(c->ctx, ressize, NULL, 0, NULL);
  if (*oreserve == NULL) {
    PyErr_Format(PyExc_RuntimeError, "Could not allocate reserve");
    goto fail;
  }

  if (gpudata_move(*oreserve, 0, reserve, 0, ressize) != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "could not copy reserve");
    goto fail;
  }

  err = cudnnRNNBackwardData(_handle, desc, (int)seqLength,
                             yl, PyGpuArray_DEV_DATA(y),
                             /* y and dy are the same shape */
                             yl, PyGpuArray_DEV_DATA(dy),
                             dhydesc, dhy ? PyGpuArray_DEV_DATA(dhy) : NULL,
                             dcydesc, dcy ? PyGpuArray_DEV_DATA(dcy) : NULL,
                             wdesc, PyGpuArray_DEV_DATA(w),
                             hxdesc, PyGpuArray_DEV_DATA(hx),
                             cxdesc, cx ? PyGpuArray_DEV_DATA(cx) : NULL,
                             dxl, PyGpuArray_DEV_DATA(*dx),
                             dhxdesc, PyGpuArray_DEV_DATA(*dhx),
                             dcxdesc, dcx ? PyGpuArray_DEV_DATA(*dcx) : NULL,
                             *(void **)workspace, worksize,
                             *(void **)(*oreserve), ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could run RNN grad inputs: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  res = 0;

 fail:
  /* Shared cleanup; every descriptor is either NULL or valid here. */
  if (ydesc != NULL)
    cudnnDestroyTensorDescriptor(ydesc);
  if (dhydesc != NULL)
    cudnnDestroyTensorDescriptor(dhydesc);
  if (dcydesc != NULL)
    cudnnDestroyTensorDescriptor(dcydesc);
  if (wdesc != NULL)
    cudnnDestroyFilterDescriptor(wdesc);
  if (hxdesc != NULL)
    cudnnDestroyTensorDescriptor(hxdesc);
  if (cxdesc != NULL)
    cudnnDestroyTensorDescriptor(cxdesc);
  if (dxdesc != NULL)
    cudnnDestroyTensorDescriptor(dxdesc);
  if (dhxdesc != NULL)
    cudnnDestroyTensorDescriptor(dhxdesc);
  if (dcxdesc != NULL)
    cudnnDestroyTensorDescriptor(dcxdesc);
  free(yl);
  free(dxl);
  if (workspace != NULL)
    gpudata_release(workspace);
  cuda_exit(c->ctx);
  return res;
}
#section support_code
/* cuDNN RNN backward-weights pass: gradient wrt the flat weight buffer.
 *
 * desc    : configured RNN descriptor (must match the forward pass).
 * _wsize  : number of elements of the flat weight buffer.
 * x, hx   : forward input and initial hidden state.
 * y       : forward output.
 * reserve : reserve buffer as updated by the backward-data pass.
 * dw      : out; zero-initialized then accumulated into by cuDNN.
 * _handle : cuDNN handle.
 *
 * Returns 0 on success, -1 on failure with a Python exception set.
 */
int dnn_rnn_gw(cudnnRNNDescriptor_t desc, npy_uint64 _wsize,
               PyGpuArrayObject *x, PyGpuArrayObject *hx,
               PyGpuArrayObject *y, gpudata *reserve,
               PyGpuArrayObject **dw, cudnnHandle_t _handle) {
  PyGpuContextObject *c = x->context;
  cudnnTensorDescriptor_t xdesc = NULL;
  cudnnTensorDescriptor_t hxdesc = NULL;
  cudnnTensorDescriptor_t ydesc = NULL;
  cudnnFilterDescriptor_t dwdesc = NULL;
  cudnnTensorDescriptor_t *xl = NULL;
  cudnnTensorDescriptor_t *yl = NULL;
  gpudata *workspace = NULL;
  size_t worksize, ressize;
  size_t iters = PyGpuArray_DIM(x, 0);
  size_t wsize = _wsize;
  int dims[3], strs[3];
  cudnnStatus_t err;
  cudnnDataType_t dt;
  int res = -1;

  /* Translate the gpuarray typecode into a cuDNN data type. */
  switch (x->ga.typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Unsupported data type for x");
    return -1;
  }

  // This is early to match the exit() in the fail label.
  cuda_enter(c->ctx);

  err = cudnnCreateTensorDescriptor(&xdesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create xdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  /* We need to use the last two dimensions for this, this is not a typo */
  dims[0] = PyGpuArray_DIM(x, 1);
  dims[1] = PyGpuArray_DIM(x, 2);
  dims[2] = 1;

  strs[0] = dims[2] * dims[1];
  strs[1] = dims[2];
  strs[2] = 1;

  err = cudnnSetTensorNdDescriptor(xdesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set xdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  if (c_make_tensorNd(hx, &hxdesc) != 0)
    goto fail;

  err = cudnnCreateTensorDescriptor(&ydesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  /* Again not a typo, we need to use the last two dimensions */
  dims[0] = PyGpuArray_DIM(y, 1);
  dims[1] = PyGpuArray_DIM(y, 2);
  dims[2] = 1;

  strs[0] = dims[2] * dims[1];
  strs[1] = dims[2];
  strs[2] = 1;

  err = cudnnSetTensorNdDescriptor(ydesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  if (theano_prep_output(dw, 1, &wsize, x->ga.typecode, GA_C_ORDER, c) != 0)
    goto fail;

  /* cuDNN accumulates into dw, so it must start at zero. */
  GpuArray_memset(&(*dw)->ga, 0);

  if (c_make_filter(*dw, &dwdesc) != 0)
    goto fail;

  /* One descriptor per timestep; all timesteps share the same layout. */
  xl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), iters);
  if (xl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }

  for (size_t i = 0; i < iters; i++)
    xl[i] = xdesc;

  yl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), iters);
  if (yl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }

  for (size_t i = 0; i < iters; i++)
    yl[i] = ydesc;

  err = cudnnGetRNNWorkspaceSize(_handle, desc, (int)iters,
                                 xl, &worksize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not get worksize: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
  if (workspace == NULL) {
    PyErr_Format(PyExc_RuntimeError, "Could not allocate workspace");
    goto fail;
  }

  /* Only the size is queried here; the reserve itself comes from the
     backward-data pass via the `reserve` argument. */
  err = cudnnGetRNNTrainingReserveSize(_handle, desc, (int)iters,
                                       xl, &ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not get reserve size: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  err = cudnnRNNBackwardWeights(_handle, desc, (int)iters,
                                xl, PyGpuArray_DEV_DATA(x),
                                hxdesc, PyGpuArray_DEV_DATA(hx),
                                yl, PyGpuArray_DEV_DATA(y),
                                *(void **)workspace, worksize,
                                dwdesc, PyGpuArray_DEV_DATA(*dw),
                                *(void **)reserve, ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could run RNN grad weights: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  res = 0;

 fail:
  /* Shared cleanup; every descriptor is either NULL or valid here. */
  if (xdesc != NULL)
    cudnnDestroyTensorDescriptor(xdesc);
  if (hxdesc != NULL)
    cudnnDestroyTensorDescriptor(hxdesc);
  if (ydesc != NULL)
    cudnnDestroyTensorDescriptor(ydesc);
  if (dwdesc != NULL)
    cudnnDestroyFilterDescriptor(dwdesc);
  free(xl);
  free(yl);
  if (workspace != NULL)
    gpudata_release(workspace);
  cuda_exit(c->ctx);
  return res;
}
#section support_code
/* Query the size of the RNN parameter (weight) buffer.
 *
 * desc        : configured RNN descriptor.
 * isize       : length-2 array holding the per-timestep input shape
 *               (batch, inputSize) as npy_uint64 values.
 * typecode    : gpuarray typecode of the data.
 * oparam_size : out; receives the size reported by cuDNN (in bytes,
 *               per cudnnGetRNNParamsSize).
 * _handle     : cuDNN handle.
 *
 * Returns 0 on success, -1 on failure with a Python exception set.
 *
 * Fix: the temporary tensor descriptor was leaked on the
 * cudnnSetTensorNdDescriptor and cudnnGetRNNParamsSize error paths; it
 * is now destroyed on every path after creation.
 */
int dnn_rnn_paramsize(cudnnRNNDescriptor_t desc,
                      PyArrayObject *isize,
                      npy_int32 typecode,
                      npy_uint64 *oparam_size,
                      cudnnHandle_t _handle) {
  cudnnTensorDescriptor_t xdesc;
  size_t param_size;
  cudnnStatus_t err;
  cudnnDataType_t dt;
  int shape[3];
  int strides[3];

  if (PyArray_DIM(isize, 0) != 2) {
    PyErr_SetString(PyExc_ValueError, "input_size should be of length two");
    return -1;
  }

  /* Translate the gpuarray typecode into a cuDNN data type. */
  switch (typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_ValueError, "Unsupported data type");
    return -1;
  }

  err = cudnnCreateTensorDescriptor(&xdesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_SetString(PyExc_RuntimeError, "Could not create tensor descriptor");
    return -1;
  }

  /* Describe one input timestep: (batch, inputSize, 1), C-contiguous. */
  shape[0] = *(npy_uint64 *)PyArray_GETPTR1(isize, 0);
  shape[1] = *(npy_uint64 *)PyArray_GETPTR1(isize, 1);
  shape[2] = 1;
  strides[0] = shape[2] * shape[1];
  strides[1] = shape[2];
  strides[2] = 1;

  err = cudnnSetTensorNdDescriptor(xdesc, dt, 3, shape, strides);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "Could not set tensor descriptor: %s",
                 cudnnGetErrorString(err));
    cudnnDestroyTensorDescriptor(xdesc);
    return -1;
  }

  err = cudnnGetRNNParamsSize(_handle, desc, xdesc, &param_size, dt);
  /* The descriptor is only needed for the query above. */
  cudnnDestroyTensorDescriptor(xdesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_SetString(PyExc_RuntimeError, "Could not get parameter size");
    return -1;
  }

  *oparam_size = param_size;
  return 0;
}
......@@ -35,7 +35,8 @@ if (APPLY_SPECIFIC(output) != NULL)
int APPLY_SPECIFIC(softmax)(PyGpuArrayObject *x,
PyGpuArrayObject **out,
PyGpuContextObject *c) {
cudnnHandle_t _handle) {
PyGpuContextObject *c = x->context;
cudnnStatus_t err;
if (c_set_tensorNd(x, APPLY_SPECIFIC(input)) != 0)
......@@ -77,7 +78,7 @@ int APPLY_SPECIFIC(softmax)(PyGpuArrayObject *x,
cuda_wait((*out)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnSoftmaxForward(
APPLY_SPECIFIC(_handle),
_handle,
SOFTMAX_ALGO,
SOFTMAX_MODE,
alpha,
......
......@@ -46,7 +46,8 @@ if (APPLY_SPECIFIC(dx) != NULL)
int APPLY_SPECIFIC(softmax_grad)(PyGpuArrayObject *dy,
PyGpuArrayObject *sm,
PyGpuArrayObject **dx,
PyGpuContextObject *c) {
cudnnHandle_t _handle) {
PyGpuContextObject *c = dy->context;
cudnnStatus_t err;
if (c_set_tensorNd(dy, APPLY_SPECIFIC(dy)) != 0)
......@@ -91,7 +92,7 @@ int APPLY_SPECIFIC(softmax_grad)(PyGpuArrayObject *dy,
cuda_wait((*dx)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnSoftmaxBackward(
APPLY_SPECIFIC(_handle),
_handle,
SOFTMAX_ALGO,
SOFTMAX_MODE,
alpha,
......
from __future__ import absolute_import, print_function, division
import theano
import theano.tensor as T
import numpy
class Model(object):
    """Container that chains layers and collects their trainable state.

    Tracks every registered layer, a flat list of all layer parameters,
    and any extra (non-gradient) updates the layers request.
    """

    def __init__(self, name=""):
        self.name = name
        self.layers = []
        self.params = []
        self.other_updates = {}

    def add_layer(self, layer):
        """Register ``layer`` and absorb its params and optional updates."""
        self.layers.append(layer)
        self.params.extend(layer.params)
        if hasattr(layer, 'other_updates'):
            for pair in layer.other_updates:
                self.other_updates[pair[0]] = pair[1]

    def get_params(self):
        """Return the flat list of parameters across all layers."""
        return self.params
def uniform(stdev, size):
    """Sample a uniform array with the given standard deviation and shape.

    A U(-b, b) distribution has stdev b / sqrt(3), so the bound is
    stdev * sqrt(3).  The result is cast to Theano's configured floatX.
    """
    bound = stdev * numpy.sqrt(3)
    samples = numpy.random.uniform(low=-bound, high=bound, size=size)
    return samples.astype(theano.config.floatX)
def linear_transform_weights(input_dim, output_dim,
                             param_list=None, name=""):
    """Create a theano shared weight matrix of shape (input_dim, output_dim).

    The matrix is uniformly initialized with stdev sqrt(2 / input_dim)
    and appended to ``param_list`` so the caller's optimizer can find it.

    Raises ValueError if ``param_list`` is None.  (Previously this was a
    bare ``assert``, which is stripped under ``python -O``; the check is
    now explicit and happens before any RNG state is consumed.)
    """
    if param_list is None:
        raise ValueError("param_list must be provided to collect the weight")
    weight_initialization = uniform(numpy.sqrt(2.0 / input_dim),
                                    (input_dim, output_dim))
    W = theano.shared(weight_initialization, name=name)
    param_list.append(W)
    return W
def bias_weights(length, param_list=None, name=""):
    """Create a zero-initialized theano shared bias of the given length.

    If ``param_list`` is provided, the new shared variable is appended
    to it so the caller's optimizer can find it.
    """
    zeros = numpy.zeros(length).astype(theano.config.floatX)
    bias = theano.shared(zeros, name=name)
    if param_list is not None:
        param_list.append(bias)
    return bias
class Layer(object):
    """Generic layer template which all layers should inherit.

    Tracks a display name and the list of trainable parameters.
    """

    def __init__(self, name=""):
        self.name = name
        self.params = []

    def get_params(self):
        """Return this layer's list of trainable parameters."""
        return self.params
class GRU(Layer):
    """Gated Recurrent Unit layer.

    Builds the GRU weight matrices (input-to-hidden ``W_*``,
    hidden-to-hidden ``R_*`` and their biases) and scans over the input
    layer's output to produce the hidden-state sequence, available via
    ``output()``.

    NOTE(review): the weight matrices must be created in this exact
    order — each call to ``linear_transform_weights``/``bias_weights``
    consumes numpy RNG state, so reordering would change the
    initialization.
    """
    def __init__(self, input_dim, output_dim, input_layer, s0=None, name=""):
        """Store layer geometry, build weights and unroll the recurrence.

        input_dim   : size of each input vector.
        output_dim  : hidden (and output) size.
        input_layer : upstream layer; its ``output()`` is the input sequence.
        s0          : initial hidden state passed to ``theano.scan``.
        name        : prefix used for the shared variables' names.
        """
        self.name = name
        self.input_dim = input_dim
        self.hidden_dim = output_dim
        self.output_dim = output_dim
        self.input_layer = input_layer
        self.X = input_layer.output()
        self.s0 = s0
        self.params = []
        '''Layers weights'''
        '''self.params is passed so that any paramters could be appended to it'''
        self.W_r = linear_transform_weights(input_dim, output_dim, param_list=self.params, name=name + ".W_r")
        self.b_wr = bias_weights((output_dim,), param_list=self.params, name=name + ".b_wr")

        self.W_i = linear_transform_weights(input_dim, output_dim, param_list=self.params, name=name + ".W_i")
        self.b_wi = bias_weights((output_dim,), param_list=self.params, name=name + ".b_wi")

        self.W_h = linear_transform_weights(input_dim, output_dim, param_list=self.params, name=name + ".W_h")
        self.b_wh = bias_weights((output_dim,), param_list=self.params, name=name + ".b_wh")

        self.R_r = linear_transform_weights(output_dim, output_dim, param_list=self.params, name=name + ".R_r")
        self.b_rr = bias_weights((output_dim,), param_list=self.params, name=name + ".b_rr")

        self.R_i = linear_transform_weights(output_dim, output_dim, param_list=self.params, name=name + ".R_i")
        self.b_ru = bias_weights((output_dim,), param_list=self.params, name=name + ".b_ru")

        self.R_h = linear_transform_weights(output_dim, output_dim, param_list=self.params, name=name + ".R_h")
        self.b_rh = bias_weights((output_dim,), param_list=self.params, name=name + ".b_rh")

        '''step through processed input to create output'''
        def step(inp, s_prev):
            # Update gate (i_t) and reset gate (r_t).
            i_t = T.nnet.sigmoid(
                T.dot(inp, self.W_i) + T.dot(s_prev, self.R_i) + self.b_wi + self.b_ru)
            r_t = T.nnet.sigmoid(
                T.dot(inp, self.W_r) + T.dot(s_prev, self.R_r) + self.b_wr + self.b_rr)
            # Candidate state; the reset gate modulates the recurrent term.
            h_hat_t = T.tanh(
                T.dot(inp, self.W_h) + (r_t * (T.dot(s_prev, self.R_h) + self.b_rh)) + self.b_wh)
            # Blend candidate and previous state with the update gate.
            s_curr = ((1.0 - i_t) * h_hat_t) + (i_t * s_prev)
            return s_curr

        outputs_info = self.s0

        states, updates = theano.scan(
            fn=step,
            sequences=[self.X],
            outputs_info=outputs_info
        )

        self.Y = states

    def output(self):
        """Return the sequence of hidden states produced by the scan."""
        return self.Y
class LSTM(Layer):
    """Single-layer LSTM implemented with ``theano.scan``.

    Weights are created in a fixed order (input-to-hidden W_* / b_w* for
    gates i, f, c, o, then recurrent R_* / b_r* in the same gate order)
    because consumers index ``self.params`` positionally.
    """

    def __init__(self, input_dim, output_dim, input_layer, s0=None, c0=None,
                 name=""):
        # Layer bookkeeping.
        self.name = name
        self.input_dim = input_dim
        self.hidden_dim = output_dim
        self.output_dim = output_dim
        self.input_layer = input_layer
        self.X = input_layer.output()
        self.s0 = s0
        self.c0 = c0
        # self.params is handed to the weight helpers so every created
        # parameter is appended to it.
        self.params = []

        # Input-to-hidden weight matrix and bias, one pair per gate.
        for gate in ('i', 'f', 'c', 'o'):
            setattr(self, 'W_' + gate,
                    linear_transform_weights(input_dim, output_dim,
                                             param_list=self.params,
                                             name=name + ".W_" + gate))
            setattr(self, 'b_w' + gate,
                    bias_weights((output_dim,), param_list=self.params,
                                 name=name + ".b_w" + gate))
        # Hidden-to-hidden (recurrent) weight matrix and bias, same order.
        for gate in ('i', 'f', 'c', 'o'):
            setattr(self, 'R_' + gate,
                    linear_transform_weights(output_dim, output_dim,
                                             param_list=self.params,
                                             name=name + ".R_" + gate))
            setattr(self, 'b_r' + gate,
                    bias_weights((output_dim,), param_list=self.params,
                                 name=name + ".b_r" + gate))

        def step(x, h_prev, c_prev):
            # One timestep of the standard LSTM recurrence.
            i_t = T.nnet.sigmoid(
                T.dot(x, self.W_i) + T.dot(h_prev, self.R_i) + self.b_wi + self.b_ri)
            f_t = T.nnet.sigmoid(
                T.dot(x, self.W_f) + T.dot(h_prev, self.R_f) + self.b_wf + self.b_rf)
            o_t = T.nnet.sigmoid(
                T.dot(x, self.W_o) + T.dot(h_prev, self.R_o) + self.b_ro + self.b_wo)
            c_hat_t = T.tanh(
                T.dot(x, self.W_c) + T.dot(h_prev, self.R_c) + self.b_wc + self.b_rc)
            c_t = f_t * c_prev + i_t * c_hat_t
            h_t = o_t * T.tanh(c_t)
            return h_t, c_t

        # Walk over the timesteps of the input sequence.
        states, updates = theano.scan(
            fn=step,
            sequences=[self.X],
            outputs_info=[self.s0, self.c0]
        )
        # Hidden-state and cell-state sequences, respectively.
        self.Y, self.C = states

    def output(self):
        # The layer's output is the hidden-state sequence.
        return self.Y
class FC(Layer):
    """Fully connected (affine) layer: output is X.W + b."""

    def __init__(self, input_dim, output_dim, input_layer, name=""):
        self.input_layer = input_layer
        self.name = name
        # Filled in by the weight helpers below.
        self.params = []
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.X = self.input_layer.output()
        self.W = linear_transform_weights(input_dim, output_dim,
                                          param_list=self.params,
                                          name=name + ".W")
        self.b = bias_weights((output_dim,), param_list=self.params,
                              name=name + ".b")

    def output(self):
        # Affine transform of the upstream layer's output.
        projected = T.dot(self.X, self.W)
        return projected + self.b
class WrapperLayer(Layer):
    """Adapter exposing a raw symbolic variable through the Layer API."""

    def __init__(self, X, name=""):
        self.name = name
        self.X = X
        # A wrapper owns no trainable parameters.
        self.params = []

    def output(self):
        return self.X
......@@ -19,6 +19,7 @@ from ..type import gpuarray_shared_constructor
from .config import mode_with_gpu, mode_without_gpu, test_ctx_name
from . import test_nnet
from .rnn_support import Model, GRU, LSTM, WrapperLayer
from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_FWD
......@@ -1434,3 +1435,249 @@ def test_batchnorm_inference():
utt.assert_allclose(outputs[4], outputs[4 + 5]) # dbias
utt.assert_allclose(outputs[5], outputs[5 + 5]) # dmean
utt.assert_allclose(outputs[6], outputs[6 + 5], rtol=2e-3, atol=4e-5) # dvar
def test_dnn_rnn_gru():
    """Compare cuDNN's fused GRU against a scan-based reference GRU.

    Builds a ``depth``-layer GRU twice -- once with ``dnn.RNNBlock`` and
    once with the pure-Theano ``GRU`` layer -- copies the reference weights
    into cuDNN's packed parameter buffer, then checks forward outputs and
    gradients (w.r.t. X, h0 and the parameters) for three cost wirings:
    through y only, through y and hy, and through hy only.
    """
    # test params
    input_dim = 32
    hidden_dim = 16
    batch_size = 2
    depth = 3
    timesteps = 5

    # test code
    X = T.tensor3('X')
    Y = T.tensor3('Y')
    h0 = T.tensor3('h0')

    rnnb = dnn.RNNBlock(theano.config.floatX, hidden_dim, depth, 'gru')
    psize = rnnb.get_param_size([batch_size, input_dim])
    params_cudnn = gpuarray_shared_constructor(
        numpy.zeros((psize,), dtype=theano.config.floatX))

    model = Model()
    last_layer = WrapperLayer(X)
    last_dim = input_dim
    for i in range(depth):
        gru = GRU(last_dim, hidden_dim, last_layer, s0=h0[i, :, :])
        model.add_layer(gru)
        last_layer = gru
        last_dim = hidden_dim
        # Mirror the reference layer's weights into the packed cuDNN buffer
        # so both implementations start from identical parameters.
        layer_params = gru.get_params()
        dnn_params = rnnb.split_params(params_cudnn, i,
                                       [batch_size, input_dim])
        for j, p in enumerate(dnn_params):
            p[:] = layer_params[j].get_value(borrow=True,
                                             return_internal_type=True)

    def funcs(out, params, hy=None):
        cost = 0
        # BUG FIX: compare against None explicitly. `out` and `hy` are
        # symbolic Theano variables, and truth-testing a Variable is
        # ambiguous (it relies on default object truthiness, and raises
        # TypeError in Theano versions where Variable.__bool__ raises).
        if out is not None:
            cost += T.mean((Y - out)**2)
        if hy is not None:
            cost += T.mean(hy**2)
        grad = T.grad(cost, [X, h0] + params)
        grad_fn = theano.function([X, Y, h0], grad, mode=mode_with_gpu,
                                  on_unused_input='ignore')
        return grad_fn

    ref_y = last_layer.output()
    # This will grab the hy from the scan implementation
    ref_hy = T.stack([model.layers[0].Y[-1],
                      model.layers[1].Y[-1],
                      model.layers[2].Y[-1]])

    y, hy = rnnb.apply(params_cudnn, X, h0)

    ref_fn = theano.function([X, h0], ref_y, mode=mode_with_gpu)
    cudnn_fn = theano.function([X, h0], y, mode=mode_with_gpu)

    # Test with grad connected to y
    ref_grad_fn = funcs(ref_y, model.get_params())
    cudnn_grad_fn = funcs(y, [params_cudnn])
    # Test with grad connected to both y and hy
    ref2_grad_fn = funcs(ref_y, model.get_params(), ref_hy)
    cudnn2_grad_fn = funcs(y, [params_cudnn], hy)
    # Test with grad connected to hy
    ref3_grad_fn = funcs(None, model.get_params(), ref_hy)
    cudnn3_grad_fn = funcs(None, [params_cudnn], hy)

    ref_grad_fns = [ref_grad_fn, ref2_grad_fn, ref3_grad_fn]
    cudnn_grad_fns = [cudnn_grad_fn, cudnn2_grad_fn, cudnn3_grad_fn]

    x_val = numpy.random.random((timesteps, batch_size, input_dim)).astype(theano.config.floatX)
    y_val = numpy.random.random((timesteps, batch_size, hidden_dim)).astype(theano.config.floatX)
    h0_val = numpy.random.random((depth, batch_size, hidden_dim)).astype(theano.config.floatX)

    ref_out = ref_fn(x_val, h0_val)
    cudnn_out = cudnn_fn(x_val, h0_val)
    utt.assert_allclose(ref_out, cudnn_out)

    for ref_grad_fn, cudnn_grad_fn in zip(ref_grad_fns, cudnn_grad_fns):
        ref_grads = ref_grad_fn(x_val, y_val, h0_val)
        cudnn_grads = cudnn_grad_fn(x_val, y_val, h0_val)
        # Gradients w.r.t. X and h0.
        utt.assert_allclose(ref_grads[0], cudnn_grads[0])
        utt.assert_allclose(ref_grads[1], cudnn_grads[1])
        # Parameter gradients: cuDNN returns one packed vector; split it
        # per layer and compare piece-by-piece with the reference.
        ref_grad_params = ref_grads[2:]
        cudnn_grad_params = gpuarray_shared_constructor(cudnn_grads[2])
        for i in range(depth):
            cudnn_grad_layer = rnnb.split_params(cudnn_grad_params, i,
                                                 [batch_size, input_dim])
            ref_grad_layer = ref_grad_params[i * len(cudnn_grad_layer):
                                             (i + 1) * len(cudnn_grad_layer)]
            for j, g in enumerate(cudnn_grad_layer):
                utt.assert_allclose(ref_grad_layer[j], g)
def test_dnn_rnn_lstm():
    """Check cuDNN's fused LSTM against a scan-based reference LSTM.

    Forward outputs and gradients (w.r.t. X, h0, c0 and the parameters)
    must match between the two implementations.
    """
    # Problem sizes kept tiny so the test stays fast.
    input_dim, hidden_dim = 32, 16
    batch_size, depth, timesteps = 2, 3, 5

    X = T.tensor3('X')
    Y = T.tensor3('Y')
    h0 = T.tensor3('h0')
    c0 = T.tensor3('c0')

    rnnb = dnn.RNNBlock(theano.config.floatX, hidden_dim, depth, 'lstm')
    psize = rnnb.get_param_size([batch_size, input_dim])
    params_cudnn = gpuarray_shared_constructor(
        numpy.zeros((psize,), dtype=theano.config.floatX))

    # Build the reference network and mirror its weights into the packed
    # cuDNN parameter buffer, layer by layer.
    model = Model()
    prev_layer = WrapperLayer(X)
    prev_dim = input_dim
    for layer_idx in range(depth):
        lstm = LSTM(prev_dim, hidden_dim, prev_layer,
                    s0=h0[layer_idx, :, :], c0=c0[layer_idx, :, :])
        model.add_layer(lstm)
        prev_layer, prev_dim = lstm, hidden_dim
        ref_params = lstm.get_params()
        packed = rnnb.split_params(params_cudnn, layer_idx,
                                   [batch_size, input_dim])
        for chunk, ref_p in zip(packed, ref_params):
            chunk[:] = ref_p.get_value(borrow=True,
                                       return_internal_type=True)

    def funcs(out, params):
        # Forward function plus gradient function for a squared-error cost.
        fn = theano.function([X, h0, c0], out, mode=mode_with_gpu)
        cost = T.mean((Y - out)**2)
        grad = T.grad(cost, [X, h0, c0] + params)
        grad_fn = theano.function([X, Y, h0, c0], grad, mode=mode_with_gpu)
        return fn, grad_fn

    ref_fn, ref_grad_fn = funcs(prev_layer.output(), model.get_params())
    cudnn_fn, cudnn_grad_fn = funcs(rnnb.apply(params_cudnn, X, h0, c0)[0],
                                    [params_cudnn])

    floatX = theano.config.floatX
    x_val = numpy.random.random(
        (timesteps, batch_size, input_dim)).astype(floatX)
    y_val = numpy.random.random(
        (timesteps, batch_size, hidden_dim)).astype(floatX)
    h0_val = numpy.random.random(
        (depth, batch_size, hidden_dim)).astype(floatX)
    c0_val = numpy.random.random(
        (depth, batch_size, hidden_dim)).astype(floatX)

    # Forward passes must agree.
    utt.assert_allclose(ref_fn(x_val, h0_val, c0_val),
                        cudnn_fn(x_val, h0_val, c0_val))

    ref_grads = ref_grad_fn(x_val, y_val, h0_val, c0_val)
    cudnn_grads = cudnn_grad_fn(x_val, y_val, h0_val, c0_val)
    # Gradients w.r.t. the symbolic inputs X, h0 and c0.
    for ref_g, cudnn_g in zip(ref_grads[:3], cudnn_grads[:3]):
        utt.assert_allclose(ref_g, cudnn_g)
    # Parameter gradients: split cuDNN's packed vector per layer and
    # compare against the reference gradients in matching order.
    ref_grads_params = ref_grads[3:]
    cudnn_grads_params = gpuarray_shared_constructor(cudnn_grads[3])
    for layer_idx in range(depth):
        cudnn_layer = rnnb.split_params(cudnn_grads_params, layer_idx,
                                        [batch_size, input_dim])
        nparams = len(cudnn_layer)
        ref_layer = ref_grads_params[layer_idx * nparams:
                                     (layer_idx + 1) * nparams]
        for ref_g, cudnn_g in zip(ref_layer, cudnn_layer):
            utt.assert_allclose(ref_g, cudnn_g)
def test_dnn_rnn_lstm_grad_c():
    """Check LSTM gradients when the cost is wired to the cell state only.

    Same cuDNN-vs-reference setup as test_dnn_rnn_lstm, but the cost is
    attached to the final cell state of every layer (cy) instead of the
    output sequence.
    """
    # Small problem sizes keep the test quick.
    input_dim, hidden_dim = 32, 16
    batch_size, depth, timesteps = 2, 3, 5

    X = T.tensor3('X')
    CY = T.tensor3('CY')
    h0 = T.tensor3('h0')
    c0 = T.tensor3('c0')

    rnnb = dnn.RNNBlock(theano.config.floatX, hidden_dim, depth, 'lstm')
    psize = rnnb.get_param_size([batch_size, input_dim])
    params_cudnn = gpuarray_shared_constructor(
        numpy.zeros((psize,), dtype=theano.config.floatX))

    # Reference network; its weights are copied into the cuDNN buffer so
    # both implementations start from identical parameters.
    model = Model()
    prev_layer = WrapperLayer(X)
    prev_dim = input_dim
    for layer_idx in range(depth):
        lstm = LSTM(prev_dim, hidden_dim, prev_layer,
                    s0=h0[layer_idx, :, :], c0=c0[layer_idx, :, :])
        model.add_layer(lstm)
        prev_layer, prev_dim = lstm, hidden_dim
        ref_params = lstm.get_params()
        packed = rnnb.split_params(params_cudnn, layer_idx,
                                   [batch_size, input_dim])
        for chunk, ref_p in zip(packed, ref_params):
            chunk[:] = ref_p.get_value(borrow=True,
                                       return_internal_type=True)

    def funcs(out, params):
        # Gradient function for a squared-error cost on the cell states.
        cost = T.mean((CY - out)**2)
        grad = T.grad(cost, [X, h0, c0] + params)
        return theano.function([X, CY, h0, c0], grad, mode=mode_with_gpu)

    # cuDNN exposes the final cell state directly; the reference version
    # stacks the last cell state of each layer's scan.
    _, _, cy = rnnb.apply(params_cudnn, X, h0, c0)
    ref_cy = T.stack([model.layers[0].C[-1],
                      model.layers[1].C[-1],
                      model.layers[2].C[-1]])

    ref_grad_fn = funcs(ref_cy, model.get_params())
    cudnn_grad_fn = funcs(cy, [params_cudnn])

    floatX = theano.config.floatX
    x_val = numpy.random.random(
        (timesteps, batch_size, input_dim)).astype(floatX)
    cy_val = numpy.random.random(
        (depth, batch_size, hidden_dim)).astype(floatX)
    h0_val = numpy.random.random(
        (depth, batch_size, hidden_dim)).astype(floatX)
    c0_val = numpy.random.random(
        (depth, batch_size, hidden_dim)).astype(floatX)

    ref_grads = ref_grad_fn(x_val, cy_val, h0_val, c0_val)
    cudnn_grads = cudnn_grad_fn(x_val, cy_val, h0_val, c0_val)
    # Gradients w.r.t. the symbolic inputs X, h0 and c0.
    for ref_g, cudnn_g in zip(ref_grads[:3], cudnn_grads[:3]):
        utt.assert_allclose(ref_g, cudnn_g)
    # Per-layer comparison of the packed parameter gradients.
    ref_grads_params = ref_grads[3:]
    cudnn_grads_params = gpuarray_shared_constructor(cudnn_grads[3])
    for layer_idx in range(depth):
        cudnn_layer = rnnb.split_params(cudnn_grads_params, layer_idx,
                                        [batch_size, input_dim])
        nparams = len(cudnn_layer)
        ref_layer = ref_grads_params[layer_idx * nparams:
                                     (layer_idx + 1) * nparams]
        for ref_g, cudnn_g in zip(ref_layer, cudnn_layer):
            utt.assert_allclose(ref_g, cudnn_g)
......@@ -68,6 +68,7 @@ def reg_context(name, ctx):
if not isinstance(ctx, gpuarray.GpuContext):
raise TypeError("context is not GpuContext")
_context_reg[name] = ctx
_props_map[ctx] = dict()
def get_context(name):
......@@ -96,6 +97,26 @@ def list_contexts():
"""
return _context_reg.keys()
# Mappings of properties to contexts. Please never use this if you
# can avoid it.
# This is basically a way to store "global" variables that depend on
# the context.
_props_map = {}
def _get_props(name):
    """Return the mutable property dict for the context registered as `name`."""
    return _props_map[get_context(name)]
def get_prop(name, k):
    """Look up the value stored under key `k` for the context named `name`."""
    props = _get_props(name)
    return props[k]
def set_prop(name, k, v):
    """Store value `v` under key `k` for the context named `name`."""
    props = _get_props(name)
    props[k] = v
# Private method
def _name_for_ctx(ctx):
......
......@@ -1102,7 +1102,8 @@ def _populate_grad_dict(var_to_app_to_idx,
str(o_shape) + " on an output of shape " +
str(g_shape))
input_grads = node.op.grad(inputs, new_output_grads)
input_grads = node.op.L_op(inputs, node.outputs,
new_output_grads)
if input_grads is None:
raise TypeError("%s.grad returned NoneType, "
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论