Commit 3007bf79 authored by Pascal Lamblin, committed by GitHub

Merge pull request #4915 from abergeron/dnn_rnn2

Cudnn RNN bindings.
...@@ -14,11 +14,9 @@ import theano.tensor as T ...@@ -14,11 +14,9 @@ import theano.tensor as T
def test_NanGuardMode(): def test_NanGuardMode():
""" # Tests if NanGuardMode is working by feeding in numpy.inf and numpy.nans
Tests if NanGuardMode is working by feeding in numpy.inf and numpy.nans # intentionally. A working implementation should be able to capture all
intentionally. A working implementation should be able to capture all # the abnormalties.
the abnormalties.
"""
x = T.matrix() x = T.matrix()
w = theano.shared(numpy.random.randn(5, 7).astype(theano.config.floatX)) w = theano.shared(numpy.random.randn(5, 7).astype(theano.config.floatX))
y = T.dot(x, w) y = T.dot(x, w)
......
...@@ -10,7 +10,7 @@ import sys ...@@ -10,7 +10,7 @@ import sys
import warnings import warnings
from functools import wraps from functools import wraps
from six import StringIO, PY3 from six import StringIO, PY3, iteritems
import theano import theano
from theano.compat import configparser as ConfigParser from theano.compat import configparser as ConfigParser
...@@ -91,37 +91,44 @@ theano_raw_cfg = ConfigParser.RawConfigParser() ...@@ -91,37 +91,44 @@ theano_raw_cfg = ConfigParser.RawConfigParser()
theano_raw_cfg.read(config_files) theano_raw_cfg.read(config_files)
def change_flags(**kwargs): class change_flags(object):
""" """
Use this as a decorator to change the value of Theano config variable. Use this as a decorator or context manager to change the value of
Theano config variables.
Useful during tests. Useful during tests.
""" """
def change_flags_exec(f): def __init__(self, **kwargs):
@wraps(f) confs = dict()
def inner(*args, **kwargs_):
old_val = {}
for k in kwargs:
l = [v for v in theano.configparser._config_var_list
if v.fullname == k]
assert len(l) == 1
old_val[k] = l[0].__get__(True, None)
try:
for k in kwargs: for k in kwargs:
l = [v for v in theano.configparser._config_var_list l = [v for v in theano.configparser._config_var_list
if v.fullname == k] if v.fullname == k]
assert len(l) == 1 assert len(l) == 1
l[0].__set__(None, kwargs[k]) confs[k] = l[0]
return f(*args, **kwargs_) self.confs = confs
finally: self.new_vals = kwargs
for k in kwargs:
l = [v for v in theano.configparser._config_var_list
if v.fullname == k]
assert len(l) == 1
l[0].__set__(None, old_val[k])
return inner def __call__(self, f):
return change_flags_exec @wraps(f)
def res(*args, **kwargs):
with self:
return f(*args, **kwargs)
return res
def __enter__(self):
    """Apply the requested Theano config flags, remembering old values.

    Saves each config variable's current value in ``self.old_vals`` so
    that ``__exit__`` can restore it, then assigns ``self.new_vals``.
    If any assignment fails, the flags already applied are rolled back
    before the exception propagates.
    """
    self.old_vals = {}
    for k, v in iteritems(self.confs):
        self.old_vals[k] = v.__get__(True, None)
    try:
        for k, v in iteritems(self.confs):
            v.__set__(None, self.new_vals[k])
    except BaseException:
        # Equivalent to the bare `except:` it replaces (re-raised
        # unconditionally), but the explicit form is the idiomatic
        # spelling: roll back partially-applied flags, then re-raise.
        self.__exit__()
        raise
def __exit__(self, *args):
    """Restore every config variable saved by ``__enter__``."""
    for name in self.confs:
        self.confs[name].__set__(None, self.old_vals[name])
def fetch_val_for_key(key, delete_key=False): def fetch_val_for_key(key, delete_key=False):
......
...@@ -696,6 +696,9 @@ class PureOp(object): ...@@ -696,6 +696,9 @@ class PureOp(object):
# Python implementation # # Python implementation #
######################### #########################
def L_op(self, inputs, outputs, output_grads):
    """Default L_op implementation.

    The ``outputs`` argument is ignored; gradient computation is
    delegated entirely to :meth:`grad`.
    """
    gradients = self.grad(inputs, output_grads)
    return gradients
def R_op(self, inputs, eval_points): def R_op(self, inputs, eval_points):
""" """
This method is primarily used by tensor.Rop This method is primarily used by tensor.Rop
......
...@@ -14,6 +14,7 @@ import theano ...@@ -14,6 +14,7 @@ import theano
from theano.gof import utils from theano.gof import utils
from theano.gof.utils import MethodNotDefined, object2 from theano.gof.utils import MethodNotDefined, object2
from theano.gof import graph from theano.gof import graph
from theano.configparser import change_flags
######## ########
# Type # # Type #
...@@ -638,6 +639,8 @@ class CDataType(Type): ...@@ -638,6 +639,8 @@ class CDataType(Type):
have a `void` return and take a single pointer argument. have a `void` return and take a single pointer argument.
""" """
__props__ = ('ctype', 'freefunc', 'headers', 'header_dirs',
'libraries', 'lib_dirs', 'extra_support_code')
def __init__(self, ctype, freefunc=None, headers=None, header_dirs=None, def __init__(self, ctype, freefunc=None, headers=None, header_dirs=None,
libraries=None, lib_dirs=None, extra_support_code=""): libraries=None, lib_dirs=None, extra_support_code=""):
...@@ -647,42 +650,51 @@ class CDataType(Type): ...@@ -647,42 +650,51 @@ class CDataType(Type):
assert isinstance(freefunc, string_types) assert isinstance(freefunc, string_types)
self.freefunc = freefunc self.freefunc = freefunc
if headers is None: if headers is None:
headers = [] headers = ()
self.headers = headers self.headers = tuple(headers)
if header_dirs is None: if header_dirs is None:
header_dirs = [] header_dirs = ()
self.header_dirs = header_dirs self.header_dirs = tuple(header_dirs)
if libraries is None: if libraries is None:
libraries = [] libraries = ()
self.libraries = libraries self.libraries = tuple(libraries)
if lib_dirs is None: if lib_dirs is None:
lib_dirs = [] lib_dirs = ()
self.lib_dirs = lib_dirs self.lib_dirs = tuple(lib_dirs)
self.extra_support_code = extra_support_code self.extra_support_code = extra_support_code
self._fn = None self._fn = None
def __eq__(self, other):
    """Equal iff the other object has the same class, ctype and freefunc."""
    if type(self) != type(other):
        return False
    return (self.ctype == other.ctype and
            self.freefunc == other.freefunc)

def __hash__(self):
    """Hash on the same fields as ``__eq__`` for consistency."""
    return hash((type(self), self.ctype, self.freefunc))
def filter(self, data, strict=False, allow_downcast=None): def filter(self, data, strict=False, allow_downcast=None):
if data is not None and not isinstance(data, _cdata_type): if data is not None and not isinstance(data, _cdata_type):
raise TypeError("expected None or a PyCapsule") raise TypeError("expected None or a PyCapsule")
return data return data
def _get_func(self): def _get_func(self):
"""
Return a function that makes a value from an integer.
The integer value is assumed to be a valid pointer for the
type and no check is done to ensure that.
"""
from theano.scalar import get_scalar_type from theano.scalar import get_scalar_type
if self._fn is None: if self._fn is None:
with change_flags(compute_test_value='off'):
v = get_scalar_type('int64')() v = get_scalar_type('int64')()
self._fn = theano.function([v], _make_cdata(self)(v), profile=False) self._fn = theano.function([v], _make_cdata(self)(v),
profile=False)
return self._fn return self._fn
def make_value(self, ptr): def make_value(self, ptr):
"""
Make a value of this type.
Parameters
----------
ptr : int
Integer representation of a valid pointer value
"""
return self._get_func()(ptr) return self._get_func()(ptr)
def c_declare(self, name, sub, check_input=True): def c_declare(self, name, sub, check_input=True):
......
...@@ -26,7 +26,7 @@ except ImportError: ...@@ -26,7 +26,7 @@ except ImportError:
# This is for documentation not to depend on the availability of pygpu # This is for documentation not to depend on the availability of pygpu
from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant, from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
GpuArraySharedVariable, gpuarray_shared_constructor, GpuArraySharedVariable, gpuarray_shared_constructor,
reg_context, get_context, ContextNotDefined) reg_context, get_context, ContextNotDefined, _get_props)
from .basic_ops import as_gpuarray_variable from .basic_ops import as_gpuarray_variable
from . import fft, dnn, opt, nerv, extra_ops, multinomial from . import fft, dnn, opt, nerv, extra_ops, multinomial
...@@ -89,17 +89,22 @@ def init_dev(dev, name=None): ...@@ -89,17 +89,22 @@ def init_dev(dev, name=None):
(name, dev, context.devname), (name, dev, context.devname),
file=sys.stderr) file=sys.stderr)
pygpu_activated = True pygpu_activated = True
ctx_props = _get_props(name)
ctx_props['dev'] = dev
if dev.startswith('cuda'): if dev.startswith('cuda'):
if 'cudnn_version' not in ctx_props:
try: try:
cudnn_version = dnn.version() ctx_props['cudnn_version'] = dnn.version()
# 5200 should not print warning with cudnn 5.1 final. # 5200 should not print warning with cudnn 5.1 final.
if cudnn_version >= 5200: if ctx_props['cudnn_version'] >= 5200:
warnings.warn("Your cuDNN version is more recent than Theano." warnings.warn("Your cuDNN version is more recent than "
" If you see problems, try updating Theano or" "Theano. If you encounter problems, try "
" downgrading cuDNN to version 5.1.") "updating Theano or downgrading cuDNN to "
"version 5.1.")
if config.print_active_device: if config.print_active_device:
print("Using cuDNN version %d on context %s" % print("Using cuDNN version %d on context %s" %
(cudnn_version, name), file=sys.stderr) (ctx_props['cudnn_version'], name), file=sys.stderr)
ctx_props['cudnn_handle'] = dnn._make_handle(context)
except Exception: except Exception:
pass pass
......
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
import ctypes
import os import os
import sys
import warnings import warnings
import numpy import numpy
...@@ -7,7 +9,7 @@ from six import integer_types ...@@ -7,7 +9,7 @@ from six import integer_types
import theano import theano
from theano import Op, Apply, tensor, config, Variable from theano import Op, Apply, tensor, config, Variable
from theano.scalar import as_scalar, constant, Log from theano.scalar import as_scalar, constant, Log, get_scalar_type
from theano.tensor import as_tensor_variable from theano.tensor import as_tensor_variable
from theano.gradient import DisconnectedType, grad_not_implemented from theano.gradient import DisconnectedType, grad_not_implemented
from theano.gof import Optimizer, local_optimizer, COp from theano.gof import Optimizer, local_optimizer, COp
...@@ -26,7 +28,8 @@ from theano.tensor.nnet.abstract_conv import (AbstractConv2d, ...@@ -26,7 +28,8 @@ from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
from theano.tensor.signal.pool import ( from theano.tensor.signal.pool import (
Pool, MaxPoolGrad, AveragePoolGrad) Pool, MaxPoolGrad, AveragePoolGrad)
from . import pygpu from . import pygpu
from .type import get_context, gpu_context_type, list_contexts from .type import (get_context, gpu_context_type, list_contexts,
get_prop, set_prop, GpuArraySharedVariable)
from .basic_ops import (as_gpuarray_variable, infer_context_name, from .basic_ops import (as_gpuarray_variable, infer_context_name,
gpu_contiguous, gpu_alloc_empty, gpu_contiguous, gpu_alloc_empty,
empty_like, GpuArrayType) empty_like, GpuArrayType)
...@@ -42,6 +45,45 @@ from .opt_util import alpha_merge, output_merge, inplace_allocempty, pad_dims, u ...@@ -42,6 +45,45 @@ from .opt_util import alpha_merge, output_merge, inplace_allocempty, pad_dims, u
from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_BWD_FILTER from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_BWD_FILTER
try:
from pygpu import gpuarray
except ImportError:
pass
def _dnn_lib():
if _dnn_lib.handle is None:
import ctypes.util
lib_name = ctypes.util.find_library('cudnn')
if lib_name is None and sys.platform == 'win32':
# Update these names when new versions of cudnn are supported.
for name in ['cudnn64_5.dll', 'cudnn64_4.dll']:
lib_name = ctypes.util.find_library(name)
if lib_name:
break
if lib_name is None:
raise RuntimeError('Could not find cudnn library (looked for v4 and v5[.1])')
_dnn_lib.handle = ctypes.cdll.LoadLibrary(lib_name)
cudnn = _dnn_lib.handle
cudnn.cudnnCreate.argtypes = [ctypes.POINTER(ctypes.c_void_p)]
cudnn.cudnnCreate.restype = ctypes.c_int
cudnn.cudnnDestroy.argtypes = [ctypes.c_void_p]
cudnn.cudnnDestroy.restype = ctypes.c_int
return _dnn_lib.handle
_dnn_lib.handle = None
def _make_handle(ctx):
    """Create a new cuDNN handle bound to the gpu context *ctx*.

    Returns a ``ctypes.c_void_p`` holding the ``cudnnHandle_t`` value.

    Raises
    ------
    RuntimeError
        If ``cudnnCreate`` reports a non-zero status.
    """
    lib = _dnn_lib()
    handle = ctypes.c_void_p()
    # cudnnCreate must run while the context is active so the handle is
    # attached to the right device.
    with ctx:
        status = lib.cudnnCreate(ctypes.byref(handle))
    if status != 0:
        raise RuntimeError("error creating cudnn handle")
    return handle
def raise_no_cudnn(msg="cuDNN is required for convolution and pooling"): def raise_no_cudnn(msg="cuDNN is required for convolution and pooling"):
raise RuntimeError(msg) raise RuntimeError(msg)
...@@ -144,6 +186,12 @@ def dnn_available(context_name): ...@@ -144,6 +186,12 @@ def dnn_available(context_name):
dnn_available.msg = None dnn_available.msg = None
# Params type used by the cuDNN ops (DnnBase sets params_type to this):
# an opaque cudnnHandle_t, destroyed with cudnnDestroy, compiled and
# linked against the configured cuDNN include/library paths.
handle_type = CDataType('cudnnHandle_t', 'cudnnDestroy',
                        headers=['cudnn.h'],
                        header_dirs=[config.dnn.include_path],
                        libraries=['cudnn'],
                        lib_dirs=[config.dnn.library_path])
class DnnBase(COp): class DnnBase(COp):
...@@ -154,10 +202,20 @@ class DnnBase(COp): ...@@ -154,10 +202,20 @@ class DnnBase(COp):
# dnn does not know about broadcasting, so we do not need to assert # dnn does not know about broadcasting, so we do not need to assert
# the input broadcasting pattern. # the input broadcasting pattern.
check_broadcast = False check_broadcast = False
params_type = gpu_context_type params_type = handle_type
def dnn_context(self, node):
    """Name of the GPU context this op's cuDNN handle lives in.

    By default it is read from the type of the node's first output;
    subclasses with no suitable output override this.
    """
    out_type = node.outputs[0].type
    return out_type.context_name
def get_params(self, node): def get_params(self, node):
return node.outputs[0].type.context try:
return get_prop(self.dnn_context(node), 'cudnn_handle_param')
except KeyError:
pass
ptr = get_prop(self.dnn_context(node), 'cudnn_handle').value
res = handle_type.make_value(ptr)
set_prop(self.dnn_context(node), 'cudnn_handle_param', res)
return res
def __init__(self, files=None, c_func=None): def __init__(self, files=None, c_func=None):
if files is None: if files is None:
...@@ -165,9 +223,10 @@ class DnnBase(COp): ...@@ -165,9 +223,10 @@ class DnnBase(COp):
COp.__init__(self, ["dnn_base.c"] + files, c_func) COp.__init__(self, ["dnn_base.c"] + files, c_func)
def c_headers(self): def c_headers(self):
return ['cudnn.h', 'cudnn_helper.h', 'gpuarray_helper.h', return ['gpuarray/types.h', 'gpuarray/array.h', 'gpuarray/kernel.h',
'gpuarray/types.h', 'gpuarray/array.h', 'gpuarray/util.h', 'gpuarray/util.h', 'gpuarray/ext_cuda.h', 'gpuarray_api.h',
'gpuarray/ext_cuda.h', 'gpuarray_api.h', 'numpy_compat.h'] 'numpy_compat.h', 'cudnn.h', 'cudnn_helper.h',
'gpuarray_helper.h']
def c_header_dirs(self): def c_header_dirs(self):
return [os.path.dirname(__file__), pygpu.get_include(), return [os.path.dirname(__file__), pygpu.get_include(),
...@@ -183,7 +242,7 @@ class DnnBase(COp): ...@@ -183,7 +242,7 @@ class DnnBase(COp):
return ['-Wl,-rpath,' + config.dnn.library_path] return ['-Wl,-rpath,' + config.dnn.library_path]
def c_code_cache_version(self): def c_code_cache_version(self):
return (super(DnnBase, self).c_code_cache_version(), version()) return (super(DnnBase, self).c_code_cache_version(), version(), 1)
class DnnVersion(Op): class DnnVersion(Op):
...@@ -1734,6 +1793,599 @@ class GpuDnnBatchNormGrad(DnnBase): ...@@ -1734,6 +1793,599 @@ class GpuDnnBatchNormGrad(DnnBase):
def infer_shape(self, node, shape): def infer_shape(self, node, shape):
return [shape[0], shape[2], shape[2]] return [shape[0], shape[2], shape[2]]
# Opaque wrapper for a raw gpudata buffer; released with gpudata_release.
gpudata_type = CDataType('gpudata *', 'gpudata_release')
# Opaque wrapper for a cudnnDropoutDescriptor_t.
dropoutdesc_type = CDataType('cudnnDropoutDescriptor_t',
                             'cudnnDestroyDropoutDescriptor')
class GpuDnnDropoutOp(DnnBase):
    """cuDNN dropout forward op.

    ``make_node`` declares three outputs: one with the input's type, one
    with the state's type and an opaque gpudata reserve buffer.
    """
    __props__ = ('inplace',)

    def __init__(self, inplace=False):
        DnnBase.__init__(self, ["dnn_dropout_fwd.c"], "dnn_dropout_fwd")
        self.inplace = inplace
        if self.inplace:
            # Output 1 (state-typed output) reuses input 2 (the state).
            self.destroy_map = {1: [2]}

    def make_node(self, inp, descriptor, state):
        ctx_name = infer_context_name(inp)
        inp = as_gpuarray_variable(inp, ctx_name)
        return Apply(self, [inp, descriptor, state],
                     [inp.type(), state.type(), gpudata_type()])

    def prepare_node(self, node, storage_map, compute_map):
        # NOTE(review): this asserts the op is always used inplace, which
        # makes the default inplace=False unusable at runtime -- verify
        # whether an inplace-substitution optimization is expected to
        # have run before this point.
        assert self.inplace, "GpuDnnDropoutOp not inplace"
class _DropoutDescriptor(DnnBase):
    """Build a cuDNN dropout descriptor and its RNG state buffer."""

    __props__ = ('context_name',)

    def __init__(self, context_name):
        DnnBase.__init__(self, ["dnn_dropout_desc.c"], "dnn_dropout_desc")
        self.context_name = context_name

    def dnn_context(self, node):
        # The descriptor is tied to the context chosen at construction.
        return self.context_name

    def do_constant_folding(self, node):
        # The outputs wrap live GPU resources; never fold them.
        return False

    def make_node(self, dropout, seed, context_name):
        drop_in = as_scalar(dropout).astype('float32')
        seed_in = as_scalar(seed).astype('uint64')
        assert context_name == self.context_name
        # This is a dirty hack to pass the context because params is
        # occupied by the cudnn handle
        ctx_const = gpu_context_type.make_constant(get_context(context_name))
        state_type = GpuArrayType('uint8', (False,),
                                  context_name=context_name)
        return Apply(self, [drop_in, seed_in, ctx_const],
                     [dropoutdesc_type(), state_type()])

    def c_code_cache_version_apply(self, node):
        # disable the cache since we can't pickle contexts
        return None
def _make_dropout_desc(dropout, seed, context_name):
    """Compile and run a tiny graph returning (descriptor, state buffer)."""
    op = _DropoutDescriptor(context_name)
    fn = theano.function([], op(dropout, seed, context_name),
                         theano.Mode(optimizer=None),
                         profile=False)
    desc, states = fn()
    return desc, states
def dropout(x, dropout=0.0, seed=4242):
    """Build dropout for *x*; returns (y, desc, odesc, states)."""
    desc, states = _make_dropout_desc(dropout, seed, x.type.context_name)
    # NOTE(review): GpuDnnDropoutOp.make_node expects three inputs
    # (inp, descriptor, state) and declares three outputs, while this
    # call passes two inputs and unpacks two results -- confirm against
    # GpuDnnDropoutOp before relying on this helper.
    y, odesc = GpuDnnDropoutOp()(x, desc)
    return y, desc, odesc, states
# Opaque wrapper for a cudnnRNNDescriptor_t.
rnndesc_type = CDataType('cudnnRNNDescriptor_t',
                         'cudnnDestroyRNNDescriptor')
def as_i32(v):
    """Return *v* as a theano int32 scalar variable."""
    scalar_v = as_scalar(v)
    return scalar_v.astype('int32')
class _RNNDescriptor(DnnBase):
    """Build a cudnnRNNDescriptor_t from python-level settings."""

    __props__ = ('context_name',)

    # Integer encodings of cudnnRNNInputMode_t, cudnnDirectionMode_t and
    # cudnnRNNMode_t as defined for cudnn 5 / 5.1.
    _input_modes = {'linear': 0, 'skip': 1}
    _direction_modes = {'unidirectional': 0, 'bidirectional': 1}
    _rnn_modes = {'rnn_relu': 0, 'rnn_tanh': 1, 'lstm': 2, 'gru': 3}

    def __init__(self, context_name):
        if version() < 5005:
            raise RuntimeError("cudnn RNN require cudnn v5 final or higher.")
        DnnBase.__init__(self, ["dnn_rnn_desc.c"], "dnn_rnn_desc")
        self.context_name = context_name

    def dnn_context(self, node):
        return self.context_name

    def do_constant_folding(self, node):
        # The output wraps a live descriptor; never fold it.
        return False

    def make_node(self, hidden_size, num_layers, ddesc, input_mode,
                  direction_mode, rnn_mode, dtype):
        hidden_size = as_i32(hidden_size)
        num_layers = as_i32(num_layers)
        assert 5000 < version() < 5200, "Constants only work for cudnn 5, 5.1"
        if input_mode not in self._input_modes:
            raise ValueError("input_mode")
        input_mode = as_i32(self._input_modes[input_mode])
        if direction_mode not in self._direction_modes:
            raise ValueError("direction_mode")
        direction_mode = as_i32(self._direction_modes[direction_mode])
        if rnn_mode not in self._rnn_modes:
            raise ValueError("rnn_mode")
        rnn_mode = as_i32(self._rnn_modes[rnn_mode])
        dtype = as_i32(gpuarray.dtype_to_typecode(dtype))
        return Apply(self, [hidden_size, num_layers,
                            dropoutdesc_type.make_constant(ddesc),
                            input_mode, direction_mode, rnn_mode, dtype],
                     [rnndesc_type()])
def _make_rnn_desc(hidden_size, num_layers, ddesc, rnn_mode,
                   input_mode, direction_mode, dtype, context_name):
    """Compile and run a tiny graph that builds an RNN descriptor."""
    build = _RNNDescriptor(context_name)
    out = build(hidden_size, num_layers, ddesc, input_mode,
                direction_mode, rnn_mode, dtype)
    fn = theano.function([], out, theano.Mode(optimizer=None),
                         profile=False)
    return fn()
class _RNNParamSize(DnnBase):
    """Query cuDNN for the byte size of an RNN's flat parameter buffer."""

    __props__ = ('context_name',)

    def __init__(self, context_name):
        DnnBase.__init__(self, ["dnn_rnn_paramsize.c"],
                         "dnn_rnn_paramsize")
        self.context_name = context_name

    def dnn_context(self, node):
        return self.context_name

    def do_constant_folding(self, node):
        # Depends on a live descriptor; never fold.
        return False

    def make_node(self, desc, input_size, typecode):
        size_var = as_tensor_variable(input_size).astype('uint64')
        tc_var = as_i32(typecode)
        desc_const = rnndesc_type.make_constant(desc)
        return Apply(self, [desc_const, size_var, tc_var],
                     [get_scalar_type('uint64')()])
def _get_param_size(desc, input_size, dtype, context_name):
    """Return the RNN parameter buffer size in bytes for *dtype* input."""
    tc = gpuarray.dtype_to_typecode(dtype)
    graph_out = _RNNParamSize(context_name)(desc, input_size, tc)
    fn = theano.function([], graph_out,
                         theano.Mode(optimizer=None), profile=False)
    return fn()
class _RNNSplitParams(DnnBase):
    # Splits cuDNN's flat RNN parameter buffer into per-gate
    # (matrix, bias) views, using cudnnGetRNNLinLayer{Matrix,Bias}Params
    # to locate each region inside the buffer.
    __props__ = ('rnn_mode',)

    def __init__(self, rnn_mode):
        DnnBase.__init__(self)
        self.rnn_mode = rnn_mode

    def make_node(self, w, desc, layer, isize, typecode):
        # w is the flat 1d parameter buffer; outputs alternate
        # (matrix, bias) pairs, one pair per gate of the chosen mode.
        w = as_gpuarray_variable(w, infer_context_name(w))
        assert w.ndim == 1
        layer = as_scalar(layer).astype('int32')
        isize = as_tensor_variable(isize).astype('uint64')
        assert isize.ndim == 1
        typecode = as_scalar(typecode).astype('int32')
        _1d = GpuArrayType(w.type.dtype, [False],
                           context_name=w.type.context_name)
        _2d = GpuArrayType(w.type.dtype, [False, False],
                           context_name=w.type.context_name)
        outputs = []
        if self.rnn_mode == 'rnn_relu' or self.rnn_mode == 'rnn_tanh':
            outputs.extend([_2d(), _1d()])  # input
            outputs.extend([_2d(), _1d()])  # recurrent
        elif self.rnn_mode == 'lstm':
            outputs.extend([_2d(), _1d()])  # input input
            outputs.extend([_2d(), _1d()])  # input forget
            outputs.extend([_2d(), _1d()])  # input newmem
            outputs.extend([_2d(), _1d()])  # input output
            outputs.extend([_2d(), _1d()])  # recur input
            outputs.extend([_2d(), _1d()])  # recur forget
            outputs.extend([_2d(), _1d()])  # recur newmem
            outputs.extend([_2d(), _1d()])  # recur output
        elif self.rnn_mode == 'gru':
            outputs.extend([_2d(), _1d()])  # input reset
            outputs.extend([_2d(), _1d()])  # input update
            outputs.extend([_2d(), _1d()])  # input newmem
            outputs.extend([_2d(), _1d()])  # recur reset
            outputs.extend([_2d(), _1d()])  # recur update
            outputs.extend([_2d(), _1d()])  # recur newmem
        return Apply(self, [w, layer, rnndesc_type.make_constant(desc),
                            isize, typecode], outputs)

    def c_code(self, node, name, inputs, outputs, sub):
        # Generate one prologue, then one (bias, matrix) extraction
        # snippet per output pair, then a cleanup epilogue.
        kw = dict(fail=sub['fail'], w=inputs[0], layer=inputs[1],
                  desc=inputs[2], isize=inputs[3], typecode=inputs[4],
                  handle=sub['params'])
        code = """
cudnnTensorDescriptor_t xdesc;
cudnnFilterDescriptor_t wdesc;
cudnnFilterDescriptor_t odesc;
size_t nshp[2];
void *w;
void *o;
ptrdiff_t off;
size_t bshp;
cudnnStatus_t err;
cudnnDataType_t dt;
cudnnTensorFormat_t tf;
int nd;
int dims[3];
int strs[3];
if (PyArray_DIM(%(isize)s, 0) != 2) {
PyErr_SetString(PyExc_ValueError, "input_size should be of length two");
%(fail)s;
}
switch (%(typecode)s) {
case GA_FLOAT:
dt = CUDNN_DATA_FLOAT;
break;
case GA_DOUBLE:
dt = CUDNN_DATA_DOUBLE;
break;
case GA_HALF:
dt = CUDNN_DATA_HALF;
break;
default:
PyErr_SetString(PyExc_ValueError, "Unsupported data type");
%(fail)s;
}
err = cudnnCreateTensorDescriptor(&xdesc);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_SetString(PyExc_RuntimeError, "Could not create xdesc");
%(fail)s;
}
dims[0] = *(npy_uint64 *)PyArray_GETPTR1(%(isize)s, 0);
dims[1] = *(npy_uint64 *)PyArray_GETPTR1(%(isize)s, 1);
dims[2] = 1;
strs[0] = dims[2] * dims[1];
strs[1] = dims[2];
strs[2] = 1;
err = cudnnSetTensorNdDescriptor(xdesc, dt, 3, dims, strs);
if (err != CUDNN_STATUS_SUCCESS) {
cudnnDestroyTensorDescriptor(xdesc);
PyErr_Format(PyExc_RuntimeError, "Could not set xdesc: %%s",
cudnnGetErrorString(err));
%(fail)s;
}
if (c_make_filter(%(w)s, &wdesc)) {
cudnnDestroyTensorDescriptor(xdesc);
%(fail)s
}
err = cudnnCreateFilterDescriptor(&odesc);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_SetString(PyExc_RuntimeError, "could not create odesc");
cudnnDestroyTensorDescriptor(xdesc);
cudnnDestroyFilterDescriptor(wdesc);
%(fail)s
}
w = PyGpuArray_DEV_DATA(%(w)s);
nshp[0] = PyGpuArray_DIM(%(w)s, 0);
nshp[1] = 1;
""" % kw

        def get_params(id, m, b):
            # Emit the C snippet that fills outputs m (matrix view) and
            # b (bias view) for linear-layer id.
            kw2 = kw.copy()
            kw2['id'] = id
            kw2['m'] = m
            kw2['b'] = b
            return """
err = cudnnGetRNNLinLayerBiasParams(%(handle)s, %(desc)s, %(layer)s, xdesc, wdesc, w, %(id)s, odesc, &o);
if (err != CUDNN_STATUS_SUCCESS) {
cudnnDestroyTensorDescriptor(xdesc);
cudnnDestroyFilterDescriptor(wdesc);
cudnnDestroyFilterDescriptor(odesc);
PyErr_SetString(PyExc_RuntimeError, "can't fetch bias for id %(id)s");
%(fail)s
}
off = (intptr_t)o - (intptr_t)w;
assert(off >= 0 && "bias");
err = cudnnGetFilterNdDescriptor(odesc, 3, &dt, &tf, &nd, dims);
if (err != CUDNN_STATUS_SUCCESS) {
cudnnDestroyTensorDescriptor(xdesc);
cudnnDestroyFilterDescriptor(wdesc);
cudnnDestroyFilterDescriptor(odesc);
PyErr_SetString(PyExc_RuntimeError, "could not get bias shape for id %(id)s");
%(fail)s;
}
// We assume that the typecode matches
assert(dims[2] == 1);
assert(dims[1] == 1);
%(b)s = pygpu_view(%(w)s, Py_None);
%(b)s->ga.offset = off;
%(b)s->ga.dimensions[0] = dims[0];
bshp = dims[0];
err = cudnnGetRNNLinLayerMatrixParams(%(handle)s, %(desc)s, %(layer)s, xdesc, wdesc, w, %(id)s, odesc, &o);
if (err != CUDNN_STATUS_SUCCESS) {
cudnnDestroyTensorDescriptor(xdesc);
cudnnDestroyFilterDescriptor(wdesc);
cudnnDestroyFilterDescriptor(odesc);
PyErr_SetString(PyExc_RuntimeError, "can't fetch matrix for id %(id)s");
%(fail)s
}
off = (intptr_t)o - (intptr_t)w;
assert(off >= 0 && "matrix");
// This is 3d because of cudnn limitations.
err = cudnnGetFilterNdDescriptor(odesc, 3, &dt, &tf, &nd, dims);
if (err != CUDNN_STATUS_SUCCESS) {
cudnnDestroyTensorDescriptor(xdesc);
cudnnDestroyFilterDescriptor(wdesc);
cudnnDestroyFilterDescriptor(odesc);
PyErr_SetString(PyExc_RuntimeError, "could not get matrix shape for id %(id)s");
%(fail)s;
}
assert(dims[1] == 1);
assert(dims[2] == 1);
// We assume that the typecode matches
%(m)s = pygpu_reshape(%(w)s, 2, nshp, GA_F_ORDER, 1, -1);
%(m)s->ga.offset = off;
assert(dims[0] %% bshp == 0);
%(m)s->ga.dimensions[0] = dims[0] / bshp;
%(m)s->ga.dimensions[1] = bshp;
%(m)s->ga.strides[1] = %(m)s->ga.dimensions[0] * gpuarray_get_elsize(%(m)s->ga.typecode);
""" % kw2

        # Outputs come in (matrix, bias) pairs.
        for i in range(len(outputs) // 2):
            code += get_params(i, outputs[2 * i], outputs[(2 * i) + 1])
        code += """
cudnnDestroyTensorDescriptor(xdesc);
cudnnDestroyFilterDescriptor(wdesc);
cudnnDestroyFilterDescriptor(odesc);
"""
        return code

    def c_code_cache_version(self):
        return (2,)
def _split_rnn_params(w, desc, layer, input_size, dtype, rnn_mode):
    """Evaluate _RNNSplitParams, returning the per-gate views of *w*."""
    tc = gpuarray.dtype_to_typecode(dtype)
    views = _RNNSplitParams(rnn_mode)(w, desc, layer, input_size, tc)
    borrowed = [theano.Out(v, borrow=True) for v in views]
    fn = theano.function([], borrowed,
                         theano.Mode(optimizer=None), profile=False)
    return fn()
class GpuDnnRNNOp(DnnBase):
    """cuDNN RNN forward pass.

    Outputs (reserve, y, hy), plus cy when rnn_mode == 'lstm'.  The
    reserve buffer is consumed again by the gradient ops in L_op.
    """
    # NOTE(review): __props__ is empty even though rnn_mode and num_dirs
    # differ between instances -- verify that two ops built with
    # different modes are not considered equal/merged by the graph.
    __props__ = ()
    _cop_num_inputs = 5
    _cop_num_outputs = 4

    def __init__(self, rnn_mode, direction_mode):
        DnnBase.__init__(self, ["dnn_rnn_fwd.c"], 'dnn_rnn_fwd')
        self.rnn_mode = rnn_mode
        if direction_mode == 'bidirectional':
            self.num_dirs = 2
        elif direction_mode == 'unidirectional':
            self.num_dirs = 1
        else:
            raise ValueError('direction_mode is invalid (got %s)' % (direction_mode,))

    def dnn_context(self, node):
        # Output 0 is the opaque reserve buffer; use output 1 (y).
        return node.outputs[1].type.context_name

    def make_node(self, desc, w, x, hx, cx=None):
        # cx is only meaningful for LSTM (cell state).
        if cx is None:
            context_name = infer_context_name(w, x, hx)
        else:
            context_name = infer_context_name(w, x, hx, cx)
        w = as_gpuarray_variable(w, context_name)
        x = as_gpuarray_variable(x, context_name)
        hx = as_gpuarray_variable(hx, context_name)
        inputs = [desc, w, x, hx]
        assert w.ndim == 1
        assert x.ndim == 3   # seqLength, minibatch, inputSize
        assert hx.ndim == 3  # numLayers, minibatch, hiddenSize * bidi
        if self.rnn_mode == 'lstm':
            cx = as_gpuarray_variable(cx, context_name)
            assert cx.ndim == 3  # numLayers, minibatch, hiddenSize * bidi
            inputs.append(cx)
        _3d = GpuArrayType(dtype=x.dtype, broadcastable=(False, False, False),
                           context_name=context_name)
        reserve = gpudata_type()
        y = _3d()   # seqLength, minibatch, hiddenSize * bidi
        hy = _3d()  # numLayers, miniBatch, hiddenSize * bidi
        outputs = [reserve, y, hy]
        if self.rnn_mode == 'lstm':
            cy = _3d()  # numLayers, miniBatch, hiddenSize * bidi
            outputs.append(cy)
        return Apply(self, inputs, outputs)

    def L_op(self, inputs, outputs, output_grads):
        desc, w, x, hx = inputs[:4]
        cx = inputs[4] if len(inputs) == 5 else None
        reserve, y, hy = outputs[:3]
        _, dy, dhy = output_grads[:3]
        dcy = output_grads[3] if len(output_grads) == 4 else None
        # Since the op return two outputs which contain essentially
        # the same information, the user will most likely only use one
        # of them. This leads to the situation that the other is
        # considered "disconnected" by theano in the gradient.
        # However we know that this isn't really the case so we fix it
        # here.
        # If all the ys are disconnected, then you get a boring
        # gradient instead of an error.  But in that case you
        # shouldn't call this method anyway.
        if isinstance(dy.type, DisconnectedType):
            dy = as_gpuarray_variable(y.zeros_like(),
                                      context_name=y.type.context_name)
        if isinstance(dhy.type, DisconnectedType):
            dhy = None
        if dcy and isinstance(dcy.type, DisconnectedType):
            dcy = None
        dinputs = GpuDnnRNNGradInputs(rnn_mode=self.rnn_mode,
                                      grad_h=(dhy is not None),
                                      grad_c=(dcy is not None))(
            desc, x, y, dy, dhy, dcy, w, hx, cx, reserve, return_list=True)
        reserve2, dx, dhx = dinputs[:3]
        dw = GpuDnnRNNGradWeights()(
            desc, x, hx, y, reserve2, w)
        res = [DisconnectedType()(), dw, dx, dhx]
        if cx is not None:
            res.append(dinputs[3])  # dcx
        return res

    def connection_pattern(self, node):
        # The descriptor input is disconnected from all outputs; every
        # other input affects every output.
        deconn = [[False] * len(node.outputs)]
        conn = [[True] * len(node.outputs)] * (len(node.inputs) - 1)
        return deconn + conn
class GpuDnnRNNGradInputs(DnnBase):
    """cuDNN RNN gradient with respect to the inputs (x, hx and, for
    LSTM, cx)."""
    __props__ = ('rnn_mode', 'grad_c', 'grad_h')
    _cop_num_inputs = 10
    _cop_num_outputs = 4

    def __init__(self, rnn_mode, grad_h, grad_c):
        DnnBase.__init__(self, ['dnn_rnn_gi.c'], 'dnn_rnn_gi')
        self.rnn_mode = rnn_mode
        self.grad_h = grad_h
        self.grad_c = grad_c
        if self.grad_c:
            # A cell-state gradient only exists for LSTM.
            assert self.rnn_mode == 'lstm'

    def dnn_context(self, node):
        # Output 0 is the opaque reserve buffer; use output 1 (dx).
        return node.outputs[1].type.context_name

    def make_node(self, desc, x, y, dy, dhy, dcy, w, hx, cx, reserve):
        # We trust the callers here
        xshp = as_scalar(x.shape[2]).astype('uint64')
        inputs = [desc, xshp, y, dy, w, hx, reserve]
        outputs = [reserve.type(), x.type(), hx.type()]
        if self.rnn_mode == 'lstm':
            inputs.append(cx)
            outputs.append(cx.type())
        if self.grad_h:
            inputs.append(dhy)
        if self.grad_c:
            inputs.append(dcy)
        return Apply(self, inputs, outputs)

    # We have special requirements so this is hooking into COp
    def format_c_function_args(self, inp, out):
        # Pad the optional arguments (cx, dhy, dcy) with NULL so the C
        # function always receives a fixed-length argument list.
        rinp = inp[:7]
        others = inp[7:]
        if self.rnn_mode == 'lstm':
            rinp.append(others.pop(0))
        else:
            rinp.append('NULL')
        if self.grad_h:
            rinp.append(others.pop(0))
        else:
            rinp.append('NULL')
        if self.grad_c:
            rinp.append(others.pop(0))
        else:
            rinp.append('NULL')
        assert len(others) == 0
        return COp.format_c_function_args(self, rinp, out)
class GpuDnnRNNGradWeights(DnnBase):
    """cuDNN RNN gradient with respect to the flat weight buffer."""

    __props__ = ()

    def __init__(self):
        DnnBase.__init__(self, ['dnn_rnn_gw.c'], 'dnn_rnn_gw')

    def make_node(self, desc, x, hx, y, reserve, w):
        # We trust the callers here
        total_size = as_scalar(w.shape[0]).astype('uint64')
        node_inputs = [desc, total_size, x, hx, y, reserve]
        node_outputs = [w.type()]
        return Apply(self, node_inputs, node_outputs)
class RNNBlock(object):
    """User-level helper wrapping the cuDNN RNN ops.

    Parameters
    ----------
    dtype : data type of computation
    hidden_size : int
    num_layers : int
    rnn_mode : {'rnn_relu', 'rnn_tanh', 'lstm', 'gru'}
        See cudnn documentation for cudnnRNNMode_t.
    input_mode : {'linear', 'skip'}
        linear: input will be multiplied by a biased matrix
        skip: No operation is performed on the input.  The size must
        match the hidden size.
    direction_mode : {'unidirectional', 'bidirectional'}
        unidirectional: The network operates recurrently from the
        first input to the last.
        bidirectional: The network operates from first to last then
        from last to first and concatenates the results at each layer.
    """

    def __init__(self, dtype, hidden_size, num_layers, rnn_mode,
                 input_mode='linear', direction_mode='unidirectional',
                 context_name=None):
        # Dropout is not supported for any value other than 0, so
        # don't change it.
        ddesc, states = _make_dropout_desc(0, 4242, context_name)
        self.ddesc = ddesc
        self.dstates = states
        self.desc = _make_rnn_desc(hidden_size, num_layers,
                                   ddesc, rnn_mode, input_mode,
                                   direction_mode, dtype, context_name)
        self.rnn_mode = rnn_mode
        self.direction_mode = direction_mode
        self.context_name = context_name
        self.dtype = dtype

    def get_param_size(self, input_size):
        """Number of elements (of self.dtype) in the parameter buffer."""
        nbytes = int(_get_param_size(self.desc, input_size, self.dtype,
                                     self.context_name))
        itemsize = numpy.dtype(self.dtype).itemsize
        nelems, remainder = divmod(nbytes, itemsize)
        assert remainder == 0
        return nelems

    def split_params(self, w, layer, input_size):
        """Return per-gate matrix/bias views into the shared buffer *w*."""
        if not isinstance(w, GpuArraySharedVariable):
            raise TypeError("split_params only works on gpuarray shared variables")
        return _split_rnn_params(w, self.desc, layer, input_size,
                                 self.dtype, self.rnn_mode)

    def apply(self, w, x, hx, cx=None):
        """Apply the RNN to *x*; the reserve buffer is not returned."""
        # Don't return the reserve as an output
        rnn = GpuDnnRNNOp(self.rnn_mode, self.direction_mode)
        outs = rnn(rnndesc_type.make_constant(self.desc),
                   w, x, hx, cx, return_list=True)
        return outs[1:]
def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation', def dnn_batch_normalization_train(inputs, gamma, beta, mode='per-activation',
epsilon=1e-4): epsilon=1e-4):
......
...@@ -149,41 +149,3 @@ static int c_make_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t *desc) { ...@@ -149,41 +149,3 @@ static int c_make_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t *desc) {
#section init_code #section init_code
setup_ext_cuda(); setup_ext_cuda();
#section support_code_struct

/* Context pointer kept only so the cleanup section below can re-enter the
   context when destroying the handle.  NOTE(review): these are COp section
   fragments; PARAMS, APPLY_SPECIFIC and FAIL are provided by the COp
   framework at code-generation time. */
PyGpuContextObject *ctx;
/* Per-apply cuDNN handle, created on this op's context and bound to its
   stream in the init section. */
cudnnHandle_t APPLY_SPECIFIC(_handle);

#section init_code_struct

{
  // We need to keep a reference here to have it available in the destructor.
  ctx = PARAMS;
  Py_INCREF(ctx);

  cuda_enter(PARAMS->ctx);

  cudnnStatus_t err;
  APPLY_SPECIFIC(_handle) = NULL;
  /* Create the handle, then attach it to the context's stream so cuDNN
     work is ordered with the rest of the gpuarray operations. */
  if ((err = cudnnCreate(&APPLY_SPECIFIC(_handle))) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "could not create cuDNN handle: %s",
                 cudnnGetErrorString(err));
    cuda_exit(PARAMS->ctx);
    FAIL;
  }
  if ((err = cudnnSetStream(APPLY_SPECIFIC(_handle),
                            cuda_get_stream(PARAMS->ctx))) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "Could not set cudnn stream: %s",
                 cudnnGetErrorString(err));
    cuda_exit(PARAMS->ctx);
    FAIL;
  }
  cuda_exit(PARAMS->ctx);
}

#section cleanup_code_struct

/* Destroy the handle inside the context it was created in, then drop the
   context reference taken in the init section. */
cuda_enter(ctx->ctx);
cudnnDestroy(APPLY_SPECIFIC(_handle));
cuda_exit(ctx->ctx);
Py_DECREF((PyObject *)ctx);
...@@ -3,7 +3,9 @@ ...@@ -3,7 +3,9 @@
int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale, int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
PyGpuArrayObject *bias, npy_float64 epsilon, PyGpuArrayObject *bias, npy_float64 epsilon,
PyGpuArrayObject **outp, PyGpuArrayObject **x_mean, PyGpuArrayObject **outp, PyGpuArrayObject **x_mean,
PyGpuArrayObject **x_invstd, PyGpuContextObject *c) { PyGpuArrayObject **x_invstd, cudnnHandle_t _handle) {
PyGpuContextObject *c = inp->context;
if (c_set_tensorNd(inp, bn_input) != 0) if (c_set_tensorNd(inp, bn_input) != 0)
return 1; return 1;
if (c_set_tensorNd(scale, bn_params) != 0) if (c_set_tensorNd(scale, bn_params) != 0)
...@@ -37,7 +39,7 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale, ...@@ -37,7 +39,7 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
beta = (void *)&fbeta; beta = (void *)&fbeta;
} }
cudnnStatus_t err = cudnnBatchNormalizationForwardTraining( cudnnStatus_t err = cudnnBatchNormalizationForwardTraining(
APPLY_SPECIFIC(_handle), _handle,
MODE, MODE,
alpha, alpha,
beta, beta,
......
...@@ -24,7 +24,9 @@ int dnn_batchnorm_grad(PyGpuArrayObject *inp, PyGpuArrayObject *doutp, ...@@ -24,7 +24,9 @@ int dnn_batchnorm_grad(PyGpuArrayObject *inp, PyGpuArrayObject *doutp,
PyGpuArrayObject *scale, PyGpuArrayObject *x_mean, PyGpuArrayObject *scale, PyGpuArrayObject *x_mean,
PyGpuArrayObject *x_invstd, npy_float64 epsilon, PyGpuArrayObject *x_invstd, npy_float64 epsilon,
PyGpuArrayObject **dinp, PyGpuArrayObject **dscale, PyGpuArrayObject **dinp, PyGpuArrayObject **dscale,
PyGpuArrayObject **dbias, PyGpuContextObject *c) { PyGpuArrayObject **dbias, cudnnHandle_t _handle) {
PyGpuContextObject *c = inp->context;
if (c_set_tensorNd(inp, bn_input) != 0) if (c_set_tensorNd(inp, bn_input) != 0)
return 1; return 1;
if (c_set_tensorNd(doutp, bn_doutput) != 0) if (c_set_tensorNd(doutp, bn_doutput) != 0)
...@@ -66,7 +68,7 @@ int dnn_batchnorm_grad(PyGpuArrayObject *inp, PyGpuArrayObject *doutp, ...@@ -66,7 +68,7 @@ int dnn_batchnorm_grad(PyGpuArrayObject *inp, PyGpuArrayObject *doutp,
betaParam = (void *)&fbeta; betaParam = (void *)&fbeta;
} }
cudnnStatus_t err = cudnnBatchNormalizationBackward( cudnnStatus_t err = cudnnBatchNormalizationBackward(
APPLY_SPECIFIC(_handle), _handle,
MODE, MODE,
alphaData, alphaData,
betaData, betaData,
......
...@@ -3,7 +3,9 @@ ...@@ -3,7 +3,9 @@
int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale, int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
PyGpuArrayObject *bias, PyGpuArrayObject *est_mean, PyGpuArrayObject *bias, PyGpuArrayObject *est_mean,
PyGpuArrayObject *est_var, npy_float64 epsilon, PyGpuArrayObject *est_var, npy_float64 epsilon,
PyGpuArrayObject **outp, PyGpuContextObject *c) { PyGpuArrayObject **outp, cudnnHandle_t _handle) {
PyGpuContextObject *c = inp->context;
if (c_set_tensorNd(inp, bn_input) != 0) if (c_set_tensorNd(inp, bn_input) != 0)
return 1; return 1;
if (c_set_tensorNd(scale, bn_params) != 0) if (c_set_tensorNd(scale, bn_params) != 0)
...@@ -33,7 +35,7 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale, ...@@ -33,7 +35,7 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
beta = (void *)&fbeta; beta = (void *)&fbeta;
} }
cudnnStatus_t err = cudnnBatchNormalizationForwardInference( cudnnStatus_t err = cudnnBatchNormalizationForwardInference(
APPLY_SPECIFIC(_handle), _handle,
MODE, MODE,
alpha, alpha,
beta, beta,
......
#section support_code

/* Create a cuDNN dropout descriptor plus its RNG state buffer.
 *
 * dropout: dropout probability in [0, 1]
 * seed:    RNG seed for the dropout state
 * c:       gpuarray context the state buffer is allocated on
 * odesc:   out, receives the new descriptor (owned by caller)
 * ostates: out, receives a new 1-D GA_UBYTE array holding the RNG state
 *          (owned by caller)
 * _handle: cuDNN handle used to size and seed the state
 *
 * Returns 0 on success, -1 on error with a Python exception set.
 */
int dnn_dropout_desc(float dropout, unsigned long long seed,
                     PyGpuContextObject *c,
                     cudnnDropoutDescriptor_t *odesc,
                     PyGpuArrayObject **ostates,
                     cudnnHandle_t _handle) {
  PyGpuArrayObject *states;
  cudnnDropoutDescriptor_t desc;
  size_t states_sz;
  cudnnStatus_t err;

  cuda_enter(c->ctx);
  err = cudnnCreateDropoutDescriptor(&desc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_SetString(PyExc_RuntimeError, "Can't create dropout descriptor");
    cuda_exit(c->ctx);
    return -1;
  }

  /* Can't fail according to docs */
  cudnnDropoutGetStatesSize(_handle, &states_sz);

  /* RNG state lives on the device; kept alive by the returned array. */
  states = pygpu_empty(1, &states_sz, GA_UBYTE, GA_C_ORDER, c, Py_None);
  if (states == NULL) {
    cudnnDestroyDropoutDescriptor(desc);
    cuda_exit(c->ctx);
    return -1;
  }

  err = cudnnSetDropoutDescriptor(desc, _handle, dropout,
                                  PyGpuArray_DEV_DATA(states),
                                  states_sz, seed);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_SetString(PyExc_RuntimeError, "Can't set dropout descriptor");
    Py_DECREF((PyObject *)states);
    cudnnDestroyDropoutDescriptor(desc);
    cuda_exit(c->ctx);
    return -1;
  }
  cuda_exit(c->ctx);

  /* Transfer ownership of both outputs to the caller only on success. */
  *odesc = desc;
  *ostates = states;
  return 0;
}
#section support_code
/* Forward dropout pass through cuDNN.
 *
 * x:       input array
 * desc:    pointer to a configured dropout descriptor
 * state:   RNG state array associated with *desc (returned as *ostate)
 * y:       out, dropout output (same shape/typecode as x)
 * ostate:  out, new reference to `state`
 * reserve: out, freshly allocated reserve buffer needed by the backward pass
 * _handle: cuDNN handle
 *
 * Returns 0 on success, -1 on error with a Python exception set.
 *
 * Fixes vs. previous revision:
 *  - `res_zs` typo -> `res_sz` (undeclared identifier).
 *  - missing `return -1` after a failed gpudata_alloc (NULL reserve was
 *    then dereferenced by cudnnDropoutForward).
 *  - `y` is a PyGpuArrayObject **; the array itself (*y) must be passed to
 *    c_make_tensorNd and PyGpuArray_DEV_DATA.
 *  - `desc` is a pointer, so cudnnDropoutForward needs *desc.
 *  - cudnnDropoutGetReserveSpaceSize takes the x tensor descriptor, not
 *    the dropout descriptor.
 *  - context type is PyGpuContextObject (as everywhere else in this file).
 */
int dnn_dropout_fwd(PyGpuArrayObject *x,
                    cudnnDropoutDescriptor_t *desc,
                    PyGpuArrayObject *state,
                    PyGpuArrayObject **y,
                    PyGpuArrayObject **ostate,
                    gpudata **reserve,
                    cudnnHandle_t _handle) {
  PyGpuContextObject *c = x->context;
  cudnnTensorDescriptor_t xdesc;
  cudnnTensorDescriptor_t ydesc;
  gpudata *res;
  size_t res_sz;
  cudnnStatus_t err;

  if (c_make_tensorNd(x, &xdesc))
    return -1;

  if (theano_prep_output(y, x->ga.nd, x->ga.dimensions, x->ga.typecode,
                         GA_C_ORDER, c)) {
    cudnnDestroyTensorDescriptor(xdesc);
    return -1;
  }

  /* *y was just (re)allocated by theano_prep_output above. */
  if (c_make_tensorNd(*y, &ydesc)) {
    cudnnDestroyTensorDescriptor(xdesc);
    return -1;
  }

  /* The dropout state is passed through unchanged; hand out a new ref. */
  *ostate = state;
  Py_INCREF((PyObject *)state);

  /* This can't fail according to the docs */
  err = cudnnDropoutGetReserveSpaceSize(xdesc, &res_sz);

  res = gpudata_alloc(c->ctx, res_sz, NULL, 0, NULL);
  if (res == NULL) {
    cudnnDestroyTensorDescriptor(xdesc);
    cudnnDestroyTensorDescriptor(ydesc);
    PyErr_SetString(PyExc_RuntimeError, "Could not allocate reserve for dropout");
    return -1;
  }
  *reserve = res;

  cuda_enter(c->ctx);
  err = cudnnDropoutForward(_handle, *desc, xdesc, PyGpuArray_DEV_DATA(x),
                            ydesc, PyGpuArray_DEV_DATA(*y), *(void **)res,
                            res_sz);
  cudnnDestroyTensorDescriptor(xdesc);
  cudnnDestroyTensorDescriptor(ydesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not run dropout: %s",
                 cudnnGetErrorString(err));
    cuda_exit(c->ctx);
    return -1;
  }
  cuda_exit(c->ctx);
  return 0;
}
...@@ -26,11 +26,12 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -26,11 +26,12 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cudnnConvolutionDescriptor_t desc, cudnnConvolutionDescriptor_t desc,
double alpha, double beta, double alpha, double beta,
PyGpuArrayObject **output, PyGpuArrayObject **output,
PyGpuContextObject *c) { cudnnHandle_t _handle) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS; PyGpuContextObject *c = input->context;
float af = alpha, bf = beta;
void *alpha_p; void *alpha_p;
void *beta_p; void *beta_p;
float af = alpha, bf = beta;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) { if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
...@@ -92,7 +93,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -92,7 +93,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
int count; int count;
cudnnConvolutionFwdAlgoPerf_t choice; cudnnConvolutionFwdAlgoPerf_t choice;
err = cudnnFindConvolutionForwardAlgorithm( err = cudnnFindConvolutionForwardAlgorithm(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns), _handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
desc, APPLY_SPECIFIC(output), 1, &count, &choice); desc, APPLY_SPECIFIC(output), 1, &count, &choice);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
...@@ -115,7 +116,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -115,7 +116,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
} }
err = cudnnGetConvolutionForwardAlgorithm( err = cudnnGetConvolutionForwardAlgorithm(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns), _handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
desc, APPLY_SPECIFIC(output), desc, APPLY_SPECIFIC(output),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo); CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
...@@ -198,7 +199,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -198,7 +199,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
{ {
size_t worksize; size_t worksize;
gpudata *workspace; gpudata *workspace;
err = cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle), err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
APPLY_SPECIFIC(input), APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(kerns),
desc, desc,
...@@ -211,7 +212,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -211,7 +212,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
// TODO: Print a warning // TODO: Print a warning
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
err = cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle), err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
APPLY_SPECIFIC(input), APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(kerns),
desc, desc,
...@@ -248,7 +249,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -248,7 +249,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnConvolutionForward( err = cudnnConvolutionForward(
APPLY_SPECIFIC(_handle), _handle,
alpha_p, alpha_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input), APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns), APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
......
...@@ -25,11 +25,12 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -25,11 +25,12 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
PyGpuArrayObject *im, PyGpuArrayObject *im,
cudnnConvolutionDescriptor_t desc, cudnnConvolutionDescriptor_t desc,
double alpha, double beta, PyGpuArrayObject **input, double alpha, double beta, PyGpuArrayObject **input,
PyGpuContextObject *c) { cudnnHandle_t _handle) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS; PyGpuContextObject *c = kerns->context;
float af = alpha, bf = beta;
void *alpha_p; void *alpha_p;
void *beta_p; void *beta_p;
float af = alpha, bf = beta;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1]) { if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1]) {
PyErr_SetString(PyExc_ValueError, "images and kernel must have the same " PyErr_SetString(PyExc_ValueError, "images and kernel must have the same "
...@@ -93,7 +94,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -93,7 +94,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cudnnConvolutionBwdDataAlgoPerf_t choice; cudnnConvolutionBwdDataAlgoPerf_t choice;
err = cudnnFindConvolutionBackwardDataAlgorithm( err = cudnnFindConvolutionBackwardDataAlgorithm(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc, _handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(input), 1, &count, &choice); APPLY_SPECIFIC(input), 1, &count, &choice);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
...@@ -116,7 +117,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -116,7 +117,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
} }
err = cudnnGetConvolutionBackwardDataAlgorithm( err = cudnnGetConvolutionBackwardDataAlgorithm(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), _handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(input), desc, APPLY_SPECIFIC(input),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, free, &algo); CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
...@@ -193,7 +194,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -193,7 +194,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
gpudata *workspace; gpudata *workspace;
err = cudnnGetConvolutionBackwardDataWorkspaceSize( err = cudnnGetConvolutionBackwardDataWorkspaceSize(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc, _handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(input), algo, &worksize); APPLY_SPECIFIC(input), algo, &worksize);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
...@@ -218,7 +219,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -218,7 +219,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cuda_wait((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_wait((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnConvolutionBackwardData( err = cudnnConvolutionBackwardData(
APPLY_SPECIFIC(_handle), _handle,
alpha_p, alpha_p,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns), APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
......
...@@ -25,11 +25,12 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -25,11 +25,12 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
PyGpuArrayObject *km, PyGpuArrayObject *km,
cudnnConvolutionDescriptor_t desc, cudnnConvolutionDescriptor_t desc,
double alpha, double beta, PyGpuArrayObject **kerns, double alpha, double beta, PyGpuArrayObject **kerns,
PyGpuContextObject *c) { cudnnHandle_t _handle) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS; PyGpuContextObject *c = input->context;
float af = alpha, bf = beta;
void *alpha_p; void *alpha_p;
void *beta_p; void *beta_p;
float af = alpha, bf = beta;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1]) { if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1]) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
...@@ -93,7 +94,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -93,7 +94,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cudnnConvolutionBwdFilterAlgoPerf_t choice; cudnnConvolutionBwdFilterAlgoPerf_t choice;
err = cudnnFindConvolutionBackwardFilterAlgorithm( err = cudnnFindConvolutionBackwardFilterAlgorithm(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc, _handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(kerns), 1, &count, &choice); APPLY_SPECIFIC(kerns), 1, &count, &choice);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
...@@ -117,7 +118,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -117,7 +118,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
} }
err = cudnnGetConvolutionBackwardFilterAlgorithm( err = cudnnGetConvolutionBackwardFilterAlgorithm(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), _handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(kerns), desc, APPLY_SPECIFIC(kerns),
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo); CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
...@@ -181,7 +182,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -181,7 +182,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
gpudata *workspace; gpudata *workspace;
err = cudnnGetConvolutionBackwardFilterWorkspaceSize( err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc, _handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(kerns), algo, &worksize); APPLY_SPECIFIC(kerns), algo, &worksize);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
...@@ -205,7 +206,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -205,7 +206,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnConvolutionBackwardFilter( err = cudnnConvolutionBackwardFilter(
APPLY_SPECIFIC(_handle), _handle,
alpha_p, alpha_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input), APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output),
......
...@@ -42,9 +42,10 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img, ...@@ -42,9 +42,10 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
PyArrayObject *stride, PyArrayObject *stride,
PyArrayObject *pad, PyArrayObject *pad,
PyGpuArrayObject **out, PyGpuArrayObject **out,
PyGpuContextObject *c) { cudnnHandle_t _handle) {
cudnnStatus_t err; PyGpuContextObject *c = img->context;
size_t dims[5]; size_t dims[5];
cudnnStatus_t err;
if (!GpuArray_IS_C_CONTIGUOUS(&img->ga)) { if (!GpuArray_IS_C_CONTIGUOUS(&img->ga)) {
PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported."); PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
...@@ -122,7 +123,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img, ...@@ -122,7 +123,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
cuda_wait((*out)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_wait((*out)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnPoolingForward( err = cudnnPoolingForward(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(pool), _handle, APPLY_SPECIFIC(pool),
alpha, alpha,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(img), APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(img),
beta, beta,
......
...@@ -64,7 +64,8 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp, ...@@ -64,7 +64,8 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
PyArrayObject *stride, PyArrayObject *stride,
PyArrayObject *pad, PyArrayObject *pad,
PyGpuArrayObject **inp_grad, PyGpuArrayObject **inp_grad,
PyGpuContextObject *c) { cudnnHandle_t _handle) {
PyGpuContextObject *c = inp->context;
cudnnStatus_t err; cudnnStatus_t err;
if (!GpuArray_IS_C_CONTIGUOUS(&inp->ga)) { if (!GpuArray_IS_C_CONTIGUOUS(&inp->ga)) {
...@@ -153,7 +154,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp, ...@@ -153,7 +154,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
cuda_wait((*inp_grad)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_wait((*inp_grad)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnPoolingBackward( err = cudnnPoolingBackward(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(pool), _handle, APPLY_SPECIFIC(pool),
alpha, alpha,
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(out), APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(out),
APPLY_SPECIFIC(output_grad), PyGpuArray_DEV_DATA(out_grad), APPLY_SPECIFIC(output_grad), PyGpuArray_DEV_DATA(out_grad),
......
#section support_code
/* Create and configure a cuDNN RNN descriptor.
 *
 * hidden_size, num_layers: RNN geometry
 * ddesc:          dropout descriptor used between layers
 * input_mode, direction_mode, rnn_mode: raw ints cast to the matching
 *                 cuDNN enums
 * dtype:          gpuarray typecode (GA_FLOAT/GA_DOUBLE/GA_HALF)
 * odesc:          out, receives the new descriptor (owned by caller)
 * _handle:        unused here, kept for call-site uniformity
 *
 * Returns 0 on success, -1 on error with a Python exception set.
 */
int dnn_rnn_desc(int hidden_size, int num_layers,
                 cudnnDropoutDescriptor_t ddesc,
                 int input_mode, int direction_mode, int rnn_mode,
                 int dtype, cudnnRNNDescriptor_t *odesc,
                 cudnnHandle_t _handle) {
  cudnnRNNDescriptor_t rd;
  cudnnDataType_t dt;
  cudnnStatus_t status;

  /* Map the gpuarray typecode onto the cuDNN data type. */
  if (dtype == GA_FLOAT) {
    dt = CUDNN_DATA_FLOAT;
  } else if (dtype == GA_DOUBLE) {
    dt = CUDNN_DATA_DOUBLE;
  } else if (dtype == GA_HALF) {
    dt = CUDNN_DATA_HALF;
  } else {
    PyErr_SetString(PyExc_ValueError, "Unsupported data type");
    return -1;
  }

  status = cudnnCreateRNNDescriptor(&rd);
  if (status != CUDNN_STATUS_SUCCESS) {
    PyErr_SetString(PyExc_RuntimeError, "Can't create RNN descriptor");
    return -1;
  }

  status = cudnnSetRNNDescriptor(rd, hidden_size, num_layers, ddesc,
                                 (cudnnRNNInputMode_t)input_mode,
                                 (cudnnDirectionMode_t)direction_mode,
                                 (cudnnRNNMode_t)rnn_mode, dt);
  if (status != CUDNN_STATUS_SUCCESS) {
    /* Don't leak the half-configured descriptor. */
    cudnnDestroyRNNDescriptor(rd);
    PyErr_SetString(PyExc_RuntimeError, "Can't set RNN descriptor");
    return -1;
  }

  *odesc = rd;
  return 0;
}
#section support_code

/* Run the cuDNN RNN forward pass in training mode.
 *
 * desc:    configured RNN descriptor
 * w:       packed weights
 * x:       input, dims read as (seqLength, miniBatch, inputSize)
 * hx:      initial hidden state; dim 2 is hidden_size * num_directions
 * cx:      initial cell state, or NULL for non-LSTM modes
 * reserve: out, training reserve buffer (needed by the backward passes)
 * y:       out, output of shape (seqLength, miniBatch, hiddenSizeDir)
 * hy:      out, final hidden state (same shape as hx)
 * cy:      out, final cell state (same shape as cx), or NULL
 * _handle: cuDNN handle
 *
 * Returns 0 on success, -1 on error with a Python exception set.
 * All temporary descriptors and the workspace are released on both paths;
 * the reserve buffer is handed to the caller.
 */
int dnn_rnn_fwd(cudnnRNNDescriptor_t desc,
                PyGpuArrayObject *w, PyGpuArrayObject *x,
                PyGpuArrayObject *hx, PyGpuArrayObject *cx,
                gpudata **reserve, PyGpuArrayObject **y,
                PyGpuArrayObject **hy, PyGpuArrayObject **cy,
                cudnnHandle_t _handle) {
  PyGpuContextObject *c = x->context;
  cudnnTensorDescriptor_t xdesc = NULL;
  cudnnTensorDescriptor_t hxdesc = NULL;
  cudnnTensorDescriptor_t cxdesc = NULL;
  cudnnTensorDescriptor_t ydesc = NULL;
  cudnnTensorDescriptor_t hydesc = NULL;
  cudnnTensorDescriptor_t cydesc = NULL;
  cudnnFilterDescriptor_t wdesc = NULL;
  /* Per-timestep descriptor arrays; every entry aliases the same
     single-step descriptor, so only the descriptor itself is destroyed. */
  cudnnTensorDescriptor_t *xl = NULL;
  cudnnTensorDescriptor_t *yl = NULL;
  gpudata *workspace = NULL;
  size_t worksize, ressize;
  size_t seqLength = PyGpuArray_DIM(x, 0);
  size_t miniBatch = PyGpuArray_DIM(x, 1);
  size_t inputSize = PyGpuArray_DIM(x, 2);
  size_t hiddenSizeDir = PyGpuArray_DIM(hx, 2);
  size_t shape[3];
  int strs[3], dims[3];
  cudnnStatus_t err;
  cudnnDataType_t dt;
  int res = -1;

  switch (x->ga.typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Unsupported data type for x");
    return -1;
  }

  // This is early to match the exit() in the fail label.
  cuda_enter(c->ctx);

  err = cudnnCreateTensorDescriptor(&xdesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create xdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  /* Single-timestep input descriptor: (miniBatch, inputSize, 1),
     C-contiguous strides. */
  dims[0] = PyGpuArray_DIM(x, 1);
  dims[1] = PyGpuArray_DIM(x, 2);
  dims[2] = 1;

  strs[0] = dims[1] * dims[2];
  strs[1] = dims[2];
  strs[2] = 1;

  err = cudnnSetTensorNdDescriptor(xdesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set xdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  if (c_make_tensorNd(hx, &hxdesc) != 0)
    goto fail;

  /* cx (and thus cy) only exist for LSTM; NULL otherwise. */
  if (cx != NULL)
    if (c_make_tensorNd(cx, &cxdesc) != 0)
      goto fail;

  if (c_make_filter(w, &wdesc) != 0)
    goto fail;

  shape[0] = seqLength;
  shape[1] = miniBatch;
  shape[2] = hiddenSizeDir;
  if (theano_prep_output(y, 3, shape, x->ga.typecode, GA_C_ORDER, c) != 0)
    goto fail;

  err = cudnnCreateTensorDescriptor(&ydesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  /* Single-timestep output descriptor: (miniBatch, hiddenSizeDir, 1). */
  dims[0] = shape[1];
  dims[1] = shape[2];
  dims[2] = 1;

  strs[0] = dims[2] * dims[1];
  strs[1] = dims[2];
  strs[2] = 1;

  err = cudnnSetTensorNdDescriptor(ydesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  if (theano_prep_output(hy, 3, PyGpuArray_DIMS(hx),
                         hx->ga.typecode, GA_C_ORDER, c) != 0)
    goto fail;

  if (c_make_tensorNd(*hy, &hydesc) != 0)
    goto fail;

  if (cy != NULL) {
    if (theano_prep_output(cy, 3, PyGpuArray_DIMS(cx),
                           cx->ga.typecode, GA_C_ORDER, c) != 0)
      goto fail;

    if (c_make_tensorNd(*cy, &cydesc) != 0)
      goto fail;
  }

  /* cuDNN wants one descriptor per timestep; all steps share xdesc. */
  xl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), seqLength);
  if (xl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }

  for (size_t i = 0; i < seqLength; i++)
    xl[i] = xdesc;

  yl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), seqLength);
  if (yl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }

  for (size_t i = 0; i < seqLength; i++)
    yl[i] = ydesc;

  err = cudnnGetRNNWorkspaceSize(_handle, desc, (int)seqLength,
                                 xl, &worksize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not get worksize: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
  if (workspace == NULL) {
    PyErr_Format(PyExc_RuntimeError, "Could not allocate workspace");
    goto fail;
  }

  err = cudnnGetRNNTrainingReserveSize(_handle, desc, (int)seqLength,
                                       xl, &ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not get reserve size: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  /* The reserve outlives this call: the backward passes consume it. */
  *reserve = gpudata_alloc(c->ctx, ressize, NULL, 0, NULL);
  if (*reserve == NULL) {
    PyErr_Format(PyExc_RuntimeError, "Could not allocate reserve");
    goto fail;
  }

  err = cudnnRNNForwardTraining(_handle, desc, (int)seqLength,
                                xl, PyGpuArray_DEV_DATA(x),
                                hxdesc, PyGpuArray_DEV_DATA(hx),
                                cxdesc, cx ? PyGpuArray_DEV_DATA(cx) : NULL,
                                wdesc, PyGpuArray_DEV_DATA(w),
                                yl, PyGpuArray_DEV_DATA(*y),
                                hydesc, PyGpuArray_DEV_DATA(*hy),
                                cydesc, cy ? PyGpuArray_DEV_DATA(*cy) : NULL,
                                *(void **)workspace, worksize,
                                *(void **)(*reserve), ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could run RNN: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  res = 0;
 fail:
  /* Shared cleanup for both success and failure. */
  if (xdesc != NULL)
    cudnnDestroyTensorDescriptor(xdesc);
  if (hxdesc != NULL)
    cudnnDestroyTensorDescriptor(hxdesc);
  if (cxdesc != NULL)
    cudnnDestroyTensorDescriptor(cxdesc);
  if (wdesc != NULL)
    cudnnDestroyFilterDescriptor(wdesc);
  if (ydesc != NULL)
    cudnnDestroyTensorDescriptor(ydesc);
  if (hydesc != NULL)
    cudnnDestroyTensorDescriptor(hydesc);
  if (cydesc != NULL)
    cudnnDestroyTensorDescriptor(cydesc);
  free(xl);
  free(yl);
  if (workspace != NULL)
    gpudata_release(workspace);
  cuda_exit(c->ctx);
  return res;
}
#section support_code

/* cuDNN RNN backward pass w.r.t. the inputs and initial states.
 *
 * desc:     configured RNN descriptor
 * xshp:     inputSize (last dim of the original x), passed as a scalar
 * y, dy:    forward output and its gradient (same shape)
 * w:        packed weights
 * hx, cx:   initial hidden/cell state (cx NULL for non-LSTM)
 * reserve:  reserve buffer produced by the forward pass (read-only here)
 * dhy, dcy: gradients of the final states, either may be NULL
 * oreserve: out, copy of the reserve updated by cudnnRNNBackwardData
 *           (the weight-gradient pass consumes it)
 * dx, dhx, dcx: out, gradients w.r.t. x, hx and cx
 * _handle:  cuDNN handle
 *
 * Returns 0 on success, -1 on error with a Python exception set.
 */
int dnn_rnn_gi(cudnnRNNDescriptor_t desc, npy_uint64 xshp,
               PyGpuArrayObject *y, PyGpuArrayObject *dy,
               PyGpuArrayObject *w, PyGpuArrayObject *hx,
               gpudata *reserve, PyGpuArrayObject *cx,
               PyGpuArrayObject *dhy, PyGpuArrayObject *dcy,
               gpudata **oreserve, PyGpuArrayObject **dx,
               PyGpuArrayObject **dhx, PyGpuArrayObject **dcx,
               cudnnHandle_t _handle) {
  PyGpuContextObject *c = y->context;
  cudnnTensorDescriptor_t ydesc = NULL;
  cudnnTensorDescriptor_t dhydesc = NULL;
  cudnnTensorDescriptor_t dcydesc = NULL;
  cudnnFilterDescriptor_t wdesc = NULL;
  cudnnTensorDescriptor_t hxdesc = NULL;
  cudnnTensorDescriptor_t cxdesc = NULL;
  cudnnTensorDescriptor_t dxdesc = NULL;
  cudnnTensorDescriptor_t dhxdesc = NULL;
  cudnnTensorDescriptor_t dcxdesc = NULL;
  /* Per-timestep descriptor arrays; entries alias a single descriptor. */
  cudnnTensorDescriptor_t *yl = NULL;
  cudnnTensorDescriptor_t *dxl = NULL;
  gpudata *workspace = NULL;
  size_t worksize, ressize;
  size_t seqLength = PyGpuArray_DIM(y, 0);
  size_t miniBatch = PyGpuArray_DIM(y, 1);
  size_t inputSize = xshp;
  size_t shape[3];
  int dims[3], strs[3];
  cudnnStatus_t err;
  cudnnDataType_t dt;
  int res = -1;

  switch (y->ga.typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Unsupported data type for y");
    return -1;
  }

  /* Entered early so the fail label can unconditionally cuda_exit. */
  cuda_enter(c->ctx);

  err = cudnnCreateTensorDescriptor(&ydesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  /* We need to use the last two dimensions for this, this is not a typo */
  dims[0] = PyGpuArray_DIM(y, 1);
  dims[1] = PyGpuArray_DIM(y, 2);
  dims[2] = 1;

  strs[0] = dims[2] * dims[1];
  strs[1] = dims[2];
  strs[2] = 1;

  err = cudnnSetTensorNdDescriptor(ydesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  if (dhy != NULL)
    if (c_make_tensorNd(dhy, &dhydesc) != 0)
      goto fail;

  if (dcy != NULL)
    if (c_make_tensorNd(dcy, &dcydesc) != 0)
      goto fail;

  if (c_make_filter(w, &wdesc) != 0)
    goto fail;

  if (c_make_tensorNd(hx, &hxdesc) != 0)
    goto fail;

  if (cx != NULL)
    if (c_make_tensorNd(cx, &cxdesc) != 0)
      goto fail;

  /* dx has the shape of the forward input: (seq, batch, inputSize). */
  shape[0] = seqLength;
  shape[1] = miniBatch;
  shape[2] = inputSize;
  if (theano_prep_output(dx, 3, shape, y->ga.typecode, GA_C_ORDER, c) != 0)
    goto fail;

  err = cudnnCreateTensorDescriptor(&dxdesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create dxdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  /* Again not a typo, we need to use the last two dimensions */
  dims[0] = shape[1];
  dims[1] = shape[2];
  dims[2] = 1;

  strs[0] = dims[2] * dims[1];
  strs[1] = dims[2];
  strs[2] = 1;

  err = cudnnSetTensorNdDescriptor(dxdesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set dxdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  if (theano_prep_output(dhx, 3, PyGpuArray_DIMS(hx), hx->ga.typecode,
                         GA_C_ORDER, c) != 0)
    goto fail;

  if (c_make_tensorNd(*dhx, &dhxdesc) != 0)
    goto fail;

  if (cx != NULL) {
    if (theano_prep_output(dcx, 3, PyGpuArray_DIMS(cx), cx->ga.typecode,
                           GA_C_ORDER, c) != 0)
      goto fail;

    if (c_make_tensorNd(*dcx, &dcxdesc) != 0)
      goto fail;
  }

  yl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), seqLength);
  if (yl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }

  for (size_t i = 0; i < seqLength; i++)
    yl[i] = ydesc;

  dxl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), seqLength);
  if (dxl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }

  for (size_t i = 0; i < seqLength; i++)
    dxl[i] = dxdesc;

  err = cudnnGetRNNWorkspaceSize(_handle, desc, (int)seqLength, dxl, &worksize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not get worksize: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
  if (workspace == NULL) {
    PyErr_Format(PyExc_RuntimeError, "Could not allocate workspace");
    goto fail;
  }

  err = cudnnGetRNNTrainingReserveSize(_handle, desc, (int)seqLength,
                                       dxl, &ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not get reserve size: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  /* The incoming reserve is copied so this op does not clobber the
     buffer other nodes may still read; cuDNN updates the copy in place. */
  *oreserve = gpudata_alloc(c->ctx, ressize, NULL, 0, NULL);
  if (*oreserve == NULL) {
    PyErr_Format(PyExc_RuntimeError, "Could not allocate reserve");
    goto fail;
  }

  if (gpudata_move(*oreserve, 0, reserve, 0, ressize) != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "could not copy reserve");
    goto fail;
  }

  err = cudnnRNNBackwardData(_handle, desc, (int)seqLength,
                             yl, PyGpuArray_DEV_DATA(y),
                             /* y and dy are the same shape */
                             yl, PyGpuArray_DEV_DATA(dy),
                             dhydesc, dhy ? PyGpuArray_DEV_DATA(dhy) : NULL,
                             dcydesc, dcy ? PyGpuArray_DEV_DATA(dcy) : NULL,
                             wdesc, PyGpuArray_DEV_DATA(w),
                             hxdesc, PyGpuArray_DEV_DATA(hx),
                             cxdesc, cx ? PyGpuArray_DEV_DATA(cx) : NULL,
                             dxl, PyGpuArray_DEV_DATA(*dx),
                             dhxdesc, PyGpuArray_DEV_DATA(*dhx),
                             dcxdesc, dcx ? PyGpuArray_DEV_DATA(*dcx) : NULL,
                             *(void **)workspace, worksize,
                             *(void **)(*oreserve), ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could run RNN grad inputs: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  res = 0;
 fail:
  /* Shared cleanup for both success and failure. */
  if (ydesc != NULL)
    cudnnDestroyTensorDescriptor(ydesc);
  if (dhydesc != NULL)
    cudnnDestroyTensorDescriptor(dhydesc);
  if (dcydesc != NULL)
    cudnnDestroyTensorDescriptor(dcydesc);
  if (wdesc != NULL)
    cudnnDestroyFilterDescriptor(wdesc);
  if (hxdesc != NULL)
    cudnnDestroyTensorDescriptor(hxdesc);
  if (cxdesc != NULL)
    cudnnDestroyTensorDescriptor(cxdesc);
  if (dxdesc != NULL)
    cudnnDestroyTensorDescriptor(dxdesc);
  if (dhxdesc != NULL)
    cudnnDestroyTensorDescriptor(dhxdesc);
  if (dcxdesc != NULL)
    cudnnDestroyTensorDescriptor(dcxdesc);
  free(yl);
  free(dxl);
  if (workspace != NULL)
    gpudata_release(workspace);
  cuda_exit(c->ctx);
  return res;
}
#section support_code
/*
 * Compute the gradient of the RNN weights via cudnnRNNBackwardWeights.
 *
 * desc:    configured cuDNN RNN descriptor
 * _wsize:  number of elements of the flat weight buffer (used as the
 *          single dimension of the allocated *dw)
 * x:       input sequence; dim 0 is the number of timesteps, dims 1-2
 *          describe one timestep (assumed (batch, features) -- see the
 *          descriptor setup below)
 * hx:      initial hidden state
 * y:       output sequence from the forward pass (same layout as x)
 * reserve: training reserve buffer produced by the forward pass
 * dw:      output; allocated (or reused) here and zero-filled before
 *          the call -- presumably because cuDNN accumulates into it
 *          (per cuDNN docs; confirm for the targeted version)
 * _handle: cuDNN handle
 *
 * Returns 0 on success, -1 on error with a Python exception set.
 */
int dnn_rnn_gw(cudnnRNNDescriptor_t desc, npy_uint64 _wsize,
               PyGpuArrayObject *x, PyGpuArrayObject *hx,
               PyGpuArrayObject *y, gpudata *reserve,
               PyGpuArrayObject **dw, cudnnHandle_t _handle) {
  PyGpuContextObject *c = x->context;
  cudnnTensorDescriptor_t xdesc = NULL;
  cudnnTensorDescriptor_t hxdesc = NULL;
  cudnnTensorDescriptor_t ydesc = NULL;
  cudnnFilterDescriptor_t dwdesc = NULL;
  /* Per-timestep descriptor arrays required by the (pre-v8) cuDNN RNN
     API; every entry aliases the same single-step descriptor. */
  cudnnTensorDescriptor_t *xl = NULL;
  cudnnTensorDescriptor_t *yl = NULL;
  gpudata *workspace = NULL;
  size_t worksize, ressize;
  size_t iters = PyGpuArray_DIM(x, 0);  /* sequence length */
  size_t wsize = _wsize;
  int dims[3], strs[3];
  cudnnStatus_t err;
  cudnnDataType_t dt;
  int res = -1;

  /* Map the gpuarray typecode to the cuDNN data type. */
  switch (x->ga.typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_TypeError, "Unsupported data type for x");
    return -1;
  }

  // This is early to match the exit() in the fail label.
  cuda_enter(c->ctx);

  err = cudnnCreateTensorDescriptor(&xdesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create xdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  /* We need to use the last two dimensions for this, this is not a typo */
  /* One timestep, fully packed: (dim1, dim2, 1) with C-contiguous strides. */
  dims[0] = PyGpuArray_DIM(x, 1);
  dims[1] = PyGpuArray_DIM(x, 2);
  dims[2] = 1;

  strs[0] = dims[2] * dims[1];
  strs[1] = dims[2];
  strs[2] = 1;

  err = cudnnSetTensorNdDescriptor(xdesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set xdesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  if (c_make_tensorNd(hx, &hxdesc) != 0)
    goto fail;

  err = cudnnCreateTensorDescriptor(&ydesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not create ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  /* Again not a typo, we need to use the last two dimensions */
  dims[0] = PyGpuArray_DIM(y, 1);
  dims[1] = PyGpuArray_DIM(y, 2);
  dims[2] = 1;

  strs[0] = dims[2] * dims[1];
  strs[1] = dims[2];
  strs[2] = 1;

  err = cudnnSetTensorNdDescriptor(ydesc, dt, 3, dims, strs);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set ydesc: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  /* Allocate the flat gradient buffer and clear it.
     NOTE(review): the return value of GpuArray_memset is ignored. */
  if (theano_prep_output(dw, 1, &wsize, x->ga.typecode, GA_C_ORDER, c) != 0)
    goto fail;

  GpuArray_memset(&(*dw)->ga, 0);

  if (c_make_filter(*dw, &dwdesc) != 0)
    goto fail;

  /* Build the per-timestep descriptor arrays; all steps share shape,
     so each entry is the same descriptor object. */
  xl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), iters);
  if (xl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }

  for (size_t i = 0; i < iters; i++)
    xl[i] = xdesc;

  yl = (cudnnTensorDescriptor_t *)calloc(sizeof(cudnnTensorDescriptor_t), iters);
  if (yl == NULL) {
    PyErr_NoMemory();
    goto fail;
  }

  for (size_t i = 0; i < iters; i++)
    yl[i] = ydesc;

  err = cudnnGetRNNWorkspaceSize(_handle, desc, (int)iters,
                                 xl, &worksize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not get worksize: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
  if (workspace == NULL) {
    PyErr_Format(PyExc_RuntimeError, "Could not allocate workspace");
    goto fail;
  }

  /* ressize is queried to pass the correct size below; the reserve
     buffer itself comes from the caller (forward pass). */
  err = cudnnGetRNNTrainingReserveSize(_handle, desc, (int)iters,
                                       xl, &ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not get reserve size: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  /* *(void **) pulls the raw device pointer out of the opaque gpudata
     (CUDA backend layout -- presumably the pointer is the first
     member; confirm against libgpuarray). */
  err = cudnnRNNBackwardWeights(_handle, desc, (int)iters,
                                xl, PyGpuArray_DEV_DATA(x),
                                hxdesc, PyGpuArray_DEV_DATA(hx),
                                yl, PyGpuArray_DEV_DATA(y),
                                *(void **)workspace, worksize,
                                dwdesc, PyGpuArray_DEV_DATA(*dw),
                                *(void **)reserve, ressize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could run RNN grad weights: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }

  res = 0;
 fail:
  /* xl/yl alias xdesc/ydesc, so each descriptor is destroyed once. */
  if (xdesc != NULL)
    cudnnDestroyTensorDescriptor(xdesc);
  if (hxdesc != NULL)
    cudnnDestroyTensorDescriptor(hxdesc);
  if (ydesc != NULL)
    cudnnDestroyTensorDescriptor(ydesc);
  if (dwdesc != NULL)
    cudnnDestroyFilterDescriptor(dwdesc);
  free(xl);
  free(yl);
  if (workspace != NULL)
    gpudata_release(workspace);
  cuda_exit(c->ctx);
  return res;
}
#section support_code
/*
 * Query the size in bytes of the cuDNN RNN parameter buffer.
 *
 * desc:        configured cuDNN RNN descriptor
 * isize:       length-2 array (batch, input_size) describing one input step
 * typecode:    gpuarray typecode of the data (float/double/half)
 * oparam_size: output; receives the parameter buffer size
 * _handle:     cuDNN handle
 *
 * Returns 0 on success, -1 on error with a Python exception set.
 *
 * Fix: the original leaked `xdesc` on the error paths after
 * cudnnCreateTensorDescriptor succeeded (early `return -1` without
 * destroying the descriptor); cleanup now goes through a single exit.
 */
int dnn_rnn_paramsize(cudnnRNNDescriptor_t desc,
                      PyArrayObject *isize,
                      npy_int32 typecode,
                      npy_uint64 *oparam_size,
                      cudnnHandle_t _handle) {
  cudnnTensorDescriptor_t xdesc = NULL;
  size_t param_size;
  cudnnStatus_t err;
  cudnnDataType_t dt;
  int shape[3];
  int strides[3];
  int res = -1;

  if (PyArray_DIM(isize, 0) != 2) {
    PyErr_SetString(PyExc_ValueError, "input_size should be of length two");
    return -1;
  }

  /* Map the gpuarray typecode to the cuDNN data type. */
  switch (typecode) {
  case GA_FLOAT:
    dt = CUDNN_DATA_FLOAT;
    break;
  case GA_DOUBLE:
    dt = CUDNN_DATA_DOUBLE;
    break;
  case GA_HALF:
    dt = CUDNN_DATA_HALF;
    break;
  default:
    PyErr_SetString(PyExc_ValueError, "Unsupported data type");
    return -1;
  }

  err = cudnnCreateTensorDescriptor(&xdesc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_SetString(PyExc_RuntimeError, "Could not create tensor descriptor");
    return -1;
  }

  /* Single-timestep descriptor: (batch, input_size, 1), fully packed. */
  shape[0] = *(npy_uint64 *)PyArray_GETPTR1(isize, 0);
  shape[1] = *(npy_uint64 *)PyArray_GETPTR1(isize, 1);
  shape[2] = 1;
  strides[0] = shape[2] * shape[1];
  strides[1] = shape[2];
  strides[2] = 1;
  err = cudnnSetTensorNdDescriptor(xdesc, dt, 3, shape, strides);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "Could not set tensor descriptor: %s",
                 cudnnGetErrorString(err));
    goto fail;
  }
  err = cudnnGetRNNParamsSize(_handle, desc, xdesc, &param_size, dt);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_SetString(PyExc_RuntimeError, "Could not get parameter size");
    goto fail;
  }
  *oparam_size = param_size;
  res = 0;
 fail:
  /* Destroy the descriptor on success *and* failure paths. */
  cudnnDestroyTensorDescriptor(xdesc);
  return res;
}
...@@ -35,7 +35,8 @@ if (APPLY_SPECIFIC(output) != NULL) ...@@ -35,7 +35,8 @@ if (APPLY_SPECIFIC(output) != NULL)
int APPLY_SPECIFIC(softmax)(PyGpuArrayObject *x, int APPLY_SPECIFIC(softmax)(PyGpuArrayObject *x,
PyGpuArrayObject **out, PyGpuArrayObject **out,
PyGpuContextObject *c) { cudnnHandle_t _handle) {
PyGpuContextObject *c = x->context;
cudnnStatus_t err; cudnnStatus_t err;
if (c_set_tensorNd(x, APPLY_SPECIFIC(input)) != 0) if (c_set_tensorNd(x, APPLY_SPECIFIC(input)) != 0)
...@@ -77,7 +78,7 @@ int APPLY_SPECIFIC(softmax)(PyGpuArrayObject *x, ...@@ -77,7 +78,7 @@ int APPLY_SPECIFIC(softmax)(PyGpuArrayObject *x,
cuda_wait((*out)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_wait((*out)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnSoftmaxForward( err = cudnnSoftmaxForward(
APPLY_SPECIFIC(_handle), _handle,
SOFTMAX_ALGO, SOFTMAX_ALGO,
SOFTMAX_MODE, SOFTMAX_MODE,
alpha, alpha,
......
...@@ -46,7 +46,8 @@ if (APPLY_SPECIFIC(dx) != NULL) ...@@ -46,7 +46,8 @@ if (APPLY_SPECIFIC(dx) != NULL)
int APPLY_SPECIFIC(softmax_grad)(PyGpuArrayObject *dy, int APPLY_SPECIFIC(softmax_grad)(PyGpuArrayObject *dy,
PyGpuArrayObject *sm, PyGpuArrayObject *sm,
PyGpuArrayObject **dx, PyGpuArrayObject **dx,
PyGpuContextObject *c) { cudnnHandle_t _handle) {
PyGpuContextObject *c = dy->context;
cudnnStatus_t err; cudnnStatus_t err;
if (c_set_tensorNd(dy, APPLY_SPECIFIC(dy)) != 0) if (c_set_tensorNd(dy, APPLY_SPECIFIC(dy)) != 0)
...@@ -91,7 +92,7 @@ int APPLY_SPECIFIC(softmax_grad)(PyGpuArrayObject *dy, ...@@ -91,7 +92,7 @@ int APPLY_SPECIFIC(softmax_grad)(PyGpuArrayObject *dy,
cuda_wait((*dx)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_wait((*dx)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnSoftmaxBackward( err = cudnnSoftmaxBackward(
APPLY_SPECIFIC(_handle), _handle,
SOFTMAX_ALGO, SOFTMAX_ALGO,
SOFTMAX_MODE, SOFTMAX_MODE,
alpha, alpha,
......
from __future__ import absolute_import, print_function, division
import theano
import theano.tensor as T
import numpy
class Model(object):
    """Aggregate of layers: collects their parameters and extra updates.

    ``other_updates`` maps update targets to their update expressions,
    gathered from any layer exposing an ``other_updates`` attribute.
    """

    def __init__(self, name=""):
        self.name = name
        self.layers = []
        self.params = []
        self.other_updates = {}

    def add_layer(self, layer):
        """Register ``layer`` and absorb its parameters and extra updates."""
        self.layers.append(layer)
        self.params.extend(layer.params)
        # Layers without the attribute contribute no extra updates.
        for upd in getattr(layer, 'other_updates', ()):
            self.other_updates[upd[0]] = upd[1]

    def get_params(self):
        """Return the list of all registered shared parameters."""
        return self.params
def uniform(stdev, size):
    """Sample a uniform array whose standard deviation is ``stdev``.

    The bounds +/- stdev*sqrt(3) give exactly the requested std-dev;
    the result is cast to the configured floatX.
    """
    bound = stdev * numpy.sqrt(3)
    sample = numpy.random.uniform(low=-bound, high=bound, size=size)
    return sample.astype(theano.config.floatX)
def linear_transform_weights(input_dim, output_dim,
                             param_list=None, name=""):
    """Create a theano shared weight matrix of shape (input_dim, output_dim).

    Initialized uniformly with std-dev sqrt(2 / input_dim) (He-style).
    The new shared variable is appended to ``param_list``, which is
    therefore required.

    Raises
    ------
    ValueError
        If ``param_list`` is None.
    """
    # Explicit check instead of `assert`: asserts are stripped under
    # `python -O`, which would silently skip the validation.
    if param_list is None:
        raise ValueError("param_list must be provided")
    weight_initialization = uniform(numpy.sqrt(2.0 / input_dim),
                                    (input_dim, output_dim))
    W = theano.shared(weight_initialization, name=name)
    param_list.append(W)
    return W
def bias_weights(length, param_list=None, name=""):
    """Zero-initialized theano shared bias vector of the given length.

    When ``param_list`` is provided, the new variable is appended to it.
    """
    initial = numpy.zeros(length).astype(theano.config.floatX)
    bias = theano.shared(initial, name=name)
    if param_list is not None:
        param_list.append(bias)
    return bias
class Layer(object):
    """Base template for layers.

    Subclasses fill ``self.params`` with their shared variables and
    implement ``output()``.
    """

    def __init__(self, name=""):
        self.params = []
        self.name = name

    def get_params(self):
        """Return this layer's list of shared parameters."""
        return self.params
class GRU(Layer):
    """Gated Recurrent Unit layer built with theano.scan.

    Reference (scan-based) implementation used to validate the cuDNN
    RNN bindings.  Exposes the full hidden-state sequence as ``Y``.
    """

    def __init__(self, input_dim, output_dim, input_layer, s0=None, name=""):
        '''Layers information'''
        self.name = name
        self.input_dim = input_dim
        self.hidden_dim = output_dim
        self.output_dim = output_dim
        self.input_layer = input_layer
        self.X = input_layer.output()
        self.s0 = s0  # symbolic initial hidden state, fed to scan's outputs_info
        self.params = []

        '''Layers weights'''

        '''self.params is passed so that any paramters could be appended to it'''
        # Input-to-hidden parameters (W_*, b_w*) and hidden-to-hidden
        # parameters (R_*, b_r*) for the reset (r), update (i) and
        # candidate (h) gates.
        self.W_r = linear_transform_weights(input_dim, output_dim, param_list=self.params, name=name + ".W_r")
        self.b_wr = bias_weights((output_dim,), param_list=self.params, name=name + ".b_wr")

        self.W_i = linear_transform_weights(input_dim, output_dim, param_list=self.params, name=name + ".W_i")
        self.b_wi = bias_weights((output_dim,), param_list=self.params, name=name + ".b_wi")

        self.W_h = linear_transform_weights(input_dim, output_dim, param_list=self.params, name=name + ".W_h")
        self.b_wh = bias_weights((output_dim,), param_list=self.params, name=name + ".b_wh")

        self.R_r = linear_transform_weights(output_dim, output_dim, param_list=self.params, name=name + ".R_r")
        self.b_rr = bias_weights((output_dim,), param_list=self.params, name=name + ".b_rr")

        self.R_i = linear_transform_weights(output_dim, output_dim, param_list=self.params, name=name + ".R_i")
        # NOTE(review): the recurrent update-gate bias is named b_ru,
        # breaking the b_r<gate> pattern of the others -- presumably
        # intentional; confirm before renaming.
        self.b_ru = bias_weights((output_dim,), param_list=self.params, name=name + ".b_ru")

        self.R_h = linear_transform_weights(output_dim, output_dim, param_list=self.params, name=name + ".R_h")
        self.b_rh = bias_weights((output_dim,), param_list=self.params, name=name + ".b_rh")

        '''step through processed input to create output'''
        def step(inp, s_prev):
            # i_t plays the role of the GRU update gate.
            i_t = T.nnet.sigmoid(
                T.dot(inp, self.W_i) + T.dot(s_prev, self.R_i) + self.b_wi + self.b_ru)
            r_t = T.nnet.sigmoid(
                T.dot(inp, self.W_r) + T.dot(s_prev, self.R_r) + self.b_wr + self.b_rr)
            # Candidate state: the reset gate scales only the recurrent term.
            h_hat_t = T.tanh(
                T.dot(inp, self.W_h) + (r_t * (T.dot(s_prev, self.R_h) + self.b_rh)) + self.b_wh)
            # Interpolate between candidate and previous hidden state.
            s_curr = ((1.0 - i_t) * h_hat_t) + (i_t * s_prev)
            return s_curr

        outputs_info = self.s0

        states, updates = theano.scan(
            fn=step,
            sequences=[self.X],
            outputs_info=outputs_info
        )

        self.Y = states  # full hidden-state sequence (time axis first)

    def output(self):
        """Symbolic hidden-state sequence of this layer."""
        return self.Y
class LSTM(Layer):
    """Long Short-Term Memory layer built with theano.scan.

    Reference (scan-based) implementation used to validate the cuDNN
    RNN bindings.  Exposes the hidden-state sequence as ``Y`` and the
    cell-state sequence as ``C``.
    """

    def __init__(self, input_dim, output_dim, input_layer, s0=None, c0=None,
                 name=""):
        '''Layers information'''
        self.name = name
        self.input_dim = input_dim
        self.hidden_dim = output_dim
        self.output_dim = output_dim
        self.input_layer = input_layer
        self.X = input_layer.output()
        self.s0 = s0  # symbolic initial hidden state
        self.c0 = c0  # symbolic initial cell state
        self.params = []

        '''Layers weights'''

        '''self.params is passed so that any paramters could be appended to it'''
        # Input-to-hidden parameters (W_*, b_w*) and hidden-to-hidden
        # parameters (R_*, b_r*) for the input (i), forget (f),
        # cell-candidate (c) and output (o) gates.
        self.W_i = linear_transform_weights(input_dim, output_dim, param_list=self.params, name=name + ".W_i")
        self.b_wi = bias_weights((output_dim,), param_list=self.params, name=name + ".b_wi")

        self.W_f = linear_transform_weights(input_dim, output_dim, param_list=self.params, name=name + ".W_f")
        self.b_wf = bias_weights((output_dim,), param_list=self.params, name=name + ".b_wf")

        self.W_c = linear_transform_weights(input_dim, output_dim, param_list=self.params, name=name + ".W_c")
        self.b_wc = bias_weights((output_dim,), param_list=self.params, name=name + ".b_wc")

        self.W_o = linear_transform_weights(input_dim, output_dim, param_list=self.params, name=name + ".W_o")
        self.b_wo = bias_weights((output_dim,), param_list=self.params, name=name + ".b_wo")

        self.R_i = linear_transform_weights(output_dim, output_dim, param_list=self.params, name=name + ".R_i")
        self.b_ri = bias_weights((output_dim,), param_list=self.params, name=name + ".b_ri")

        self.R_f = linear_transform_weights(output_dim, output_dim, param_list=self.params, name=name + ".R_f")
        self.b_rf = bias_weights((output_dim,), param_list=self.params, name=name + ".b_rf")

        self.R_c = linear_transform_weights(output_dim, output_dim, param_list=self.params, name=name + ".R_c")
        self.b_rc = bias_weights((output_dim,), param_list=self.params, name=name + ".b_rc")

        self.R_o = linear_transform_weights(output_dim, output_dim, param_list=self.params, name=name + ".R_o")
        self.b_ro = bias_weights((output_dim,), param_list=self.params, name=name + ".b_ro")

        '''step through processed input to create output'''
        def step(x_t, h_tm1, c_tm1):
            # Standard LSTM gate equations.
            i_t = T.nnet.sigmoid(
                T.dot(x_t, self.W_i) + T.dot(h_tm1, self.R_i) + self.b_wi + self.b_ri)
            f_t = T.nnet.sigmoid(
                T.dot(x_t, self.W_f) + T.dot(h_tm1, self.R_f) + self.b_wf + self.b_rf)
            o_t = T.nnet.sigmoid(
                T.dot(x_t, self.W_o) + T.dot(h_tm1, self.R_o) + self.b_ro + self.b_wo)
            c_hat_t = T.tanh(
                T.dot(x_t, self.W_c) + T.dot(h_tm1, self.R_c) + self.b_wc + self.b_rc)
            # New cell state: keep a gated part of the old state and add
            # the gated candidate; hidden state is the gated tanh of it.
            c_t = f_t * c_tm1 + i_t * c_hat_t
            h_t = o_t * T.tanh(c_t)
            return h_t, c_t

        outputs_info = [self.s0, self.c0]

        states, updates = theano.scan(
            fn=step,
            sequences=[self.X],
            outputs_info=outputs_info
        )

        self.Y = states[0]  # hidden-state sequence
        self.C = states[1]  # cell-state sequence

    def output(self):
        """Symbolic hidden-state sequence of this layer."""
        return self.Y
class FC(Layer):
    """Fully-connected layer: ``output = X . W + b``."""

    def __init__(self, input_dim, output_dim, input_layer, name=""):
        self.name = name
        self.input_layer = input_layer
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.params = []
        self.X = self.input_layer.output()
        # Weight matrix and bias; both register themselves in self.params.
        self.W = linear_transform_weights(input_dim, output_dim,
                                          param_list=self.params,
                                          name=name + ".W")
        self.b = bias_weights((output_dim,), param_list=self.params,
                              name=name + ".b")

    def output(self):
        """Affine transform of the input layer's output."""
        return T.dot(self.X, self.W) + self.b
class WrapperLayer(Layer):
    """Wrap a raw symbolic variable so it can head a layer pipeline."""

    def __init__(self, X, name=""):
        self.name = name
        self.X = X
        self.params = []  # a wrapper owns no parameters

    def output(self):
        """Return the wrapped variable unchanged."""
        return self.X
...@@ -19,6 +19,7 @@ from ..type import gpuarray_shared_constructor ...@@ -19,6 +19,7 @@ from ..type import gpuarray_shared_constructor
from .config import mode_with_gpu, mode_without_gpu, test_ctx_name from .config import mode_with_gpu, mode_without_gpu, test_ctx_name
from . import test_nnet from . import test_nnet
from .rnn_support import Model, GRU, LSTM, WrapperLayer
from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_FWD from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_FWD
...@@ -1434,3 +1435,249 @@ def test_batchnorm_inference(): ...@@ -1434,3 +1435,249 @@ def test_batchnorm_inference():
utt.assert_allclose(outputs[4], outputs[4 + 5]) # dbias utt.assert_allclose(outputs[4], outputs[4 + 5]) # dbias
utt.assert_allclose(outputs[5], outputs[5 + 5]) # dmean utt.assert_allclose(outputs[5], outputs[5 + 5]) # dmean
utt.assert_allclose(outputs[6], outputs[6 + 5], rtol=2e-3, atol=4e-5) # dvar utt.assert_allclose(outputs[6], outputs[6 + 5], rtol=2e-3, atol=4e-5) # dvar
def test_dnn_rnn_gru():
    """Compare the cuDNN GRU forward pass and gradients (through y, hy,
    or both) against a theano.scan reference implementation.

    Fix: the nested ``funcs`` helper tested ``if out:`` / ``if hy:`` on
    symbolic Theano variables; truthiness of a symbolic variable is not
    a reliable "was an output provided" check.  The intent is a
    None-check, so compare with ``is not None`` explicitly.
    """
    # test params
    input_dim = 32
    hidden_dim = 16
    batch_size = 2
    depth = 3
    timesteps = 5

    # test code
    X = T.tensor3('X')
    Y = T.tensor3('Y')  # target used only to build an MSE cost
    h0 = T.tensor3('h0')

    rnnb = dnn.RNNBlock(theano.config.floatX, hidden_dim, depth, 'gru')
    psize = rnnb.get_param_size([batch_size, input_dim])
    # Single flat buffer holding all cuDNN parameters for every layer.
    params_cudnn = gpuarray_shared_constructor(
        numpy.zeros((psize,), dtype=theano.config.floatX))

    model = Model()
    last_layer = WrapperLayer(X)
    last_dim = input_dim
    for i in range(depth):
        gru = GRU(last_dim, hidden_dim, last_layer, s0=h0[i, :, :])
        model.add_layer(gru)
        last_layer = gru
        last_dim = hidden_dim
        # Copy the reference layer parameters into the matching slices
        # of the flat cuDNN buffer so both implementations start equal.
        layer_params = gru.get_params()
        dnn_params = rnnb.split_params(params_cudnn, i,
                                       [batch_size, input_dim])
        for j, p in enumerate(dnn_params):
            p[:] = layer_params[j].get_value(borrow=True,
                                             return_internal_type=True)

    def funcs(out, params, hy=None):
        # Build a gradient function for a cost connected to `out`,
        # `hy`, or both.  These are symbolic variables, so test for
        # presence with `is not None` rather than truthiness.
        cost = 0
        if out is not None:
            cost += T.mean((Y - out)**2)
        if hy is not None:
            cost += T.mean(hy**2)
        grad = T.grad(cost, [X, h0] + params)
        grad_fn = theano.function([X, Y, h0], grad, mode=mode_with_gpu,
                                  on_unused_input='ignore')
        return grad_fn

    ref_y = last_layer.output()
    # This will grab the hy from the scan implementation
    ref_hy = T.stack([model.layers[0].Y[-1],
                      model.layers[1].Y[-1],
                      model.layers[2].Y[-1]])
    y, hy = rnnb.apply(params_cudnn, X, h0)

    ref_fn = theano.function([X, h0], ref_y, mode=mode_with_gpu)
    cudnn_fn = theano.function([X, h0], y, mode=mode_with_gpu)

    # Test with grad connected to y
    ref_grad_fn = funcs(ref_y, model.get_params())
    cudnn_grad_fn = funcs(y, [params_cudnn])
    # Test with grad connected to both y and hy
    ref2_grad_fn = funcs(ref_y, model.get_params(), ref_hy)
    cudnn2_grad_fn = funcs(y, [params_cudnn], hy)
    # Test with grad connected to hy
    ref3_grad_fn = funcs(None, model.get_params(), ref_hy)
    cudnn3_grad_fn = funcs(None, [params_cudnn], hy)

    ref_grad_fns = [ref_grad_fn, ref2_grad_fn, ref3_grad_fn]
    cudnn_grad_fns = [cudnn_grad_fn, cudnn2_grad_fn, cudnn3_grad_fn]

    x_val = numpy.random.random((timesteps, batch_size, input_dim)).astype(theano.config.floatX)
    y_val = numpy.random.random((timesteps, batch_size, hidden_dim)).astype(theano.config.floatX)
    h0_val = numpy.random.random((depth, batch_size, hidden_dim)).astype(theano.config.floatX)

    ref_out = ref_fn(x_val, h0_val)
    cudnn_out = cudnn_fn(x_val, h0_val)
    utt.assert_allclose(ref_out, cudnn_out)

    for ref_grad_fn, cudnn_grad_fn in zip(ref_grad_fns, cudnn_grad_fns):
        ref_grads = ref_grad_fn(x_val, y_val, h0_val)
        cudnn_grads = cudnn_grad_fn(x_val, y_val, h0_val)
        utt.assert_allclose(ref_grads[0], cudnn_grads[0])  # dX
        utt.assert_allclose(ref_grads[1], cudnn_grads[1])  # dh0
        # The cuDNN parameter gradient is one flat vector; split it per
        # layer and compare with the per-parameter reference gradients.
        ref_grad_params = ref_grads[2:]
        cudnn_grad_params = gpuarray_shared_constructor(cudnn_grads[2])
        for i in range(depth):
            cudnn_grad_layer = rnnb.split_params(cudnn_grad_params, i,
                                                 [batch_size, input_dim])
            ref_grad_layer = ref_grad_params[i * len(cudnn_grad_layer):
                                             (i + 1) * len(cudnn_grad_layer)]
            for j, g in enumerate(cudnn_grad_layer):
                utt.assert_allclose(ref_grad_layer[j], g)
def test_dnn_rnn_lstm():
    # Compare the cuDNN LSTM forward pass and gradients (w.r.t. X, h0,
    # c0 and the parameters) against a theano.scan reference.

    # test params
    input_dim = 32
    hidden_dim = 16
    batch_size = 2
    depth = 3
    timesteps = 5

    # test code
    X = T.tensor3('X')
    Y = T.tensor3('Y')  # target used only to build an MSE cost
    h0 = T.tensor3('h0')
    c0 = T.tensor3('c0')

    rnnb = dnn.RNNBlock(theano.config.floatX, hidden_dim, depth, 'lstm')
    psize = rnnb.get_param_size([batch_size, input_dim])
    # Single flat buffer holding all cuDNN parameters for every layer.
    params_cudnn = gpuarray_shared_constructor(
        numpy.zeros((psize,), dtype=theano.config.floatX))

    model = Model()
    last_layer = WrapperLayer(X)
    last_dim = input_dim
    for i in range(depth):
        lstm = LSTM(last_dim, hidden_dim, last_layer, s0=h0[i, :, :], c0=c0[i, :, :])
        model.add_layer(lstm)
        last_layer = lstm
        last_dim = hidden_dim
        # Copy the reference layer parameters into the matching slices
        # of the flat cuDNN buffer so both implementations start equal.
        layer_params = lstm.get_params()
        dnn_params = rnnb.split_params(params_cudnn, i,
                                       [batch_size, input_dim])
        for j, p in enumerate(dnn_params):
            p[:] = layer_params[j].get_value(borrow=True,
                                             return_internal_type=True)

    def funcs(out, params):
        # Forward function plus gradient of an MSE cost on `out`.
        fn = theano.function([X, h0, c0], out, mode=mode_with_gpu)
        cost = T.mean((Y - out)**2)
        grad = T.grad(cost, [X, h0, c0] + params)
        grad_fn = theano.function([X, Y, h0, c0], grad, mode=mode_with_gpu)
        return fn, grad_fn

    ref_fn, ref_grad_fn = funcs(last_layer.output(),
                                model.get_params())
    cudnn_fn, cudnn_grad_fn = funcs(rnnb.apply(params_cudnn, X, h0, c0)[0],
                                    [params_cudnn])

    x_val = numpy.random.random((timesteps, batch_size, input_dim)).astype(theano.config.floatX)
    y_val = numpy.random.random((timesteps, batch_size, hidden_dim)).astype(theano.config.floatX)
    h0_val = numpy.random.random((depth, batch_size, hidden_dim)).astype(theano.config.floatX)
    c0_val = numpy.random.random((depth, batch_size, hidden_dim)).astype(theano.config.floatX)

    ref_out = ref_fn(x_val, h0_val, c0_val)
    cudnn_out = cudnn_fn(x_val, h0_val, c0_val)
    utt.assert_allclose(ref_out, cudnn_out)

    ref_grads = ref_grad_fn(x_val, y_val, h0_val, c0_val)
    cudnn_grads = cudnn_grad_fn(x_val, y_val, h0_val, c0_val)
    utt.assert_allclose(ref_grads[0], cudnn_grads[0])  # dX
    utt.assert_allclose(ref_grads[1], cudnn_grads[1])  # dh0
    utt.assert_allclose(ref_grads[2], cudnn_grads[2])  # dc0
    # The cuDNN parameter gradient is one flat vector; split it per
    # layer and compare with the per-parameter reference gradients.
    ref_grads_params = ref_grads[3:]
    cudnn_grads_params = gpuarray_shared_constructor(cudnn_grads[3])
    for i in range(depth):
        cudnn_grads_layer = rnnb.split_params(cudnn_grads_params, i,
                                              [batch_size, input_dim])
        ref_grads_layer = ref_grads_params[i * len(cudnn_grads_layer):
                                           (i + 1) * len(cudnn_grads_layer)]
        for j, g in enumerate(cudnn_grads_layer):
            utt.assert_allclose(ref_grads_layer[j], g)
def test_dnn_rnn_lstm_grad_c():
    # Check gradients flowing through the final cell states (cy) of the
    # cuDNN LSTM against the theano.scan reference implementation.

    # test params
    input_dim = 32
    hidden_dim = 16
    batch_size = 2
    depth = 3
    timesteps = 5

    # test code
    X = T.tensor3('X')
    CY = T.tensor3('CY')  # target for the per-layer final cell states
    h0 = T.tensor3('h0')
    c0 = T.tensor3('c0')

    rnnb = dnn.RNNBlock(theano.config.floatX, hidden_dim, depth, 'lstm')
    psize = rnnb.get_param_size([batch_size, input_dim])
    # Single flat buffer holding all cuDNN parameters for every layer.
    params_cudnn = gpuarray_shared_constructor(
        numpy.zeros((psize,), dtype=theano.config.floatX))

    model = Model()
    last_layer = WrapperLayer(X)
    last_dim = input_dim
    for i in range(depth):
        lstm = LSTM(last_dim, hidden_dim, last_layer, s0=h0[i, :, :], c0=c0[i, :, :])
        model.add_layer(lstm)
        last_layer = lstm
        last_dim = hidden_dim
        # Copy the reference layer parameters into the matching slices
        # of the flat cuDNN buffer so both implementations start equal.
        layer_params = lstm.get_params()
        dnn_params = rnnb.split_params(params_cudnn, i,
                                       [batch_size, input_dim])
        for j, p in enumerate(dnn_params):
            p[:] = layer_params[j].get_value(borrow=True,
                                             return_internal_type=True)

    def funcs(out, params):
        # Gradient of an MSE cost on `out` w.r.t. input, states, params.
        cost = T.mean((CY - out)**2)
        grad = T.grad(cost, [X, h0, c0] + params)
        grad_fn = theano.function([X, CY, h0, c0], grad, mode=mode_with_gpu)
        return grad_fn

    _, _, cy = rnnb.apply(params_cudnn, X, h0, c0)
    # Final cell state of each layer from the scan implementation.
    ref_cy = T.stack([model.layers[0].C[-1],
                      model.layers[1].C[-1],
                      model.layers[2].C[-1]])

    ref_grad_fn = funcs(ref_cy, model.get_params())
    cudnn_grad_fn = funcs(cy, [params_cudnn])

    x_val = numpy.random.random((timesteps, batch_size, input_dim)).astype(theano.config.floatX)
    cy_val = numpy.random.random((depth, batch_size, hidden_dim)).astype(theano.config.floatX)
    h0_val = numpy.random.random((depth, batch_size, hidden_dim)).astype(theano.config.floatX)
    c0_val = numpy.random.random((depth, batch_size, hidden_dim)).astype(theano.config.floatX)

    ref_grads = ref_grad_fn(x_val, cy_val, h0_val, c0_val)
    cudnn_grads = cudnn_grad_fn(x_val, cy_val, h0_val, c0_val)
    utt.assert_allclose(ref_grads[0], cudnn_grads[0])  # dX
    utt.assert_allclose(ref_grads[1], cudnn_grads[1])  # dh0
    utt.assert_allclose(ref_grads[2], cudnn_grads[2])  # dc0
    # The cuDNN parameter gradient is one flat vector; split it per
    # layer and compare with the per-parameter reference gradients.
    ref_grads_params = ref_grads[3:]
    cudnn_grads_params = gpuarray_shared_constructor(cudnn_grads[3])
    for i in range(depth):
        cudnn_grads_layer = rnnb.split_params(cudnn_grads_params, i,
                                              [batch_size, input_dim])
        ref_grads_layer = ref_grads_params[i * len(cudnn_grads_layer):
                                           (i + 1) * len(cudnn_grads_layer)]
        for j, g in enumerate(cudnn_grads_layer):
            utt.assert_allclose(ref_grads_layer[j], g)
...@@ -68,6 +68,7 @@ def reg_context(name, ctx): ...@@ -68,6 +68,7 @@ def reg_context(name, ctx):
if not isinstance(ctx, gpuarray.GpuContext): if not isinstance(ctx, gpuarray.GpuContext):
raise TypeError("context is not GpuContext") raise TypeError("context is not GpuContext")
_context_reg[name] = ctx _context_reg[name] = ctx
_props_map[ctx] = dict()
def get_context(name): def get_context(name):
...@@ -96,6 +97,26 @@ def list_contexts(): ...@@ -96,6 +97,26 @@ def list_contexts():
""" """
return _context_reg.keys() return _context_reg.keys()
# Mappings of properties to contexts. Please never use this if you
# can avoid it.
# This is basically a way to store "global" variables that depend on
# the context.
_props_map = {}
def _get_props(name):
ctx = get_context(name)
return _props_map[ctx]
def get_prop(name, k):
return _get_props(name)[k]
def set_prop(name, k, v):
_get_props(name)[k] = v
# Private method # Private method
def _name_for_ctx(ctx): def _name_for_ctx(ctx):
......
...@@ -1102,7 +1102,8 @@ def _populate_grad_dict(var_to_app_to_idx, ...@@ -1102,7 +1102,8 @@ def _populate_grad_dict(var_to_app_to_idx,
str(o_shape) + " on an output of shape " + str(o_shape) + " on an output of shape " +
str(g_shape)) str(g_shape))
input_grads = node.op.grad(inputs, new_output_grads) input_grads = node.op.L_op(inputs, node.outputs,
new_output_grads)
if input_grads is None: if input_grads is None:
raise TypeError("%s.grad returned NoneType, " raise TypeError("%s.grad returned NoneType, "
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论