提交 3a59bd8c authored 作者: Arnaud Bergeron's avatar Arnaud Bergeron

Major adaptation to handle explicit context activation.

上级 babe6f1b
...@@ -44,7 +44,7 @@ def dnn_available(): ...@@ -44,7 +44,7 @@ def dnn_available():
return False return False
# This is a hack because bin_id is in the from of # This is a hack because bin_id is in the from of
# "sm_<major><minor>" for cuda devices. # "sm_<major><minor>" for cuda devices.
if pygpu.get_default_context().bin_id < 'sm_30': if pygpu.get_default_context().bin_id[:-2] < '30':
dnn_available.msg = "Device not supported by cuDNN" dnn_available.msg = "Device not supported by cuDNN"
dnn_available.avail = False dnn_available.avail = False
preambule = """ preambule = """
...@@ -81,7 +81,13 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) { ...@@ -81,7 +81,13 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
str(err)) str(err))
else: else:
# If we can compile, check that we can import and run. # If we can compile, check that we can import and run.
if version() == 20: v = version()
if v == -1:
dnn_available.avail = False
dnn_available.msg = (
"You have CuDNN v1 installed, upgrade to v2 or more recent.")
raise RuntimeError(dnn_available.msg)
if v == 20:
dnn_available.avail = False dnn_available.avail = False
dnn_available.msg = ( dnn_available.msg = (
"You have installed a release candidate of CuDNN v2." "You have installed a release candidate of CuDNN v2."
...@@ -90,53 +96,10 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) { ...@@ -90,53 +96,10 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
raise RuntimeError(dnn_available.msg) raise RuntimeError(dnn_available.msg)
return dnn_available.avail return dnn_available.avail
dnn_available.avail = None dnn_available.avail = None
dnn_available.msg = None dnn_available.msg = None
def c_set_tensor4d(var, desc, err, fail):
return """
{
cudnnDataType_t dt;
size_t ds;
switch (%(var)s->ga.typecode) {
case GA_FLOAT:
dt = CUDNN_DATA_FLOAT;
break;
case GA_DOUBLE:
dt = CUDNN_DATA_DOUBLE;
break;
default:
PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_tensor4d");
return -1;
}
ds = gpuarray_get_elsize(%(var)s->ga.typecode);
int str0, str1, str2, str3;
// cudnn do not like 0s in strides
str3 = PyGpuArray_STRIDES(%(var)s)[3]?PyGpuArray_STRIDES(%(var)s)[3]/ds:1;
str2 = PyGpuArray_STRIDES(%(var)s)[2]?PyGpuArray_STRIDES(%(var)s)[2]/ds:PyGpuArray_DIMS(%(var)s)[3];
str1 = PyGpuArray_STRIDES(%(var)s)[1]?PyGpuArray_STRIDES(%(var)s)[1]/ds:PyGpuArray_DIMS(%(var)s)[2]*PyGpuArray_DIMS(%(var)s)[3];
str0 = PyGpuArray_STRIDES(%(var)s)[0]?PyGpuArray_STRIDES(%(var)s)[0]/ds:PyGpuArray_DIMS(%(var)s)[2]*PyGpuArray_DIMS(%(var)s)[3]*PyGpuArray_DIMS(%(var)s)[1];
%(err)s = cudnnSetTensor4dDescriptorEx(
%(desc)s, dt,
PyGpuArray_DIMS(%(var)s)[0],
PyGpuArray_DIMS(%(var)s)[1],
PyGpuArray_DIMS(%(var)s)[2],
PyGpuArray_DIMS(%(var)s)[3],
str0, str1, str2, str3);
if (%(err)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"could not set tensor4d descriptor: %%s",
cudnnGetErrorString(%(err)s));
%(fail)s
}
}
""" % dict(var=var, err=err, desc=desc, fail=fail)
class DnnBase(COp): class DnnBase(COp):
""" """
Creates a handle for cudnn and pulls in the cudnn libraries and headers. Creates a handle for cudnn and pulls in the cudnn libraries and headers.
...@@ -146,13 +109,15 @@ class DnnBase(COp): ...@@ -146,13 +109,15 @@ class DnnBase(COp):
# the input broadcasting pattern. # the input broadcasting pattern.
check_broadcast = False check_broadcast = False
def __init__(self): def __init__(self, files=None, c_func=None):
COp.__init__(self, "dnn_base.c") if files is None:
files = []
COp.__init__(self, ["dnn_base.c"] + files, c_func)
def c_headers(self): def c_headers(self):
return ['cudnn.h', 'cudnn_helper.h', 'gpuarray_helper.h', return ['cudnn.h', 'cudnn_helper.h', 'gpuarray_helper.h',
'gpuarray/types.h', 'gpuarray/array.h', 'gpuarray/util.h', 'gpuarray/types.h', 'gpuarray/array.h', 'gpuarray/util.h',
'gpuarray_api.h', 'numpy_compat.h'] 'gpuarray/ext_cuda.h', 'gpuarray_api.h', 'numpy_compat.h']
def c_header_dirs(self): def c_header_dirs(self):
return [os.path.dirname(__file__), pygpu.get_include(), return [os.path.dirname(__file__), pygpu.get_include(),
...@@ -164,6 +129,9 @@ class DnnBase(COp): ...@@ -164,6 +129,9 @@ class DnnBase(COp):
def c_lib_dirs(self): def c_lib_dirs(self):
return [config.dnn.library_path] return [config.dnn.library_path]
def c_code_cache_version(self):
return (super(DnnBase, self).c_code_cache_version(), version())
class DnnVersion(Op): class DnnVersion(Op):
__props__ = () __props__ = ()
...@@ -320,6 +288,9 @@ class GpuDnnConvDesc(COp): ...@@ -320,6 +288,9 @@ class GpuDnnConvDesc(COp):
('CONV_MODE', conv_flag), ('CONV_MODE', conv_flag),
('SUB_0', sub0), ('SUB_1', sub1), ('SUB_2', sub2)] ('SUB_0', sub0), ('SUB_1', sub1), ('SUB_2', sub2)]
def c_code_cache_version(self):
return (super(GpuDnnConvDesc, self).c_code_cache_version(), version())
# scalar constants # scalar constants
_zero = constant(numpy.asarray(0.0, dtype='float64')) _zero = constant(numpy.asarray(0.0, dtype='float64'))
_one = constant(numpy.asarray(1.0, dtype='float64')) _one = constant(numpy.asarray(1.0, dtype='float64'))
...@@ -339,7 +310,7 @@ def ensure_dt(val, default, name, dtype): ...@@ -339,7 +310,7 @@ def ensure_dt(val, default, name, dtype):
return val return val
class GpuDnnConv(DnnBase, COp): class GpuDnnConv(DnnBase):
""" """
The forward convolution. The forward convolution.
...@@ -357,8 +328,8 @@ class GpuDnnConv(DnnBase, COp): ...@@ -357,8 +328,8 @@ class GpuDnnConv(DnnBase, COp):
__props__ = ('algo', 'inplace') __props__ = ('algo', 'inplace')
def __init__(self, algo=None, inplace=False): def __init__(self, algo=None, inplace=False):
COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_fwd.c"], DnnBase.__init__(self, ["dnn_conv_base.c", "dnn_fwd.c"],
"APPLY_SPECIFIC(conv_fwd)") "APPLY_SPECIFIC(conv_fwd)")
if algo is None: if algo is None:
algo = config.dnn.conv.algo_fwd algo = config.dnn.conv.algo_fwd
...@@ -521,7 +492,7 @@ class GpuDnnConv(DnnBase, COp): ...@@ -521,7 +492,7 @@ class GpuDnnConv(DnnBase, COp):
return [shape[2]] return [shape[2]]
class GpuDnnConvGradW(DnnBase, COp): class GpuDnnConvGradW(DnnBase):
""" """
The convolution gradient with respect to the weights. The convolution gradient with respect to the weights.
...@@ -537,7 +508,7 @@ class GpuDnnConvGradW(DnnBase, COp): ...@@ -537,7 +508,7 @@ class GpuDnnConvGradW(DnnBase, COp):
__props__ = ('algo', 'inplace') __props__ = ('algo', 'inplace')
def __init__(self, inplace=False, algo=None): def __init__(self, inplace=False, algo=None):
COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_gw.c"], DnnBase.__init__(self, ["dnn_conv_base.c", "dnn_gw.c"],
"APPLY_SPECIFIC(conv_gw)") "APPLY_SPECIFIC(conv_gw)")
self.inplace = inplace self.inplace = inplace
if self.inplace: if self.inplace:
...@@ -652,8 +623,8 @@ class GpuDnnConvGradI(DnnBase): ...@@ -652,8 +623,8 @@ class GpuDnnConvGradI(DnnBase):
__props__ = ('algo', 'inplace',) __props__ = ('algo', 'inplace',)
def __init__(self, inplace=False, algo=None): def __init__(self, inplace=False, algo=None):
COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_gi.c"], DnnBase.__init__(self, ["dnn_conv_base.c", "dnn_gi.c"],
"APPLY_SPECIFIC(conv_gi)") "APPLY_SPECIFIC(conv_gi)")
self.inplace = inplace self.inplace = inplace
if self.inplace: if self.inplace:
self.destroy_map = {0: [2]} self.destroy_map = {0: [2]}
...@@ -968,6 +939,9 @@ class GpuDnnPool(DnnBase): ...@@ -968,6 +939,9 @@ class GpuDnnPool(DnnBase):
__props__ = () __props__ = ()
def __init__(self):
DnnBase.__init__(self, ["dnn_pool.c"], "APPLY_SPECIFIC(dnn_pool)")
def make_node(self, img, desc): def make_node(self, img, desc):
img = as_gpuarray_variable(img) img = as_gpuarray_variable(img)
...@@ -995,102 +969,6 @@ class GpuDnnPool(DnnBase): ...@@ -995,102 +969,6 @@ class GpuDnnPool(DnnBase):
res.append((shape[0][4] + 2 * p[2] - w[2]) // s[2] + 1) res.append((shape[0][4] + 2 * p[2] - w[2]) // s[2] + 1)
return [res] return [res]
def c_support_code_struct(self, node, name):
return """
cudnnTensorDescriptor_t input%(name)s;
cudnnTensorDescriptor_t output%(name)s;
""" % dict(name=name)
def c_init_code_struct(self, node, name, sub):
return """
cudnnStatus_t err%(name)s;
input%(name)s = NULL;
output%(name)s = NULL;
if ((err%(name)s = cudnnCreateTensorDescriptor(&input%(name)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
"(inp): %%s", cudnnGetErrorString(err%(name)s));
%(fail)s
}
if ((err%(name)s = cudnnCreateTensorDescriptor(&output%(name)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
"(out): %%s", cudnnGetErrorString(err%(name)s));
%(fail)s
}
""" % dict(name=name, fail=sub['fail'])
def c_cleanup_code_struct(self, node, name):
return """
if (input%(name)s != NULL) { cudnnDestroyTensorDescriptor(input%(name)s); }
if (output%(name)s != NULL) { cudnnDestroyTensorDescriptor(output%(name)s); }
""" % dict(name=name)
def c_code(self, node, name, inputs, outputs, sub):
desc = inputs[1]
out, = outputs
return """
cudnnStatus_t err%(name)s;
size_t %(out)s_dims[5];
if (!GpuArray_IS_C_CONTIGUOUS(&%(input)s->ga)) {
PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
%(fail)s
}
if (c_set_tensorNd(%(input)s, %(input_desc)s) != 0)
%(fail)s
cudnnPoolingMode_t mode;
int w[3];
int p[3];
int s[3];
int ndims;
err%(name)s = cudnnGetPoolingNdDescriptor(%(desc)s, 3, &mode, &ndims, w, p, s);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error doing cudnnGetPoolingDescriptor operation: %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
%(out)s_dims[0] = PyGpuArray_DIM(%(input)s, 0);
%(out)s_dims[1] = PyGpuArray_DIM(%(input)s, 1);
%(out)s_dims[2] = (PyGpuArray_DIM(%(input)s, 2) + (p[0]*2) - w[0]) / s[0] + 1;
%(out)s_dims[3] = (PyGpuArray_DIM(%(input)s, 3) + (p[1]*2) - w[1]) / s[1] + 1;
if (ndims == 3)
%(out)s_dims[4] = (PyGpuArray_DIM(%(input)s, 4) + (p[2]*2) - w[2]) / s[2] + 1;
if (theano_prep_output(&%(out)s, ndims+2, %(out)s_dims, %(input)s->ga.typecode,
GA_C_ORDER, pygpu_default_context()) != 0) {
%(fail)s
}
if (c_set_tensorNd(%(out)s, %(output_desc)s) != 0)
%(fail)s
{
const float alpha = 1;
const float beta = 0;
err%(name)s = cudnnPoolingForward(
_handle, %(desc)s,
&alpha,
%(input_desc)s, PyGpuArray_DEV_DATA(%(input)s),
&beta,
%(output_desc)s, PyGpuArray_DEV_DATA(%(out)s));
}
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnPool: error doing cudnnPoolingForward operation: %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
""" % dict(out=out, desc=desc, fail=sub['fail'],
name=name, input=inputs[0],
input_desc="input" + name,
output_desc="output" + name)
def grad(self, inp, grads): def grad(self, inp, grads):
img, desc = inp img, desc = inp
grad, = grads grad, = grads
...@@ -1107,9 +985,6 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) { ...@@ -1107,9 +985,6 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
# not connected to desc # not connected to desc
return [[1], [0]] return [[1], [0]]
def c_code_cache_version(self):
return (8, version())
class GpuDnnPoolGrad(DnnBase): class GpuDnnPoolGrad(DnnBase):
""" """
...@@ -1130,16 +1005,20 @@ class GpuDnnPoolGrad(DnnBase): ...@@ -1130,16 +1005,20 @@ class GpuDnnPoolGrad(DnnBase):
__props__ = () __props__ = ()
def make_node(self, inp, out, inp_grad, desc): def __init__(self):
DnnBase.__init__(self, ["dnn_pool_grad.c"],
"APPLY_SPECIFIC(dnn_pool_grad)")
def make_node(self, inp, out, out_grad, desc):
nd = desc.owner.op.get_ndim() + 2 nd = desc.owner.op.get_ndim() + 2
inp = as_gpuarray_variable(inp) inp = as_gpuarray_variable(inp)
if inp.type.ndim != nd: if inp.type.ndim != nd:
raise TypeError('inp must be %dD tensor' % (nd,)) raise TypeError('inp must be %dD tensor' % (nd,))
inp_grad = as_gpuarray_variable(inp_grad) out_grad = as_gpuarray_variable(out_grad)
if inp_grad.type.ndim != nd: if out_grad.type.ndim != nd:
raise TypeError('inp_grad must be %dD tensor' % (nd,)) raise TypeError('out_grad must be %dD tensor' % (nd,))
out = as_gpuarray_variable(out) out = as_gpuarray_variable(out)
if out.type.ndim != nd: if out.type.ndim != nd:
...@@ -1149,126 +1028,7 @@ class GpuDnnPoolGrad(DnnBase): ...@@ -1149,126 +1028,7 @@ class GpuDnnPoolGrad(DnnBase):
desc.type.ctype != 'cudnnPoolingDescriptor_t'): desc.type.ctype != 'cudnnPoolingDescriptor_t'):
raise TypeError('desc must be cudnnPoolingDescriptor_t') raise TypeError('desc must be cudnnPoolingDescriptor_t')
return Apply(self, [inp, out, inp_grad, desc], [inp.type()]) return Apply(self, [inp, out, out_grad, desc], [inp.type()])
def c_support_code_struct(self, node, name):
return """
cudnnTensorDescriptor_t input%(name)s;
cudnnTensorDescriptor_t input_grad%(name)s;
cudnnTensorDescriptor_t output%(name)s;
cudnnTensorDescriptor_t output_grad%(name)s;
""" % dict(name=name)
def c_init_code_struct(self, node, name, sub):
return """
cudnnStatus_t err%(name)s;
input%(name)s = NULL;
input_grad%(name)s = NULL;
output%(name)s = NULL;
output_grad%(name)s = NULL;
if ((err%(name)s = cudnnCreateTensorDescriptor(&input%(name)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError,
"could not allocate tensor descriptor (input): %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
if ((err%(name)s = cudnnCreateTensorDescriptor(&input_grad%(name)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError,
"could not allocate tensor descriptor (input_grad): %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
if ((err%(name)s = cudnnCreateTensorDescriptor(&output%(name)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError,
"could not allocate tensor descriptor (output): %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
if ((err%(name)s = cudnnCreateTensorDescriptor(&output_grad%(name)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError,
"could not allocate tensor descriptor (output_grad): %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
""" % dict(name=name, fail=sub['fail'])
def c_cleanup_code_struct(self, node, name):
return """
if (input%(name)s != NULL) { cudnnDestroyTensorDescriptor(input%(name)s); }
if (input_grad%(name)s != NULL) { cudnnDestroyTensorDescriptor(input_grad%(name)s); }
if (output%(name)s != NULL) { cudnnDestroyTensorDescriptor(output%(name)s); }
if (output_grad%(name)s != NULL) { cudnnDestroyTensorDescriptor(output_grad%(name)s); }
""" % dict(name=name)
def c_code(self, node, name, inputs, outputs, sub):
# Here the name out and inp are based on the cudnn definition.
# Not the definition of this class.
# This make it complicated.
out, inp, inp_grad, desc = inputs
out_grad, = outputs
return """
cudnnStatus_t err%(name)s;
if (!GpuArray_IS_C_CONTIGUOUS(&%(input)s->ga)) {
PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
%(fail)s
}
if (!GpuArray_IS_C_CONTIGUOUS(&%(input_grad)s->ga)) {
PyErr_SetString(PyExc_ValueError,
"Only contiguous input gradients are supported.");
%(fail)s
}
if (!GpuArray_IS_C_CONTIGUOUS(&%(output)s->ga)) {
PyErr_SetString(PyExc_ValueError, "Only contiguous outputs are supported.");
%(fail)s
}
if (c_set_tensorNd(%(input)s, %(input_desc)s) != 0)
%(fail)s
if (c_set_tensorNd(%(input_grad)s, %(input_grad_desc)s) != 0)
%(fail)s
if (c_set_tensorNd(%(output)s, %(output_desc)s) != 0)
%(fail)s
if (theano_prep_output(&%(output_grad)s, PyGpuArray_NDIM(%(output)s),
PyGpuArray_DIMS(%(output)s), %(output)s->ga.typecode,
GA_C_ORDER, pygpu_default_context()) != 0) {
%(fail)s
}
if (c_set_tensorNd(%(output_grad)s, %(output_grad_desc)s) != 0)
%(fail)s
{
const float alpha = 1;
const float beta = 0;
err%(name)s = cudnnPoolingBackward(
_handle, %(desc)s,
&alpha,
%(input_desc)s, PyGpuArray_DEV_DATA(%(input)s),
%(input_grad_desc)s, PyGpuArray_DEV_DATA(%(input_grad)s),
%(output_desc)s, PyGpuArray_DEV_DATA(%(output)s),
&beta,
%(output_grad_desc)s, PyGpuArray_DEV_DATA(%(output_grad)s)
);
}
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error doing operation: %%s.",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
""" % dict(output_grad=out_grad, desc=desc, fail=sub['fail'],
name=name, input=inp, input_grad=inp_grad, output=out,
input_desc="input" + name,
input_grad_desc="input_grad" + name,
output_desc="output" + name,
output_grad_desc="output_grad" + name)
def c_code_cache_version(self):
return (6, version())
def infer_shape(self, node, shape): def infer_shape(self, node, shape):
return [shape[0]] return [shape[0]]
...@@ -1330,7 +1090,7 @@ class GpuDnnSoftmaxBase(DnnBase): ...@@ -1330,7 +1090,7 @@ class GpuDnnSoftmaxBase(DnnBase):
__props__ = ('mode', 'algo') __props__ = ('mode', 'algo')
def __init__(self, _, algo, mode): def __init__(self, _, algo, mode):
DnnBase.__init__(self) DnnBase.__init__(self, [self.file], self.c_func)
assert(algo in ('fast', 'accurate', 'log')) assert(algo in ('fast', 'accurate', 'log'))
if algo == 'log' and version() < 3000: if algo == 'log' and version() < 3000:
...@@ -1340,62 +1100,13 @@ class GpuDnnSoftmaxBase(DnnBase): ...@@ -1340,62 +1100,13 @@ class GpuDnnSoftmaxBase(DnnBase):
assert(mode in ('instance', 'channel')) assert(mode in ('instance', 'channel'))
self.mode = mode self.mode = mode
self.tensor_descs = [softmax_input
for softmax_input in self.softmax_inputs]
self.tensor_descs.append('softmax_output')
def infer_shape(self, node, shape): def infer_shape(self, node, shape):
if self.direction == 'forward': if self.direction == 'forward':
return [shape[0]] return [shape[0]]
else: else:
return [shape[1]] return [shape[1]]
def _define_tensor_desc(self, name, id): def get_op_params(self):
return """
cudnnTensorDescriptor_t %(id)s_%(name)s;
""" % dict(name=name, id=id)
def _init_tensor_desc(self, name, id, fail):
return """
%(id)s_%(name)s = NULL;
if ((err%(name)s = cudnnCreateTensorDescriptor(&%(id)s_%(name)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor : %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
""" % dict(name=name, id=id, fail=fail)
def _clean_tensor_desc(self, name, id):
return """
if(%(id)s_%(name)s!= NULL)
cudnnDestroyTensorDescriptor(%(id)s_%(name)s);
""" % dict(name=name, id=id)
def c_support_code_struct(self, node, name):
result = ''
for id in self.tensor_descs:
result += self._define_tensor_desc(name, id)
return result
def c_init_code_struct(self, node, name, sub):
result = """
cudnnStatus_t err%(name)s;
""" % dict(name=name)
for id in self.tensor_descs:
result += self._init_tensor_desc(name, id, sub['fail'])
return result
def c_cleanup_code_struct(self, node, name):
result = ''
for id in self.tensor_descs:
result += self._clean_tensor_desc(name, id)
return result
def c_code(self, node, name, inputs, outputs, sub):
ins = inputs
outs, = outputs
if self.mode == 'instance': if self.mode == 'instance':
mode = "CUDNN_SOFTMAX_MODE_INSTANCE" mode = "CUDNN_SOFTMAX_MODE_INSTANCE"
else: else:
...@@ -1408,49 +1119,7 @@ cudnnStatus_t err%(name)s; ...@@ -1408,49 +1119,7 @@ cudnnStatus_t err%(name)s;
else: else:
algo = "CUDNN_SOFTMAX_ACCURATE" algo = "CUDNN_SOFTMAX_ACCURATE"
result = ['cudnnStatus_t err%s;' % (name,)] return [("SOFTMAX_MODE", mode), ("SOFTMAX_ALGO", algo)]
# Validate the input and build the input variables.
for input_idx, input_name in enumerate(self.softmax_inputs):
result.append("""
if (c_set_tensorNd(%(t)s, %(desc)s) != 0)
%(fail)s
""" % dict(t=ins[input_idx], desc=input_name + "_" + name, fail=sub['fail']))
subs = dict(ins=ins[-1], outs=outs, fail=sub['fail'],
name=name, algo=algo, mode=mode)
for idx, softmax_input in enumerate(self.softmax_inputs):
subs['name%d' % idx] = softmax_input
subs['ins%d' % idx] = inputs[idx]
# Build and prepare the output variable.
result.append("""
if (theano_prep_output(&%(outs)s, PyGpuArray_NDIM(%(ins)s),
PyGpuArray_DIMS(%(ins)s), %(ins)s->ga.typecode,
GA_C_ORDER, pygpu_default_context()) != 0)
{
%(fail)s
}
if (c_set_tensorNd(%(outs)s, softmax_output_%(name)s) != 0)
%(fail)s
""" % subs)
# Add on a call to the method that does the actual work.
result.append(self.method() % subs)
result.append("""if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error during operation: %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}""" % subs)
return '\n'.join(result)
def c_code_cache_version(self):
return (1, version())
def method(self):
raise NotImplementedError('GpuDnnSoftmaxBase::method')
class GpuDnnSoftmax(GpuDnnSoftmaxBase): class GpuDnnSoftmax(GpuDnnSoftmaxBase):
...@@ -1468,34 +1137,15 @@ class GpuDnnSoftmax(GpuDnnSoftmaxBase): ...@@ -1468,34 +1137,15 @@ class GpuDnnSoftmax(GpuDnnSoftmaxBase):
image across 'c'. image across 'c'.
""" """
direction = "forward"
direction = 'forward' file = "dnn_softmax.c"
softmax_inputs = ['softmax_input'] c_func = "APPLY_SPECIFIC(softmax)"
def make_node(self, x): def make_node(self, x):
x = as_gpuarray_variable(x) x = as_gpuarray_variable(x)
assert x.ndim == 4 assert x.ndim == 4
return Apply(self, [x], [x.type()]) return Apply(self, [x], [x.type()])
def method(self):
return """
{
const float alpha = 1.;
const float beta = 0.;
err%(name)s = cudnnSoftmaxForward(
_handle,
%(algo)s,
%(mode)s,
(void*) &alpha,
softmax_input_%(name)s,
PyGpuArray_DEV_DATA(%(ins)s),
(void*) &beta,
softmax_output_%(name)s,
PyGpuArray_DEV_DATA(%(outs)s)
);
}
"""
def grad(self, inp, grads): def grad(self, inp, grads):
x, = inp x, = inp
g_sm, = grads g_sm, = grads
...@@ -1525,7 +1175,8 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase): ...@@ -1525,7 +1175,8 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
""" """
direction = 'backward' direction = 'backward'
softmax_inputs = ['softmax_gout', 'softmax_input'] file = "dnn_softmax_grad.c"
c_func = "APPLY_SPECIFIC(softmax_grad)"
def make_node(self, dy, sm): def make_node(self, dy, sm):
dy = as_gpuarray_variable(dy) dy = as_gpuarray_variable(dy)
...@@ -1534,27 +1185,6 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase): ...@@ -1534,27 +1185,6 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
assert sm.ndim == 4 assert sm.ndim == 4
return Apply(self, [dy, sm], [sm.type()]) return Apply(self, [dy, sm], [sm.type()])
def method(self):
return """
{
const float alpha = 1.;
const float beta = 0.;
err%(name)s = cudnnSoftmaxBackward(
_handle,
%(algo)s,
%(mode)s,
(void*) &alpha,
%(name1)s_%(name)s,
PyGpuArray_DEV_DATA(%(ins1)s),
%(name0)s_%(name)s,
PyGpuArray_DEV_DATA(%(ins0)s),
(void*) &beta,
softmax_output_%(name)s,
PyGpuArray_DEV_DATA(%(outs)s)
);
}
"""
# @register_opt('cudnn') # this optimizer is registered in opt.py instead. # @register_opt('cudnn') # this optimizer is registered in opt.py instead.
@local_optimizer([GpuConv]) @local_optimizer([GpuConv])
...@@ -1717,7 +1347,7 @@ def local_pool_dnn_grad_stride(node): ...@@ -1717,7 +1347,7 @@ def local_pool_dnn_grad_stride(node):
return return
if not node.op.ignore_border: if not node.op.ignore_border:
return return
inp, out, inp_grad = node.inputs inp, out, out_grad = node.inputs
ds = node.op.ds ds = node.op.ds
st = node.op.st st = node.op.st
pad = node.op.padding pad = node.op.padding
...@@ -1726,7 +1356,7 @@ def local_pool_dnn_grad_stride(node): ...@@ -1726,7 +1356,7 @@ def local_pool_dnn_grad_stride(node):
desc = GpuDnnPoolDesc(ws=ds, stride=st, mode=mode, pad=pad)() desc = GpuDnnPoolDesc(ws=ds, stride=st, mode=mode, pad=pad)()
return GpuDnnPoolGrad()(gpu_contiguous(inp), return GpuDnnPoolGrad()(gpu_contiguous(inp),
gpu_contiguous(out), gpu_contiguous(out),
gpu_contiguous(inp_grad), gpu_contiguous(out_grad),
desc) desc)
...@@ -1737,18 +1367,19 @@ def local_avg_pool_dnn_grad_stride(node): ...@@ -1737,18 +1367,19 @@ def local_avg_pool_dnn_grad_stride(node):
return return
if not node.op.ignore_border: if not node.op.ignore_border:
return return
inp, inp_grad = node.inputs inp, out_grad = node.inputs
ds = node.op.ds ds = node.op.ds
st = node.op.st st = node.op.st
pad = node.op.padding pad = node.op.padding
mode = node.op.mode mode = node.op.mode
cg = gpu_contiguous(out_grad)
desc = GpuDnnPoolDesc(ws=ds, stride=st, mode=mode, pad=pad)() desc = GpuDnnPoolDesc(ws=ds, stride=st, mode=mode, pad=pad)()
contiguous_inp_grad = gpu_contiguous(inp_grad) # We reuse cg because CuDNN does not use the value of the `out`
return GpuDnnPoolGrad()(gpu_contiguous(inp), # argument but still checks its shape for average pooling. This
contiguous_inp_grad, # has been observed in v2 and v3 as far as I know.
contiguous_inp_grad, return GpuDnnPoolGrad()(gpu_contiguous(inp), cg, cg, desc)
desc)
@register_opt('cudnn') @register_opt('cudnn')
......
#section support_code #section support_code
static cudnnHandle_t _handle = NULL;
static int static int
c_set_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) { c_set_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) {
...@@ -99,15 +98,21 @@ c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) { ...@@ -99,15 +98,21 @@ c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) {
#section init_code #section init_code
{ setup_ext_cuda();
cudnnStatus_t err;
if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) { #section support_code_struct
PyErr_Format(PyExc_RuntimeError, "could not create cuDNN handle: %s",
cudnnGetErrorString(err)); cudnnHandle_t _handle;
#if PY_MAJOR_VERSION >= 3
return NULL; #section init_code_struct
#else
return; cuda_enter(pygpu_default_context()->ctx);
#endif cudnnStatus_t err;
} _handle = NULL;
if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not create cuDNN handle: %s",
cudnnGetErrorString(err));
cuda_exit(pygpu_default_context()->ctx);
FAIL;
} }
cuda_exit(pygpu_default_context()->ctx);
...@@ -10,6 +10,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -10,6 +10,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
float af = alpha, bf = beta; float af = alpha, bf = beta;
void *alpha_p; void *alpha_p;
void *beta_p; void *beta_p;
PyGpuContextObject *c = pygpu_default_context();
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) { if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
...@@ -43,8 +44,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -43,8 +44,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
Py_INCREF(*output); Py_INCREF(*output);
#else #else
if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om), if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om),
om->ga.typecode, GA_C_ORDER, om->ga.typecode, GA_C_ORDER, c) != 0)
pygpu_default_context()) != 0)
return 1; return 1;
if (beta != 0.0 && pygpu_move(*output, om)) if (beta != 0.0 && pygpu_move(*output, om))
return 1; return 1;
...@@ -55,6 +55,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -55,6 +55,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cudnnConvolutionFwdAlgo_t algo = CONV_ALGO; cudnnConvolutionFwdAlgo_t algo = CONV_ALGO;
cuda_enter(c->ctx);
#ifdef CHOOSE_ALGO #ifdef CHOOSE_ALGO
/* Static variables are only initialized once so this will not /* Static variables are only initialized once so this will not
* reset the previous algo every time */ * reset the previous algo every time */
...@@ -86,6 +87,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -86,6 +87,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s", "error selecting convolution algo: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1; return 1;
} }
algo = choice.algo; algo = choice.algo;
...@@ -96,6 +98,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -96,6 +98,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the " PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU: %s\n", "memory information on the GPU: %s\n",
cudaGetErrorString(err2)); cudaGetErrorString(err2));
cuda_exit(c->ctx);
return 1; return 1;
} }
...@@ -107,6 +110,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -107,6 +110,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s", "error selecting convolution algo: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1; return 1;
} }
#endif #endif
...@@ -145,6 +149,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -145,6 +149,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"error getting convolution properties: %s", "error getting convolution properties: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1; return 1;
} }
...@@ -167,6 +172,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -167,6 +172,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
"are padded such that the padded inputs are larger " "are padded such that the padded inputs are larger "
"than the kernels. Update your installation of CuDNN " "than the kernels. Update your installation of CuDNN "
"to V3 or more recent to solve the issue."); "to V3 or more recent to solve the issue.");
cuda_exit(c->ctx);
return 1; return 1;
} }
} }
...@@ -175,7 +181,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -175,7 +181,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
{ {
size_t worksize; size_t worksize;
gpudata *workspace; gpudata *workspace;
PyGpuContextObject *c;
err = cudnnGetConvolutionForwardWorkspaceSize(_handle, err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
APPLY_SPECIFIC(input), APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(kerns),
...@@ -187,6 +192,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -187,6 +192,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"error getting worksize: %s", "error getting worksize: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1; return 1;
} }
...@@ -196,11 +202,11 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -196,11 +202,11 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
* to place a nice get_work_mem() function in. * to place a nice get_work_mem() function in.
*/ */
if (worksize != 0) { if (worksize != 0) {
c = pygpu_default_context();
workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL); workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) { if (workspace == NULL) {
PyErr_SetString(PyExc_RuntimeError, PyErr_SetString(PyExc_RuntimeError,
"Could not allocate working memory"); "Could not allocate working memory");
cuda_exit(c->ctx);
return 1; return 1;
} }
} }
...@@ -218,6 +224,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -218,6 +224,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
if (worksize != 0) if (worksize != 0)
c->ops->buffer_release(workspace); c->ops->buffer_release(workspace);
} }
cuda_exit(c->ctx);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error doing operation: %s", PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
......
...@@ -9,6 +9,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -9,6 +9,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
float af = alpha, bf = beta; float af = alpha, bf = beta;
void *alpha_p; void *alpha_p;
void *beta_p; void *beta_p;
PyGpuContextObject *c = pygpu_default_context();
if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1]) { if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1]) {
PyErr_SetString(PyExc_ValueError, "images and kernel must have the same " PyErr_SetString(PyExc_ValueError, "images and kernel must have the same "
...@@ -42,8 +43,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -42,8 +43,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
Py_INCREF(*input); Py_INCREF(*input);
#else #else
if (theano_prep_output(input, PyGpuArray_NDIM(im), PyGpuArray_DIMS(im), if (theano_prep_output(input, PyGpuArray_NDIM(im), PyGpuArray_DIMS(im),
im->ga.typecode, GA_C_ORDER, im->ga.typecode, GA_C_ORDER, c) != 0)
pygpu_default_context()) != 0)
return 1; return 1;
if (beta != 0.0 && pygpu_move(*input, im)) if (beta != 0.0 && pygpu_move(*input, im))
return 1; return 1;
...@@ -54,6 +54,8 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -54,6 +54,8 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cudnnConvolutionBwdDataAlgo_t algo = CONV_ALGO; cudnnConvolutionBwdDataAlgo_t algo = CONV_ALGO;
cuda_enter(c->ctx);
#ifdef CHOOSE_ALGO #ifdef CHOOSE_ALGO
static int reuse_algo = 0; static int reuse_algo = 0;
static cudnnConvolutionBwdDataAlgo_t prev_algo = CONV_ALGO; static cudnnConvolutionBwdDataAlgo_t prev_algo = CONV_ALGO;
...@@ -83,6 +85,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -83,6 +85,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1; return 1;
} }
...@@ -94,6 +97,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -94,6 +97,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cudaGetLastError(); cudaGetLastError();
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the memory " PyErr_Format(PyExc_RuntimeError, "Error when trying to find the memory "
"information on the GPU: %s\n", cudaGetErrorString(err2)); "information on the GPU: %s\n", cudaGetErrorString(err2));
cuda_exit(c->ctx);
return 1; return 1;
} }
...@@ -104,6 +108,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -104,6 +108,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1; return 1;
} }
#endif #endif
...@@ -136,6 +141,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -136,6 +141,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"error getting convolution properties: %s", "error getting convolution properties: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1; return 1;
} }
...@@ -149,7 +155,6 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -149,7 +155,6 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
size_t worksize; size_t worksize;
gpudata *workspace; gpudata *workspace;
PyGpuContextObject *c;
err = cudnnGetConvolutionBackwardDataWorkspaceSize( err = cudnnGetConvolutionBackwardDataWorkspaceSize(
_handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc, _handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
...@@ -158,15 +163,16 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -158,15 +163,16 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s", PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1; return 1;
} }
if (worksize != 0) { if (worksize != 0) {
c = pygpu_default_context();
workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL); workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) { if (workspace == NULL) {
PyErr_SetString(PyExc_RuntimeError, PyErr_SetString(PyExc_RuntimeError,
"Could not allocate working memory"); "Could not allocate working memory");
cuda_exit(c->ctx);
return 1; return 1;
} }
} }
...@@ -183,6 +189,8 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -183,6 +189,8 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
if (worksize != 0) if (worksize != 0)
c->ops->buffer_release(workspace); c->ops->buffer_release(workspace);
cuda_exit(c->ctx);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error doing operation: %s", PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
......
...@@ -9,6 +9,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -9,6 +9,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
float af = alpha, bf = beta; float af = alpha, bf = beta;
void *alpha_p; void *alpha_p;
void *beta_p; void *beta_p;
PyGpuContextObject *c = pygpu_default_context();
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1]) { if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1]) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
...@@ -42,8 +43,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -42,8 +43,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
Py_INCREF(*kerns); Py_INCREF(*kerns);
#else #else
if (theano_prep_output(kerns, PyGpuArray_NDIM(km), PyGpuArray_DIMS(km), if (theano_prep_output(kerns, PyGpuArray_NDIM(km), PyGpuArray_DIMS(km),
km->ga.typecode, GA_C_ORDER, km->ga.typecode, GA_C_ORDER, c) != 0)
pygpu_default_context()) != 0)
return 1; return 1;
if (beta != 0.0 && pygpu_move(*kerns, km)) if (beta != 0.0 && pygpu_move(*kerns, km))
return 1; return 1;
...@@ -54,6 +54,8 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -54,6 +54,8 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cudnnConvolutionBwdFilterAlgo_t algo = CONV_ALGO; cudnnConvolutionBwdFilterAlgo_t algo = CONV_ALGO;
cuda_enter(c->ctx);
#ifdef CHOOSE_ALGO #ifdef CHOOSE_ALGO
static int reuse_algo = 0; static int reuse_algo = 0;
static cudnnConvolutionBwdFilterAlgo_t prev_algo = CONV_ALGO; static cudnnConvolutionBwdFilterAlgo_t prev_algo = CONV_ALGO;
...@@ -84,6 +86,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -84,6 +86,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s", "error selecting convolution algo: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1; return 1;
} }
...@@ -95,6 +98,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -95,6 +98,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cudaGetLastError(); cudaGetLastError();
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the memory " PyErr_Format(PyExc_RuntimeError, "Error when trying to find the memory "
"information on the GPU: %s\n", cudaGetErrorString(err2)); "information on the GPU: %s\n", cudaGetErrorString(err2));
cuda_exit(c->ctx);
return 1; return 1;
} }
...@@ -106,6 +110,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -106,6 +110,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s", "error selecting convolution algo: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1; return 1;
} }
#endif #endif
...@@ -138,6 +143,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -138,6 +143,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"error getting convolution properties: %s", "error getting convolution properties: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1; return 1;
} }
...@@ -151,7 +157,6 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -151,7 +157,6 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
size_t worksize; size_t worksize;
gpudata *workspace; gpudata *workspace;
PyGpuContextObject *c;
err = cudnnGetConvolutionBackwardFilterWorkspaceSize( err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc, _handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
...@@ -160,14 +165,15 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -160,14 +165,15 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s", PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1; return 1;
} }
if (worksize != 0) { if (worksize != 0) {
c = pygpu_default_context();
workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL); workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) { if (workspace == NULL) {
PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory"); PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory");
cuda_exit(c->ctx);
return 1; return 1;
} }
} }
...@@ -184,6 +190,8 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -184,6 +190,8 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
if (worksize != 0) if (worksize != 0)
c->ops->buffer_release(workspace); c->ops->buffer_release(workspace);
cuda_exit(c->ctx);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error doing operation: %s", PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
......
#section support_code_struct
/* Per-apply cuDNN tensor descriptors for the pooling forward op:
 * one describing the input image, one describing the pooled output. */
cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
#section init_code_struct
/* Create both descriptors up front.  On failure a Python MemoryError is
 * set and FAIL aborts struct initialization; the NULL pre-assignment
 * lets cleanup below tell apart created vs. never-created descriptors. */
cudnnStatus_t APPLY_SPECIFIC(err);
APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(output) = NULL;
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
"(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL;
}
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
"(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL;
}
#section cleanup_code_struct
/* Only destroy descriptors that were actually created. */
if (APPLY_SPECIFIC(input) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input)); }
if (APPLY_SPECIFIC(output) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output)); }
#section support_code_struct
/* Pooling forward pass via cuDNN.
 *
 * img:  contiguous input array, 4d (2 spatial dims) or 5d (3 spatial dims).
 * desc: cuDNN pooling descriptor holding mode, window, padding and stride.
 * out:  receives the pooled result (allocated or reused by
 *       theano_prep_output).
 *
 * Returns 0 on success, 1 on failure with a Python exception set.
 *
 * Fix: the error message for the descriptor query previously named
 * "cudnnGetPoolingDescriptor", which is not the function being called;
 * it now reports cudnnGetPoolingNdDescriptor.
 */
int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
                             cudnnPoolingDescriptor_t desc,
                             PyGpuArrayObject **out) {
  cudnnStatus_t err;
  size_t dims[5];
  PyGpuContextObject *c = pygpu_default_context();
  cudnnPoolingMode_t mode;
  int w[3];
  int p[3];
  int s[3];
  int ndims;

  /* cuDNN pooling requires a C-contiguous input. */
  if (!GpuArray_IS_C_CONTIGUOUS(&img->ga)) {
    PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
    return 1;
  }

  if (c_set_tensorNd(img, APPLY_SPECIFIC(input)) != 0)
    return 1;

  /* Fetch window (w), padding (p) and stride (s) for up to 3 spatial
   * dimensions; ndims is the actual number of spatial dimensions. */
  err = cudnnGetPoolingNdDescriptor(desc, 3, &mode, &ndims, w, p, s);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "error doing cudnnGetPoolingNdDescriptor operation: %s",
                 cudnnGetErrorString(err));
    return 1;
  }

  /* Output shape: batch and channel dims are preserved; each spatial
   * dim follows the pooling formula (size + 2*pad - window)/stride + 1. */
  dims[0] = PyGpuArray_DIM(img, 0);
  dims[1] = PyGpuArray_DIM(img, 1);
  dims[2] = (PyGpuArray_DIM(img, 2) + (p[0]*2) - w[0]) / s[0] + 1;
  dims[3] = (PyGpuArray_DIM(img, 3) + (p[1]*2) - w[1]) / s[1] + 1;
  if (ndims == 3)
    dims[4] = (PyGpuArray_DIM(img, 4) + (p[2]*2) - w[2]) / s[2] + 1;

  if (theano_prep_output(out, ndims+2, dims, img->ga.typecode,
                         GA_C_ORDER, c) != 0)
    return 1;

  if (c_set_tensorNd(*out, APPLY_SPECIFIC(output)) != 0)
    return 1;

  {
    const float alpha = 1;
    const float beta = 0;
    /* Activate the GPU context only around the actual cuDNN call. */
    cuda_enter(c->ctx);
    err = cudnnPoolingForward(
      _handle, desc,
      &alpha,
      APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(img),
      &beta,
      APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*out));
    cuda_exit(c->ctx);
  }
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "GpuDnnPool: error doing cudnnPoolingForward operation: %s",
                 cudnnGetErrorString(err));
    return 1;
  }
  return 0;
}
#section support_code_struct
/* Per-apply cuDNN tensor descriptors for the pooling gradient op:
 * forward input/output and their corresponding gradients. */
cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t APPLY_SPECIFIC(input_grad);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output_grad);
#section init_code_struct
/* NULL pre-assignment lets cleanup tell apart created vs. never-created
 * descriptors when one of the creations below fails mid-way. */
APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(input_grad) = NULL;
APPLY_SPECIFIC(output) = NULL;
APPLY_SPECIFIC(output_grad) = NULL;
{
cudnnStatus_t err;
/* On any failure a Python MemoryError is set and FAIL aborts init. */
if ((err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError,
"could not allocate tensor descriptor (input): %s",
cudnnGetErrorString(err));
FAIL;
}
if ((err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input_grad))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError,
"could not allocate tensor descriptor (input_grad): %s",
cudnnGetErrorString(err));
FAIL;
}
if ((err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError,
"could not allocate tensor descriptor (output): %s",
cudnnGetErrorString(err));
FAIL;
}
if ((err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output_grad))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError,
"could not allocate tensor descriptor (output_grad): %s",
cudnnGetErrorString(err));
FAIL;
}
}
#section cleanup_code_struct
/* Only destroy descriptors that were actually created. */
if (APPLY_SPECIFIC(input) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input)); }
if (APPLY_SPECIFIC(input_grad) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input_grad)); }
if (APPLY_SPECIFIC(output) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output)); }
if (APPLY_SPECIFIC(output_grad) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output_grad)); }
#section support_code_struct
/* Pooling backward pass via cuDNN: computes the gradient with respect
 * to the pooling input.
 *
 * inp:      contiguous forward-pass input.
 * out:      contiguous forward-pass output.
 * out_grad: contiguous gradient flowing in from above (same shape as out).
 * desc:     cuDNN pooling descriptor used in the forward pass.
 * inp_grad: receives the gradient wrt inp (same shape as inp).
 *
 * Returns 0 on success, 1 on failure with a Python exception set.
 *
 * Fix: theano_prep_output previously re-fetched pygpu_default_context()
 * even though the context is already bound to `c` at the top of the
 * function; it now reuses `c`, consistent with the other ops in this
 * commit (explicit context activation).
 */
int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
                                  PyGpuArrayObject *out,
                                  PyGpuArrayObject *out_grad,
                                  cudnnPoolingDescriptor_t desc,
                                  PyGpuArrayObject **inp_grad) {
  cudnnStatus_t err;
  PyGpuContextObject *c = pygpu_default_context();

  /* cuDNN requires all three source arrays to be C-contiguous. */
  if (!GpuArray_IS_C_CONTIGUOUS(&inp->ga)) {
    PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
    return 1;
  }
  if (!GpuArray_IS_C_CONTIGUOUS(&out_grad->ga)) {
    PyErr_SetString(PyExc_ValueError, "Only contiguous input gradients are supported.");
    return 1;
  }
  if (!GpuArray_IS_C_CONTIGUOUS(&out->ga)) {
    PyErr_SetString(PyExc_ValueError, "Only contiguous outputs are supported.");
    return 1;
  }

  if (c_set_tensorNd(inp, APPLY_SPECIFIC(input)) != 0)
    return 1;
  if (c_set_tensorNd(out_grad, APPLY_SPECIFIC(output_grad)) != 0)
    return 1;
  if (c_set_tensorNd(out, APPLY_SPECIFIC(output)) != 0)
    return 1;

  /* The input gradient has the same shape as the forward input. */
  if (theano_prep_output(inp_grad, PyGpuArray_NDIM(inp),
                         PyGpuArray_DIMS(inp), out->ga.typecode,
                         GA_C_ORDER, c) != 0) {
    return 1;
  }

  if (c_set_tensorNd(*inp_grad, APPLY_SPECIFIC(input_grad)) != 0)
    return 1;

  {
    const float alpha = 1;
    const float beta = 0;
    /* Activate the GPU context only around the actual cuDNN call. */
    cuda_enter(c->ctx);
    err = cudnnPoolingBackward(
      _handle, desc,
      &alpha,
      APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(out),
      APPLY_SPECIFIC(output_grad), PyGpuArray_DEV_DATA(out_grad),
      APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(inp),
      &beta,
      APPLY_SPECIFIC(input_grad), PyGpuArray_DEV_DATA(*inp_grad)
    );
    cuda_exit(c->ctx);
  }
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error doing operation: %s.",
                 cudnnGetErrorString(err));
    return 1;
  }
  return 0;
}
#section support_code_struct
/* Per-apply cuDNN tensor descriptors for the softmax forward op. */
cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
#section init_code_struct
/* NULL pre-assignment lets cleanup tell apart created vs. never-created
 * descriptors when the second creation below fails. */
APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(output) = NULL;
{
cudnnStatus_t err;
err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input));
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor: %s",
cudnnGetErrorString(err));
FAIL;
}
err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output));
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor: %s",
cudnnGetErrorString(err));
FAIL;
}
}
#section cleanup_code_struct
/* Only destroy descriptors that were actually created. */
if (APPLY_SPECIFIC(input) != NULL)
cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input));
if (APPLY_SPECIFIC(output) != NULL)
cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output));
#section support_code_struct
/* Softmax forward pass via cuDNN.
 *
 * x:   input array; the output has the same shape and typecode.
 * out: receives the softmax result (allocated or reused by
 *      theano_prep_output).
 *
 * SOFTMAX_ALGO / SOFTMAX_MODE are compile-time parameters of this op.
 * Returns 0 on success, 1 on failure with a Python exception set.
 */
int APPLY_SPECIFIC(softmax)(PyGpuArrayObject *x,
                            PyGpuArrayObject **out) {
  PyGpuContextObject *c = pygpu_default_context();
  cudnnStatus_t err;
  const float alpha = 1.;
  const float beta = 0.;

  /* Describe the input, prepare the output buffer, describe the output. */
  if (c_set_tensorNd(x, APPLY_SPECIFIC(input)) != 0)
    return 1;
  if (theano_prep_output(out, PyGpuArray_NDIM(x),
                         PyGpuArray_DIMS(x), x->ga.typecode,
                         GA_C_ORDER, c) != 0)
    return 1;
  if (c_set_tensorNd(*out, APPLY_SPECIFIC(output)) != 0)
    return 1;

  /* Activate the GPU context only around the actual cuDNN call. */
  cuda_enter(c->ctx);
  err = cudnnSoftmaxForward(_handle, SOFTMAX_ALGO, SOFTMAX_MODE,
                            (void *)&alpha,
                            APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(x),
                            (void *)&beta,
                            APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*out));
  cuda_exit(c->ctx);

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error during operation: %s",
                 cudnnGetErrorString(err));
    return 1;
  }
  return 0;
}
#section support_code_struct
/* Per-apply cuDNN tensor descriptors for the softmax gradient op:
 * incoming gradient (dy), forward softmax output (sm), result (out). */
cudnnTensorDescriptor_t APPLY_SPECIFIC(dy);
cudnnTensorDescriptor_t APPLY_SPECIFIC(sm);
cudnnTensorDescriptor_t APPLY_SPECIFIC(out);
#section init_code_struct
/* NULL pre-assignment lets cleanup tell apart created vs. never-created
 * descriptors when one of the creations below fails mid-way. */
APPLY_SPECIFIC(dy) = NULL;
APPLY_SPECIFIC(sm) = NULL;
APPLY_SPECIFIC(out) = NULL;
{
cudnnStatus_t err;
err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(dy));
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor: %s",
cudnnGetErrorString(err));
FAIL;
}
err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(sm));
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor: %s",
cudnnGetErrorString(err));
FAIL;
}
err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(out));
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor: %s",
cudnnGetErrorString(err));
FAIL;
}
}
#section cleanup_code_struct
/* Only destroy descriptors that were actually created. */
if (APPLY_SPECIFIC(dy) != NULL)
cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(dy));
if (APPLY_SPECIFIC(sm) != NULL)
cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(sm));
if (APPLY_SPECIFIC(out) != NULL)
cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(out));
#section support_code_struct
/* Softmax gradient via cuDNN.
 *
 * dy:  gradient flowing in from above.
 * sm:  output of the forward softmax pass.
 * out: receives the gradient wrt the softmax input (same shape/type as dy).
 *
 * SOFTMAX_ALGO / SOFTMAX_MODE are compile-time parameters of this op.
 * Returns 0 on success, 1 on failure with a Python exception set.
 */
int APPLY_SPECIFIC(softmax_grad)(PyGpuArrayObject *dy,
                                 PyGpuArrayObject *sm,
                                 PyGpuArrayObject **out) {
  PyGpuContextObject *c = pygpu_default_context();
  cudnnStatus_t err;
  const float alpha = 1.;
  const float beta = 0.;

  /* Describe both inputs for cuDNN. */
  if (c_set_tensorNd(dy, APPLY_SPECIFIC(dy)) != 0 ||
      c_set_tensorNd(sm, APPLY_SPECIFIC(sm)) != 0)
    return 1;

  /* Prepare and describe the output buffer (same shape/type as dy). */
  if (theano_prep_output(out, PyGpuArray_NDIM(dy),
                         PyGpuArray_DIMS(dy), dy->ga.typecode,
                         GA_C_ORDER, c) != 0)
    return 1;
  if (c_set_tensorNd(*out, APPLY_SPECIFIC(out)) != 0)
    return 1;

  /* Activate the GPU context only around the actual cuDNN call. */
  cuda_enter(c->ctx);
  err = cudnnSoftmaxBackward(_handle, SOFTMAX_ALGO, SOFTMAX_MODE,
                             (void *)&alpha,
                             APPLY_SPECIFIC(sm), PyGpuArray_DEV_DATA(sm),
                             APPLY_SPECIFIC(dy), PyGpuArray_DEV_DATA(dy),
                             (void *)&beta,
                             APPLY_SPECIFIC(out), PyGpuArray_DEV_DATA(*out));
  cuda_exit(c->ctx);

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error during operation: %s",
                 cudnnGetErrorString(err));
    return 1;
  }
  return 0;
}
...@@ -207,11 +207,10 @@ def test_pooling(): ...@@ -207,11 +207,10 @@ def test_pooling():
(32, 1, 147, 197), (32, 1, 147, 197),
]: ]:
data = numpy.random.normal(0, 1, shp).astype("float32") data = numpy.random.normal(0, 1, shp).astype("float32")
a = f1(data).__array__() a = f1(data)
b = f2(data)
b = f2(data).__array__() utt.assert_allclose(a, b)
assert numpy.allclose(a, b,
atol=numpy.finfo(numpy.float32).eps)
# Test the grad # Test the grad
for shp in [(1, 1, 2, 2), for shp in [(1, 1, 2, 2),
...@@ -228,9 +227,9 @@ def test_pooling(): ...@@ -228,9 +227,9 @@ def test_pooling():
def fn(x): def fn(x):
return max_pool_2d(x, (ws, ws), ignore_border=True, return max_pool_2d(x, (ws, ws), ignore_border=True,
padding=pad, mode=mode) padding=pad, mode=mode)
theano.tests.unittest_tools.verify_grad(fn, [data], utt.verify_grad(fn, [data],
cast_to_output_type=False, cast_to_output_type=False,
mode=mode_with_gpu) mode=mode_with_gpu)
# Confirm that the opt would have inserted it. # Confirm that the opt would have inserted it.
fg = theano.function([x], theano.grad(fn(x).sum(), x), fg = theano.function([x], theano.grad(fn(x).sum(), x),
mode=mode_with_gpu) mode=mode_with_gpu)
...@@ -245,10 +244,9 @@ def test_pooling(): ...@@ -245,10 +244,9 @@ def test_pooling():
pad=pad, pad=pad,
mode=mode) mode=mode)
return dnn_op return dnn_op
theano.tests.unittest_tools.verify_grad( utt.verify_grad(fn, [data],
fn, [data], cast_to_output_type=False,
cast_to_output_type=False, mode=mode_with_gpu)
mode=mode_with_gpu)
# Confirm that we get the good op. # Confirm that we get the good op.
fg = theano.function([x], theano.grad(fn(x).sum(), x), fg = theano.function([x], theano.grad(fn(x).sum(), x),
mode=mode_with_gpu) mode=mode_with_gpu)
...@@ -256,7 +254,7 @@ def test_pooling(): ...@@ -256,7 +254,7 @@ def test_pooling():
for node in fg.maker.fgraph.toposort()]) for node in fg.maker.fgraph.toposort()])
g_out = fg(data) g_out = fg(data)
# Compare again the CPU result # Compare against the CPU result
out = max_pool_2d(x, (ws, ws), out = max_pool_2d(x, (ws, ws),
padding=pad, padding=pad,
ignore_border=True, mode=mode) ignore_border=True, mode=mode)
...@@ -269,7 +267,7 @@ def test_pooling(): ...@@ -269,7 +267,7 @@ def test_pooling():
assert any([isinstance(node.op, AveragePoolGrad) assert any([isinstance(node.op, AveragePoolGrad)
for node in fc.maker.fgraph.toposort()]) for node in fc.maker.fgraph.toposort()])
c_out = fc(data) c_out = fc(data)
assert numpy.allclose(c_out, g_out) utt.assert_allclose(c_out, g_out)
def test_pooling_opt(): def test_pooling_opt():
...@@ -703,7 +701,7 @@ class test_SoftMax(test_nnet.test_SoftMax): ...@@ -703,7 +701,7 @@ class test_SoftMax(test_nnet.test_SoftMax):
out = f(data) out = f(data)
gout = numpy.asarray(f_gpu(gdata))[:, :, 0, 0] gout = numpy.asarray(f_gpu(gdata))[:, :, 0, 0]
assert numpy.allclose(out, gout), numpy.absolute(out - gout) utt.assert_allclose(out, gout)
x = T.matrix('x', 'float32') x = T.matrix('x', 'float32')
x_gpu = T.tensor4('x_gpu', 'float32') x_gpu = T.tensor4('x_gpu', 'float32')
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论