Commit 8e3ffa84 authored by notoraptor

Wrap Op params for theano.gpuarray.subtensor.GpuAdvancedIncSubtensor1:

- inplace (bool scalar)
- set_instead_of_inc (bool scalar)
- context (gpu_context_type)
- ndim_input_0 (size_t scalar)
- ndim_input_1 (size_t scalar)
- typecode_input_0 (integer scalar)
- typecode_input_1 (integer scalar)

The last 4 params are used to reduce C code variability in c_init_code_struct().

Wrap Op params for theano.gpuarray.subtensor.GpuAdvancedIncSubtensor1_dev20: same as theano.gpuarray.subtensor.GpuAdvancedIncSubtensor1.
Parent 7ef73d21
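For context, a minimal sketch of the ParamsType mechanism this commit adopts (DummyOp and its fields below are hypothetical, not part of the diff): the keywords passed to ParamsType declare named fields, and get_params() fills them from attributes of the objects it is given plus explicit keyword arguments, exactly as the calls in the hunks below do.

# A minimal sketch, assuming only the ParamsType API visible in this
# diff; DummyOp is a hypothetical stand-in, not a real Theano Op.
from theano.gof import ParamsType
from theano.scalar import bool as bool_t, int32 as int_t

class DummyOp(object):
    # Each keyword declares one field of the wrapped params struct.
    params_type = ParamsType(inplace=bool_t, typecode=int_t)

    def __init__(self, inplace=False):
        self.inplace = inplace

op = DummyOp(inplace=True)
# 'inplace' is read from the attribute on `op`; 'typecode' is supplied
# explicitly, mirroring get_params(self, context=..., ndim_input_0=...).
params = op.params_type.get_params(op, typecode=7)
print(params.inplace, params.typecode)  # prints the two wrapped values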
@@ -7,9 +7,11 @@ from six import integer_types
from six.moves import StringIO
from theano import tensor, gof, Op
from theano.gof import ParamsType
from theano.gradient import grad_not_implemented
import theano.tensor as T
from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list
from theano.scalar import bool as bool_t, int32 as int_t, uint32 as size_t
try:
import pygpu
@@ -594,7 +596,15 @@ class GpuAdvancedIncSubtensor1(Op):
"""
_f16_ok = True
__props__ = ('inplace', 'set_instead_of_inc')
params_type = gpu_context_type
params_type = ParamsType(inplace=bool_t,
set_instead_of_inc=bool_t,
context=gpu_context_type,
# following params are used in c_init_code_struct(),
# as inputs are not available in that function.
ndim_input_0=size_t,
ndim_input_1=size_t,
typecode_input_0=int_t,
typecode_input_1=int_t)
def __init__(self, inplace=False, set_instead_of_inc=False):
self.inplace = inplace
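On the C side, the wrapped params arrive as a pointer to a generated struct whose variable name is exposed through sub['params']; fields declared above become struct members. A condensed, hypothetical sketch of the access pattern used throughout the hunks below:

# Sketch only: how an Op's C template reads a ParamsType field.
def c_code(self, node, name, inputs, outputs, sub):
    # sub['params'] names the C struct pointer generated for params_type.
    return """
    if (%(params)s->inplace) {
        /* reuse the input storage instead of copying */
    }
    """ % dict(params=sub['params'])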
@@ -634,12 +644,17 @@ class GpuAdvancedIncSubtensor1(Op):
return gof.Apply(self, [x_, y_, ilist_], [x_.type()])
def get_params(self, node):
return node.outputs[0].type.context
return self.params_type.get_params(self, context=node.outputs[0].type.context,
# following params are used in c_init_code_struct().
ndim_input_0=node.inputs[0].ndim,
ndim_input_1=node.inputs[1].ndim,
typecode_input_0=node.inputs[0].type.typecode,
typecode_input_1=node.inputs[1].type.typecode)
# We can't use the parent version that loops on each index
# as we also need to loop when set_instead_of_inc is True and the
# parent doesn't loop in that case.
def perform(self, node, inp, out_, ctx=None):
def perform(self, node, inp, out_, params=None):
# TODO opt to make this inplace
x, y, idx = inp
out, = out_
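Note the perform() signature change above: with a params_type declared, the extra argument Theano passes is the whole Params object rather than the bare GPU context, so the context is now reached as a field. A hypothetical sketch:

# Sketch: Python-side access to the same fields the C code sees.
def perform(self, node, inputs, output_storage, params=None):
    ctx = params.context  # the value get_params() used to return directly
    assert params.ndim_input_0 == node.inputs[0].ndim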
@@ -700,21 +715,18 @@ class GpuAdvancedIncSubtensor1(Op):
return """
gpuelemwise_arg args[2] = {{0}};
args[0].name = "a";
args[0].typecode = %(type1)s;
args[0].typecode = %(params)s->typecode_input_0;
args[0].flags = GE_READ|GE_WRITE;
args[1].name = "b";
args[1].typecode = %(type2)s;
args[1].typecode = %(params)s->typecode_input_1;
args[1].flags = GE_READ;
iadd = GpuElemwise_new(%(ctx)s->ctx, "", "a += b",
2, args, %(nd)s, GE_CONVERT_F16);
iadd = GpuElemwise_new(%(params)s->context->ctx, "", "a += b",
2, args, %(params)s->ndim_input_1, GE_CONVERT_F16);
if (iadd == NULL) {
PyErr_SetString(PyExc_RuntimeError, "Could not intialize inplace add support");
%(fail)s
}
""" % dict(ctx=sub['params'], fail=sub['fail'],
type1=node.inputs[0].type.typecode,
type2=node.inputs[1].type.typecode,
nd=node.inputs[1].ndim)
""" % dict(params=sub['params'], fail=sub['fail'])
def c_code(self, node, name, inputs, outputs, sub):
if (node.inputs[0].ndim != node.inputs[1].ndim):
@@ -722,18 +734,26 @@ class GpuAdvancedIncSubtensor1(Op):
return """
PyGpuArrayObject *row_x, *row_y;
ssize_t start[%(nd)s], step[%(nd)s];
size_t nd = %(params)s->ndim_input_0;
ssize_t *start = NULL, *step = NULL;
size_t num_indices, j;
int ret;
int broadcast_y;
for (j = 0; j < %(nd)s; j++) {
start = (ssize_t*)malloc(nd * sizeof(ssize_t));
step = (ssize_t*)malloc(nd * sizeof(ssize_t));
if (start == NULL || step == NULL) {
PyErr_NoMemory();
%(fail)s
}
for (j = 0; j < nd; ++j) {
start[j] = 0;
step[j] = 1;
}
step[0] = 0;
num_indices = PyArray_SIZE(%(ind)s);
if (!%(inplace)s) {
if (!%(params)s->inplace) {
%(out)s = theano_try_copy(%(out)s, %(x)s);
if (%(out)s == NULL) {
// Exception already set
@@ -774,7 +794,7 @@ class GpuAdvancedIncSubtensor1(Op):
%(fail)s;
}
if (%(set_instead_of_inc)s) {
if (%(params)s->set_instead_of_inc) {
ret = GpuArray_setarray(&row_x->ga, &row_y->ga);
} else {
void *args[2];
@@ -788,13 +808,21 @@ class GpuAdvancedIncSubtensor1(Op):
PyErr_SetString(PyExc_RuntimeError, "Failed to set/inc elements");
}
}
free(start);
free(step);
""" % dict(x=inputs[0], y=inputs[1], ind=inputs[2], out=outputs[0],
fail=sub['fail'], inplace=int(self.inplace),
nd=node.inputs[0].ndim,
set_instead_of_inc=int(self.set_instead_of_inc))
params=sub['params'],
fail="""
{
free(start);
free(step);
%(fail)s
}
""" % dict(fail=sub['fail']))
def c_code_cache_version(self):
return (3,)
return (4,)
class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
@@ -805,6 +833,8 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
"""
_f16_ok = True
params_type = GpuAdvancedIncSubtensor1.params_type
get_params = GpuAdvancedIncSubtensor1.get_params
def make_node(self, x, y, ilist):
"""
@@ -837,14 +867,11 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
return gof.Apply(self, [x_, y_, ilist_], [x_.type()])
def get_params(self, node):
return node.outputs[0].type.context
def perform(self, node, inp, out, ctx):
def perform(self, node, inp, out, params):
return super(GpuAdvancedIncSubtensor1_dev20, self).perform(node, inp, out)
def c_code_cache_version(self):
return (12,)
return (13,)
def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray_helper.h>',
@@ -854,7 +881,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
return [os.path.dirname(__file__)]
def c_code(self, node, name, inputs, outputs, sub):
ctx = self.get_params(node)
ctx = self.get_params(node).context
if ctx.kind != b'cuda':
raise NotImplementedError("cuda only")
if (node.inputs[0].ndim != node.inputs[1].ndim or
@@ -862,16 +889,9 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
int(ctx.bin_id[-2]) < 2):
raise NotImplementedError("This case does not have C code yet.")
x = inputs[0]
y = inputs[1]
ind = inputs[2]
out = outputs[0]
fail = sub['fail']
set_instead_of_inc = int(self.set_instead_of_inc)
inplace = int(self.inplace)
return """
int err;
if (%(inplace)s) {
if (%(params)s->inplace) {
Py_XDECREF(%(out)s);
%(out)s = %(x)s;
Py_INCREF(%(out)s);
@@ -882,25 +902,19 @@ if (!%(out)s) {
// Exception already set
%(fail)s
}
if (GpuArray_vector_add_fast(%(out)s, %(y)s, %(ind)s, %(set_instead_of_inc)s)) {
if (GpuArray_vector_add_fast(%(out)s, %(y)s, %(ind)s, %(params)s->set_instead_of_inc)) {
%(fail)s
}
""" % locals()
""" % dict(x=inputs[0], y=inputs[1], ind=inputs[2], out=outputs[0], fail=sub['fail'], params=sub['params'])
def gpu_kernels(self, node, nodename):
dtype_x = node.inputs[0].dtype
dtype_y = node.inputs[1].dtype
dtype_ind = node.inputs[2].dtype
dtype_out = node.outputs[0].dtype
itemsize_x = np.dtype(dtype_x).itemsize
itemsize_y = np.dtype(dtype_y).itemsize
itemsize_ind = np.dtype(dtype_ind).itemsize
itemsize_out = np.dtype(dtype_out).itemsize
flags = Kernel.get_flags(dtype_x, dtype_y, dtype_ind)
type_x = gpuarray.dtype_to_ctype(dtype_x)
type_y = gpuarray.dtype_to_ctype(dtype_y)
type_ind = gpuarray.dtype_to_ctype(dtype_ind)
type_out = gpuarray.dtype_to_ctype(dtype_out)
flags = Kernel.get_flags(dtype_x, dtype_y, dtype_ind)
kname = "k_vector_add_fast"
k_var = "k_vector_add_fast_" + nodename
code = """
@@ -1010,7 +1024,7 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
}
return;
}
""" % locals()
""" % dict(type_x=type_x, type_y=type_y, type_ind=type_ind)
params = [
'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp',
'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp',
@@ -1020,26 +1034,19 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
flags=flags, objvar=k_var)]
def c_support_code_struct(self, node, nodename):
dtype_x = node.inputs[0].dtype
dtype_y = node.inputs[1].dtype
dtype_ind = node.inputs[2].dtype
dtype_out = node.outputs[0].dtype
itemsize_x = np.dtype(dtype_x).itemsize
itemsize_y = np.dtype(dtype_y).itemsize
itemsize_ind = np.dtype(dtype_ind).itemsize
itemsize_out = np.dtype(dtype_out).itemsize
k_var = "k_vector_add_fast_" + nodename
return super(GpuAdvancedIncSubtensor1_dev20, self).c_support_code_struct(node, nodename) + """
int GpuArray_vector_add_fast(PyGpuArrayObject* py_self,
PyGpuArrayObject* py_other,
PyGpuArrayObject *indices_arr,
PyGpuArrayObject* indices_arr,
const int set_instead_of_inc)
{
size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(py_self)[1], (size_t)256), 1, 1};
size_t n_blocks[3] = {std::min(PyGpuArray_SIZE(indices_arr), (size_t)4096), 1, 1};
gpudata *errbuf;
int err, kerr = 0;
size_t itemsize_x = GpuArray_ITEMSIZE(&py_self->ga);
size_t itemsize_y = GpuArray_ITEMSIZE(&py_other->ga);
size_t itemsize_ind = GpuArray_ITEMSIZE(&indices_arr->ga);
if (threads_per_block[0] > 0 && n_blocks[0] > 0) {
err = gpudata_property(py_self->ga.data,
@@ -1049,11 +1056,11 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
return 1;
}
ssize_t stride_X0 = PyGpuArray_STRIDES(py_self)[0] / %(itemsize_x)s;
ssize_t stride_X1 = PyGpuArray_STRIDES(py_self)[1] / %(itemsize_x)s;
ssize_t stride_Y0 = PyGpuArray_DIMS(py_other)[0] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[0] / %(itemsize_y)s;
ssize_t stride_Y1 = PyGpuArray_DIMS(py_other)[1] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[1] / %(itemsize_y)s;
ssize_t stride_ind = PyGpuArray_STRIDES(indices_arr)[0] / %(itemsize_ind)s;
ssize_t stride_X0 = PyGpuArray_STRIDES(py_self)[0] / itemsize_x;
ssize_t stride_X1 = PyGpuArray_STRIDES(py_self)[1] / itemsize_x;
ssize_t stride_Y0 = PyGpuArray_DIMS(py_other)[0] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[0] / itemsize_y;
ssize_t stride_Y1 = PyGpuArray_DIMS(py_other)[1] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[1] / itemsize_y;
ssize_t stride_ind = PyGpuArray_STRIDES(indices_arr)[0] / itemsize_ind;
void *kernel_params[] = {(void *)&PyGpuArray_DIMS(py_self)[0],
(void *)&PyGpuArray_DIMS(py_self)[1],
(void *)&stride_X0,
@@ -1093,7 +1100,7 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
}
return 0;
}
""" % locals()
""" % dict(k_var="k_vector_add_fast_" + nodename)
class GpuExtractDiag(Op):
......