提交 8e3ffa84 authored 作者: notoraptor's avatar notoraptor

Wrap Op params for theano.gpuarray.subtensor.GpuAdvancedIncSubtensor1:

- inplace (bool scalar) - set_instead_of_inc (bool scalar) - context (gpu_context_type) - ndim_input_0 (size_t scalar) - ndim_input_1 (size_t scalar) - typecode_input_0 (integer scalar) - typecode_input_1 (integer scalar) The 4 last params are used to reduce C code variability into c_init_code_struct(). Wrap Op params for theano.gpuarray.subtensor.GpuAdvancedIncSubtensor1_dev20: Same as theano.gpuarray.subtensor.GpuAdvancedIncSubtensor1.
上级 7ef73d21
...@@ -7,9 +7,11 @@ from six import integer_types ...@@ -7,9 +7,11 @@ from six import integer_types
from six.moves import StringIO from six.moves import StringIO
from theano import tensor, gof, Op from theano import tensor, gof, Op
from theano.gof import ParamsType
from theano.gradient import grad_not_implemented from theano.gradient import grad_not_implemented
import theano.tensor as T import theano.tensor as T
from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list
from theano.scalar import bool as bool_t, int32 as int_t, uint32 as size_t
try: try:
import pygpu import pygpu
...@@ -594,7 +596,15 @@ class GpuAdvancedIncSubtensor1(Op): ...@@ -594,7 +596,15 @@ class GpuAdvancedIncSubtensor1(Op):
""" """
_f16_ok = True _f16_ok = True
__props__ = ('inplace', 'set_instead_of_inc') __props__ = ('inplace', 'set_instead_of_inc')
params_type = gpu_context_type params_type = ParamsType(inplace=bool_t,
set_instead_of_inc=bool_t,
context=gpu_context_type,
# following params are used into c_init_code_struct(),
# as inputs are not available in that function.
ndim_input_0=size_t,
ndim_input_1=size_t,
typecode_input_0=int_t,
typecode_input_1=int_t)
def __init__(self, inplace=False, set_instead_of_inc=False): def __init__(self, inplace=False, set_instead_of_inc=False):
self.inplace = inplace self.inplace = inplace
...@@ -634,12 +644,17 @@ class GpuAdvancedIncSubtensor1(Op): ...@@ -634,12 +644,17 @@ class GpuAdvancedIncSubtensor1(Op):
return gof.Apply(self, [x_, y_, ilist_], [x_.type()]) return gof.Apply(self, [x_, y_, ilist_], [x_.type()])
def get_params(self, node): def get_params(self, node):
return node.outputs[0].type.context return self.params_type.get_params(self, context=node.outputs[0].type.context,
# following params are used into c_init_code_struct().
ndim_input_0=node.inputs[0].ndim,
ndim_input_1=node.inputs[1].ndim,
typecode_input_0=node.inputs[0].type.typecode,
typecode_input_1=node.inputs[1].type.typecode)
# We can't use the parent version that loops on each index # We can't use the parent version that loops on each index
# as we also need to loop when set_instead_of_inc is True and the # as we also need to loop when set_instead_of_inc is True and the
# parent doesn't loop in that case. # parent doesn't loop in that case.
def perform(self, node, inp, out_, ctx=None): def perform(self, node, inp, out_, params=None):
# TODO opt to make this inplace # TODO opt to make this inplace
x, y, idx = inp x, y, idx = inp
out, = out_ out, = out_
...@@ -700,21 +715,18 @@ class GpuAdvancedIncSubtensor1(Op): ...@@ -700,21 +715,18 @@ class GpuAdvancedIncSubtensor1(Op):
return """ return """
gpuelemwise_arg args[2] = {{0}}; gpuelemwise_arg args[2] = {{0}};
args[0].name = "a"; args[0].name = "a";
args[0].typecode = %(type1)s; args[0].typecode = %(params)s->typecode_input_0;
args[0].flags = GE_READ|GE_WRITE; args[0].flags = GE_READ|GE_WRITE;
args[1].name = "b"; args[1].name = "b";
args[1].typecode = %(type2)s; args[1].typecode = %(params)s->typecode_input_1;
args[1].flags = GE_READ; args[1].flags = GE_READ;
iadd = GpuElemwise_new(%(ctx)s->ctx, "", "a += b", iadd = GpuElemwise_new(%(params)s->context->ctx, "", "a += b",
2, args, %(nd)s, GE_CONVERT_F16); 2, args, %(params)s->ndim_input_1, GE_CONVERT_F16);
if (iadd == NULL) { if (iadd == NULL) {
PyErr_SetString(PyExc_RuntimeError, "Could not intialize inplace add support"); PyErr_SetString(PyExc_RuntimeError, "Could not intialize inplace add support");
%(fail)s %(fail)s
} }
""" % dict(ctx=sub['params'], fail=sub['fail'], """ % dict(params=sub['params'], fail=sub['fail'])
type1=node.inputs[0].type.typecode,
type2=node.inputs[1].type.typecode,
nd=node.inputs[1].ndim)
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
if (node.inputs[0].ndim != node.inputs[1].ndim): if (node.inputs[0].ndim != node.inputs[1].ndim):
...@@ -722,18 +734,26 @@ class GpuAdvancedIncSubtensor1(Op): ...@@ -722,18 +734,26 @@ class GpuAdvancedIncSubtensor1(Op):
return """ return """
PyGpuArrayObject *row_x, *row_y; PyGpuArrayObject *row_x, *row_y;
ssize_t start[%(nd)s], step[%(nd)s]; size_t nd = %(params)s->ndim_input_0;
ssize_t *start = NULL, *step = NULL;
size_t num_indices, j; size_t num_indices, j;
int ret; int ret;
int broadcast_y; int broadcast_y;
for (j = 0; j < %(nd)s; j++) { start = (ssize_t*)malloc(nd * sizeof(ssize_t));
step = (ssize_t*)malloc(nd * sizeof(ssize_t));
if (start == NULL || step == NULL) {
PyErr_NoMemory();
%(fail)s
}
for (j = 0; j < nd; ++j) {
start[j] = 0; start[j] = 0;
step[j] = 1; step[j] = 1;
} }
step[0] = 0; step[0] = 0;
num_indices = PyArray_SIZE(%(ind)s); num_indices = PyArray_SIZE(%(ind)s);
if (!%(inplace)s) { if (!%(params)s->inplace) {
%(out)s = theano_try_copy(%(out)s, %(x)s); %(out)s = theano_try_copy(%(out)s, %(x)s);
if (%(out)s == NULL) { if (%(out)s == NULL) {
// Exception already set // Exception already set
...@@ -774,7 +794,7 @@ class GpuAdvancedIncSubtensor1(Op): ...@@ -774,7 +794,7 @@ class GpuAdvancedIncSubtensor1(Op):
%(fail)s; %(fail)s;
} }
if (%(set_instead_of_inc)s) { if (%(params)s->set_instead_of_inc) {
ret = GpuArray_setarray(&row_x->ga, &row_y->ga); ret = GpuArray_setarray(&row_x->ga, &row_y->ga);
} else { } else {
void *args[2]; void *args[2];
...@@ -788,13 +808,21 @@ class GpuAdvancedIncSubtensor1(Op): ...@@ -788,13 +808,21 @@ class GpuAdvancedIncSubtensor1(Op):
PyErr_SetString(PyExc_RuntimeError, "Failed to set/inc elements"); PyErr_SetString(PyExc_RuntimeError, "Failed to set/inc elements");
} }
} }
free(start);
free(step);
""" % dict(x=inputs[0], y=inputs[1], ind=inputs[2], out=outputs[0], """ % dict(x=inputs[0], y=inputs[1], ind=inputs[2], out=outputs[0],
fail=sub['fail'], inplace=int(self.inplace), params=sub['params'],
nd=node.inputs[0].ndim, fail="""
set_instead_of_inc=int(self.set_instead_of_inc)) {
free(start);
free(step);
%(fail)s
}
""" % dict(fail=sub['fail']))
def c_code_cache_version(self): def c_code_cache_version(self):
return (3,) return (4,)
class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC, class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
...@@ -805,6 +833,8 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC, ...@@ -805,6 +833,8 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
""" """
_f16_ok = True _f16_ok = True
params_type = GpuAdvancedIncSubtensor1.params_type
get_params = GpuAdvancedIncSubtensor1.get_params
def make_node(self, x, y, ilist): def make_node(self, x, y, ilist):
""" """
...@@ -837,14 +867,11 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC, ...@@ -837,14 +867,11 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
return gof.Apply(self, [x_, y_, ilist_], [x_.type()]) return gof.Apply(self, [x_, y_, ilist_], [x_.type()])
def get_params(self, node): def perform(self, node, inp, out, params):
return node.outputs[0].type.context
def perform(self, node, inp, out, ctx):
return super(GpuAdvancedIncSubtensor1_dev20, self).perform(node, inp, out) return super(GpuAdvancedIncSubtensor1_dev20, self).perform(node, inp, out)
def c_code_cache_version(self): def c_code_cache_version(self):
return (12,) return (13,)
def c_headers(self): def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray_helper.h>', return ['<numpy_compat.h>', '<gpuarray_helper.h>',
...@@ -854,7 +881,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC, ...@@ -854,7 +881,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
return [os.path.dirname(__file__)] return [os.path.dirname(__file__)]
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
ctx = self.get_params(node) ctx = self.get_params(node).context
if ctx.kind != b'cuda': if ctx.kind != b'cuda':
raise NotImplementedError("cuda only") raise NotImplementedError("cuda only")
if (node.inputs[0].ndim != node.inputs[1].ndim or if (node.inputs[0].ndim != node.inputs[1].ndim or
...@@ -862,16 +889,9 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC, ...@@ -862,16 +889,9 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
int(ctx.bin_id[-2]) < 2): int(ctx.bin_id[-2]) < 2):
raise NotImplementedError("This case does not have C code yet.") raise NotImplementedError("This case does not have C code yet.")
x = inputs[0]
y = inputs[1]
ind = inputs[2]
out = outputs[0]
fail = sub['fail']
set_instead_of_inc = int(self.set_instead_of_inc)
inplace = int(self.inplace)
return """ return """
int err; int err;
if (%(inplace)s) { if (%(params)s->inplace) {
Py_XDECREF(%(out)s); Py_XDECREF(%(out)s);
%(out)s = %(x)s; %(out)s = %(x)s;
Py_INCREF(%(out)s); Py_INCREF(%(out)s);
...@@ -882,25 +902,19 @@ if (!%(out)s) { ...@@ -882,25 +902,19 @@ if (!%(out)s) {
// Exception already set // Exception already set
%(fail)s %(fail)s
} }
if (GpuArray_vector_add_fast(%(out)s, %(y)s, %(ind)s, %(set_instead_of_inc)s)) { if (GpuArray_vector_add_fast(%(out)s, %(y)s, %(ind)s, %(params)s->set_instead_of_inc)) {
%(fail)s %(fail)s
} }
""" % locals() """ % dict(x=inputs[0], y=inputs[1], ind=inputs[2], out=outputs[0], fail=sub['fail'], params=sub['params'])
def gpu_kernels(self, node, nodename): def gpu_kernels(self, node, nodename):
dtype_x = node.inputs[0].dtype dtype_x = node.inputs[0].dtype
dtype_y = node.inputs[1].dtype dtype_y = node.inputs[1].dtype
dtype_ind = node.inputs[2].dtype dtype_ind = node.inputs[2].dtype
dtype_out = node.outputs[0].dtype
itemsize_x = np.dtype(dtype_x).itemsize
itemsize_y = np.dtype(dtype_y).itemsize
itemsize_ind = np.dtype(dtype_ind).itemsize
itemsize_out = np.dtype(dtype_out).itemsize
flags = Kernel.get_flags(dtype_x, dtype_y, dtype_ind)
type_x = gpuarray.dtype_to_ctype(dtype_x) type_x = gpuarray.dtype_to_ctype(dtype_x)
type_y = gpuarray.dtype_to_ctype(dtype_y) type_y = gpuarray.dtype_to_ctype(dtype_y)
type_ind = gpuarray.dtype_to_ctype(dtype_ind) type_ind = gpuarray.dtype_to_ctype(dtype_ind)
type_out = gpuarray.dtype_to_ctype(dtype_out) flags = Kernel.get_flags(dtype_x, dtype_y, dtype_ind)
kname = "k_vector_add_fast" kname = "k_vector_add_fast"
k_var = "k_vector_add_fast_" + nodename k_var = "k_vector_add_fast_" + nodename
code = """ code = """
...@@ -1010,7 +1024,7 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) { ...@@ -1010,7 +1024,7 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
} }
return; return;
} }
""" % locals() """ % dict(type_x=type_x, type_y=type_y, type_ind=type_ind)
params = [ params = [
'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp', 'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp',
'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp', 'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp',
...@@ -1020,26 +1034,19 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) { ...@@ -1020,26 +1034,19 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
flags=flags, objvar=k_var)] flags=flags, objvar=k_var)]
def c_support_code_struct(self, node, nodename): def c_support_code_struct(self, node, nodename):
dtype_x = node.inputs[0].dtype
dtype_y = node.inputs[1].dtype
dtype_ind = node.inputs[2].dtype
dtype_out = node.outputs[0].dtype
itemsize_x = np.dtype(dtype_x).itemsize
itemsize_y = np.dtype(dtype_y).itemsize
itemsize_ind = np.dtype(dtype_ind).itemsize
itemsize_out = np.dtype(dtype_out).itemsize
k_var = "k_vector_add_fast_" + nodename
return super(GpuAdvancedIncSubtensor1_dev20, self).c_support_code_struct(node, nodename) + """ return super(GpuAdvancedIncSubtensor1_dev20, self).c_support_code_struct(node, nodename) + """
int GpuArray_vector_add_fast(PyGpuArrayObject* py_self, int GpuArray_vector_add_fast(PyGpuArrayObject* py_self,
PyGpuArrayObject* py_other, PyGpuArrayObject* py_other,
PyGpuArrayObject *indices_arr, PyGpuArrayObject* indices_arr,
const int set_instead_of_inc) const int set_instead_of_inc)
{ {
size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(py_self)[1], (size_t)256), 1, 1}; size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(py_self)[1], (size_t)256), 1, 1};
size_t n_blocks[3] = {std::min(PyGpuArray_SIZE(indices_arr), (size_t)4096), 1, 1}; size_t n_blocks[3] = {std::min(PyGpuArray_SIZE(indices_arr), (size_t)4096), 1, 1};
gpudata *errbuf; gpudata *errbuf;
int err, kerr = 0; int err, kerr = 0;
size_t itemsize_x = GpuArray_ITEMSIZE(&py_self->ga);
size_t itemsize_y = GpuArray_ITEMSIZE(&py_other->ga);
size_t itemsize_ind = GpuArray_ITEMSIZE(&indices_arr->ga);
if (threads_per_block[0] > 0 && n_blocks[0] > 0) { if (threads_per_block[0] > 0 && n_blocks[0] > 0) {
err = gpudata_property(py_self->ga.data, err = gpudata_property(py_self->ga.data,
...@@ -1049,11 +1056,11 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) { ...@@ -1049,11 +1056,11 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
return 1; return 1;
} }
ssize_t stride_X0 = PyGpuArray_STRIDES(py_self)[0] / %(itemsize_x)s; ssize_t stride_X0 = PyGpuArray_STRIDES(py_self)[0] / itemsize_x;
ssize_t stride_X1 = PyGpuArray_STRIDES(py_self)[1] / %(itemsize_x)s; ssize_t stride_X1 = PyGpuArray_STRIDES(py_self)[1] / itemsize_x;
ssize_t stride_Y0 = PyGpuArray_DIMS(py_other)[0] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[0] / %(itemsize_y)s; ssize_t stride_Y0 = PyGpuArray_DIMS(py_other)[0] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[0] / itemsize_y;
ssize_t stride_Y1 = PyGpuArray_DIMS(py_other)[1] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[1] / %(itemsize_y)s; ssize_t stride_Y1 = PyGpuArray_DIMS(py_other)[1] == 1 ? 0 : PyGpuArray_STRIDES(py_other)[1] / itemsize_y;
ssize_t stride_ind = PyGpuArray_STRIDES(indices_arr)[0] / %(itemsize_ind)s; ssize_t stride_ind = PyGpuArray_STRIDES(indices_arr)[0] / itemsize_ind;
void *kernel_params[] = {(void *)&PyGpuArray_DIMS(py_self)[0], void *kernel_params[] = {(void *)&PyGpuArray_DIMS(py_self)[0],
(void *)&PyGpuArray_DIMS(py_self)[1], (void *)&PyGpuArray_DIMS(py_self)[1],
(void *)&stride_X0, (void *)&stride_X0,
...@@ -1093,7 +1100,7 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) { ...@@ -1093,7 +1100,7 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
} }
return 0; return 0;
} }
""" % locals() """ % dict(k_var="k_vector_add_fast_" + nodename)
class GpuExtractDiag(Op): class GpuExtractDiag(Op):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论