Commit 542aa729 authored by Arnaud Bergeron

Add C implementation of GpuAdvancedIncSubtensor1.

Parent 5f0a8939
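
For context: AdvancedIncSubtensor1 updates the rows of a tensor x selected by an integer vector ilist with the rows of y, either incrementing them (duplicate indices accumulate) or, when set_instead_of_inc=True, overwriting them. A rough NumPy sketch of these semantics, for illustration only (the function name below is made up and is not part of the commit):

    import numpy as np

    def adv_inc_subtensor1(x, y, ilist, set_instead_of_inc=False, inplace=False):
        # Rough host-side model of GpuAdvancedIncSubtensor1's semantics.
        out = x if inplace else x.copy()
        if set_instead_of_inc:
            out[ilist] = y            # later duplicate indices overwrite earlier ones
        else:
            np.add.at(out, ilist, y)  # duplicates in ilist accumulate, unlike out[ilist] += y
        return out

    x = np.zeros((4, 3))
    y = np.ones((3, 3))
    print(adv_inc_subtensor1(x, y, [0, 2, 2]))  # row 2 receives two increments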
@@ -6,7 +6,7 @@ import numpy
 from six import integer_types
 from six.moves import StringIO
-from theano import tensor, gof
+from theano import tensor, gof, Op
 from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list
 try:
@@ -464,11 +464,25 @@ if (err != GA_NO_ERROR) {
         return (0,)
 
-class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1):
+class GpuAdvancedIncSubtensor1(Op):
     """
     Implement AdvancedIncSubtensor1 on the gpu.
     """
+    __props__ = ('inplace', 'set_instead_of_inc')
+    params_type = gpu_context_type
+
+    def __init__(self, inplace=False, set_instead_of_inc=False):
+        self.inplace = inplace
+        self.set_instead_of_inc = set_instead_of_inc
+        if inplace:
+            self.destroy_map = {0: [0]}
+
+    def clone_inplace(self):
+        return self.__class__(
+            inplace=True,
+            set_instead_of_inc=self.set_instead_of_inc)
+
     def make_node(self, x, y, ilist):
         ctx_name = infer_context_name(x, y)
         x_ = as_gpuarray_variable(x, ctx_name)
@@ -496,10 +510,13 @@ class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1):
         return gof.Apply(self, [x_, y_, ilist_], [x_.type()])
 
+    def get_params(self, node):
+        return node.outputs[0].type.context
+
     # We can't use the parent version that loops on each index
     # as we also need to loop when set_instead_of_inc is True and the
     # parent doesn't loop in that case.
-    def perform(self, node, inp, out_):
+    def perform(self, node, inp, out_, ctx=None):
         # TODO opt to make this inplace
         x, y, idx = inp
         out, = out_
@@ -512,8 +529,9 @@ class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1):
         if len(idx) == 0:
             return
 
-        # Make sure idx is not a GpuArray otherwise we cannot use its content
-        # to index x and y
+        # Make sure idx is not a GpuArray otherwise we cannot use its
+        # content to index x and y (This is because we serve as
+        # fallback for _dev20).
         if isinstance(idx, gpuarray.GpuArray):
            idx = numpy.asarray(idx)
@@ -545,8 +563,115 @@ class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1):
             for i in idx:
                 k(x[i], reshaped_y, broadcast=True)
 
+    def c_headers(self):
+        return ['<numpy_compat.h>', '<gpuarray/error.h>', '<gpuarray/array.h>',
+                '<gpuarray/elemwise.h>', 'gpuarray_helper.h']
+
+    def c_header_dirs(self):
+        return [os.path.dirname(__file__)]
+
+    def c_support_code_struct(self, node, nodename):
+        return "\nGpuElemwise *iadd;\n"
+
+    def c_init_code_struct(self, node, name, sub):
+        return """
+        gpuelemwise_arg args[2] = {{0}};
+        args[0].name = "a";
+        args[0].typecode = %(type1)s;
+        args[0].flags = GE_READ|GE_WRITE;
+        args[1].name = "b";
+        args[1].typecode = %(type2)s;
+        args[1].flags = GE_READ;
+        iadd = GpuElemwise_new(%(ctx)s->ctx, "", "a += b",
+                               2, args, %(nd)s, 0);
+        if (iadd == NULL) {
+          PyErr_SetString(PyExc_RuntimeError, "Could not intialize inplace add support");
+          %(fail)s
+        }
+        """ % dict(ctx=sub['params'], fail=sub['fail'],
+                   type1=node.inputs[0].type.typecode,
+                   type2=node.inputs[1].type.typecode,
+                   nd=node.inputs[1].ndim)
+
+    def c_code(self, node, name, inputs, outputs, sub):
+        if (node.inputs[0].ndim != node.inputs[1].ndim):
+            raise NotImplementedError("This case does not have C code yet.")
+
+        return """
+        PyGpuArrayObject *row_x, *row_y;
+        ssize_t start[%(nd)s], step[%(nd)s];
+        size_t num_indices, j;
+        int ret;
+        int broadcast_y;
+
+        for (j = 0; j < %(nd)s; j++) {
+          start[j] = 0;
+          step[j] = 1;
+        }
+        step[0] = 0;
+        num_indices = PyArray_SIZE(%(ind)s);
+        if ((num_indices - 1) > LONG_MAX) {
+          PyErr_Format(PyExc_AssertionError,
+                       "num_indices %%lld exceeds LONG_MAX + 1", (long long)num_indices);
+          %(fail)s
+        }
+        if (!%(inplace)s) {
+          %(out)s = theano_try_copy(%(out)s, %(x)s);
+          if (%(out)s == NULL)
+            %(fail)s
+        } else {
+          Py_XDECREF(%(out)s);
+          %(out)s = %(x)s;
+          Py_INCREF(%(out)s);
+        }
+        broadcast_y = PyGpuArray_DIM(%(y)s, 0) == 1;
+        for (j = 0; j < num_indices; j++) {
+          start[0] = *(dtype_%(ind)s *)PyArray_GETPTR1(%(ind)s, j);
+          if (start[0] < 0)
+            start[0] += PyGpuArray_DIM(%(out)s, 0);
+          if (start[0] < 0 || start[0] >= PyGpuArray_DIM(%(out)s, 0)) {
+            PyErr_SetString(PyExc_IndexError, "index out of bounds");
+            %(fail)s;
+          }
+          row_x = pygpu_index(%(out)s, start, (ssize_t *)PyGpuArray_DIMS(%(out)s), step);
+          if (row_x == NULL)
+            %(fail)s;
+          if (broadcast_y)
+            start[0] = 0;
+          else
+            start[0] = j;
+          row_y = pygpu_index(%(y)s, start, (ssize_t *)PyGpuArray_DIMS(%(y)s), step);
+          if (row_y == NULL) {
+            Py_DECREF(row_x);
+            %(fail)s;
+          }
+          if (%(set_instead_of_inc)s) {
+            ret = GpuArray_setarray(&row_x->ga, &row_y->ga);
+          } else {
+            void *args[2];
+            args[0] = (void *)&row_x->ga;
+            args[1] = (void *)&row_y->ga;
+            ret = GpuElemwise_call(iadd, args, GE_BROADCAST);
+          }
+          Py_DECREF(row_x);
+          Py_DECREF(row_y);
+          if (ret != GA_NO_ERROR)
+            PyErr_SetString(PyExc_RuntimeError, "Failed to set/inc elements");
+        }
+        """ % dict(x=inputs[0], y=inputs[1], ind=inputs[2], out=outputs[0],
+                   fail=sub['fail'], inplace=int(self.inplace),
+                   nd=node.inputs[0].ndim,
+                   set_instead_of_inc=int(self.set_instead_of_inc))
+
+    def c_code_cache_version(self):
+        return ()
+
-class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, GpuAdvancedIncSubtensor1):
+class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
+                                     GpuAdvancedIncSubtensor1):
     """
     Implement AdvancedIncSubtensor1 on the gpu, but use function
     only avail on compute capability 2.0 and more recent.
@@ -608,7 +733,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, GpuAdvancedIncSubtensor1):
             raise NotImplementedError("cuda only")
         if (node.inputs[0].ndim != node.inputs[1].ndim or
                 node.inputs[0].ndim != 2 or
-                ctx.bin_id[-2] < b'2'):
+                int(ctx.bin_id[-2]) < 2):
             raise NotImplementedError("This case does not have C code yet.")
         x = inputs[0]
...
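
The new c_code above drives the update with a host-side loop over the index array: unless the op runs inplace, the output is first copied from x with theano_try_copy; each index is then wrapped if negative and bounds-checked; a row view of the output (and of y, or of y's single row when y broadcasts) is taken with pygpu_index; and that row is either overwritten with GpuArray_setarray or accumulated with the "a += b" GpuElemwise built in c_init_code_struct. A rough Python rendering of that control flow, for illustration only (NumPy arrays stand in for GPU buffers; the function name is made up):

    import numpy as np

    def c_code_row_loop(x, y, ind, inplace=False, set_instead_of_inc=False):
        # Mirrors the generated C: copy unless inplace, then one update per index.
        out = x if inplace else x.copy()
        broadcast_y = (y.shape[0] == 1)
        for j, i in enumerate(ind):
            if i < 0:
                i += out.shape[0]                   # wrap negative indices
            if not (0 <= i < out.shape[0]):
                raise IndexError("index out of bounds")
            row_y = y[0] if broadcast_y else y[j]   # pygpu_index view of y
            if set_instead_of_inc:
                out[i] = row_y                      # GpuArray_setarray on the row view
            else:
                out[i] += row_y                     # GpuElemwise "a += b" on the row view
        return out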