Commit c0342e58 authored by Arnaud Bergeron

Add GpuAdvancedSubtensor1, based on libgpuarray _take1().

Parent 5ecbbde2
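
For context, AdvancedSubtensor1 is Theano's integer-vector indexing op (x[ilist]); this commit gives it a GPU implementation that dispatches to libgpuarray's take1 kernel. A minimal NumPy sketch of the semantics the new op must reproduce (the array values here are illustrative, not part of the patch):

import numpy as np

# AdvancedSubtensor1 semantics: out[i] = x[ilist[i]], copying whole
# rows when x has more than one dimension. Repeated indices are allowed.
x = np.arange(12, dtype='float32').reshape(4, 3)
ilist = np.asarray([3, 0, 0, 2], dtype='int64')  # int64, as make_node enforces

out = x[ilist]  # equivalent to np.take(x, ilist, axis=0)
assert out.shape == (len(ilist),) + x.shape[1:]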
@@ -35,6 +35,7 @@ from .nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias,
 from .elemwise import (GpuElemwise, GpuDimShuffle, GpuCAReduceCuda,
                        GpuCAReduceCPY)
 from .subtensor import (GpuIncSubtensor, GpuSubtensor,
+                        GpuAdvancedSubtensor1,
                         GpuAdvancedIncSubtensor1,
                         GpuAdvancedIncSubtensor1_dev20)
@@ -488,6 +489,12 @@ def local_gpua_incsubtensor(node):
                            node.op.destroyhandler_tolerate_aliased)


+@register_opt('fast_compile')
+@op_lifter([tensor.AdvancedSubtensor1])
+def local_gpua_advanced_subtensor(node):
+    return GpuAdvancedSubtensor1()
+
+
 @register_opt('fast_compile')
 @op_lifter([tensor.AdvancedIncSubtensor1])
 def local_gpua_advanced_incsubtensor(node):
......
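
The two registrations above follow the backend's usual pattern: op_lifter matches the host op on a node and the decorated function returns the GPU op to substitute, while register_opt adds the rewrite to the gpuarray optimizer, including its 'fast_compile' phase. A hedged sketch of the effect at compilation time, assuming a configured GPU context (the toy shapes are illustrative):

import numpy as np
import theano
import theano.tensor as T

# With the rewrite registered, fancy indexing of a GPU-resident shared
# variable should compile to a GpuAdvancedSubtensor1 node rather than
# copying back to the host for the CPU op.
x = theano.shared(np.random.rand(5, 4).astype('float32'))
i = T.lvector('i')
f = theano.function([i], x[i])

print(f.maker.fgraph.toposort())  # expect GpuAdvancedSubtensor1 in the graph
print(f(np.asarray([4, 0, 0], dtype='int64')))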
@@ -405,6 +405,77 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
         return parent_version + elemwise_version + (2,)


+class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1):
+    def make_node(self, x, ilist):
+        x_ = as_gpuarray_variable(x)
+        ilist__ = tensor.as_tensor_variable(ilist)
+        if ilist__.type.dtype[:3] not in ('int', 'uin'):
+            raise TypeError('index must be integers')
+        if ilist__.type.dtype != 'int64':
+            ilist__ = tensor.cast(ilist__, 'int64')
+
+        ilist_ = as_gpuarray_variable(ilist__)
+
+        if ilist_.type.dtype != 'int64':
+            raise TypeError('index must be int64')
+        if ilist_.type.ndim != 1:
+            raise TypeError('index must be a vector')
+        if x_.type.ndim == 0:
+            raise TypeError('cannot index into a scalar')
+
+        bcast = ilist_.broadcastable + x_.broadcastable[1:]
+        return gof.Apply(self, [x_, ilist_],
+                         [GpuArrayType(dtype=x.dtype,
+                                       broadcastable=bcast)()])
+
+    def perform(self, node, inp, out_):
+        raise NotImplementedError()
+
+    def c_support_code(self):
+        return """
+int take1_match_dims(GpuArray *a, GpuArray *v) {
+  if (a->nd != v->nd) return 0;
+  for (unsigned int i = 1; i < v->nd; i++) {
+    if (a->dimensions[i] != v->dimensions[i]) return 0;
+  }
+  return 1;
+}
+"""
+
+    def c_code(self, node, name, inputs, outputs, sub):
+        return """
+int err;
+if (%(out)s == NULL || !GpuArray_IS_C_CONTIGUOUS(&%(out)s->ga) ||
+    %(out)s->ga.dimensions[0] != %(idx)s->ga.dimensions[0] ||
+    !take1_match_dims(&%(out)s->ga, &%(v)s->ga)) {
+  size_t tmp;
+  Py_XDECREF(%(out)s);
+  /* This is a dirty hack to avoid an extra alloc */
+  tmp = %(v)s->ga.dimensions[0];
+  %(v)s->ga.dimensions[0] = %(idx)s->ga.dimensions[0];
+  %(out)s = pygpu_empty(%(v)s->ga.nd, %(v)s->ga.dimensions, %(v)s->ga.typecode,
+                        GA_C_ORDER, %(v)s->context, Py_None);
+  %(v)s->ga.dimensions[0] = tmp; // Don't remove this line
+}
+err = GpuArray_take1(&%(out)s->ga, &%(v)s->ga, &%(idx)s->ga, 1);
+if (err != GA_NO_ERROR) {
+  if (err == GA_VALUE_ERROR) {
+    PyErr_SetString(PyExc_IndexError, "Index out of bounds.");
+  } else {
+    PyErr_SetString(PyExc_RuntimeError, Gpu_error(%(v)s->context->ops,
+                                                  %(v)s->context->ctx, err));
+  }
+  %(fail)s
+}
+""" % dict(out=outputs[0], v=inputs[0], idx=inputs[1], fail=sub['fail'])
+
+    def c_code_cache_version(self):
+        return (0,)
+
+
 class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1):
     """
     Implement AdvancedIncSubtensor1 on the gpu.
......
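
Two details of the c_code above are worth spelling out. First, the allocation "dirty hack": the desired output shape is idx's length followed by v's trailing dimensions, so rather than building a separate dims buffer the code temporarily overwrites v->ga.dimensions[0], calls pygpu_empty, and then restores it. Second, GpuArray_take1 is called with 1 as its last argument; given the GA_VALUE_ERROR branch, this enables the index bounds check that is surfaced to Python as an IndexError. A host-side reference sketch of what the kernel computes (whether negative indices are accepted is a property of the kernel, not asserted here):

import numpy as np

def take1_reference(v, idx):
    # out keeps v's trailing dimensions but takes its leading dimension
    # from idx; out-of-range indices raise, matching the
    # GA_VALUE_ERROR -> IndexError mapping in the c_code.
    out = np.empty((idx.shape[0],) + v.shape[1:], dtype=v.dtype)
    for i, j in enumerate(idx):
        if j < 0 or j >= v.shape[0]:
            raise IndexError("Index out of bounds.")
        out[i] = v[j]
    return out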
@@ -7,6 +7,7 @@ from theano.tensor.tests import test_subtensor
 from ..basic_ops import HostFromGpu, GpuFromHost
 from ..subtensor import (GpuIncSubtensor, GpuSubtensor,
+                         GpuAdvancedSubtensor1,
                          GpuAdvancedIncSubtensor1)
 from ..type import gpuarray_shared_constructor
@@ -24,6 +25,7 @@ class G_subtensor(test_subtensor.T_subtensor):
             shared=gpuarray_shared_constructor,
             sub=GpuSubtensor,
             inc_sub=GpuIncSubtensor,
+            adv_sub1=GpuAdvancedSubtensor1,
             adv_incsub1=GpuAdvancedIncSubtensor1,
             mode=mode_with_gpu,
             # avoid errors with limited devices
......
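
The test change is small because G_subtensor inherits the stock T_subtensor cases and only needs to be told which op class implements adv_sub1 on the GPU. Illustrative of the kind of property the inherited cases assert (gpu_fn stands in for a theano.function compiled with mode_with_gpu; the helper name is hypothetical):

import numpy as np

def check_adv_sub1(gpu_fn, x):
    # The GPU result must match NumPy fancy indexing for assorted
    # patterns, including repeated and empty index lists.
    for raw in ([0], [2, 1], [0, 0, 3], []):
        idx = np.asarray(raw, dtype='int64')
        assert np.allclose(gpu_fn(idx), x[idx])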