Commit 3929a557, authored by Ying Zhang, committed by Arnaud Bergeron

GpuAdvancedSubtensor

Parent: 58e93f9b
......@@ -52,6 +52,7 @@ from .nnet import (gpu_crossentropy_softmax_1hot_with_bias_dx,
from .elemwise import (GpuElemwise, GpuDimShuffle, GpuCAReduceCuda,
GpuCAReduceCPY, gpu_ca_reduce_cuda)
from .subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedSubtensor,
GpuAdvancedSubtensor1,
GpuAdvancedIncSubtensor1,
GpuAdvancedIncSubtensor1_dev20)
......@@ -975,6 +976,12 @@ def local_gpua_advanced_subtensor(op, context_name, inputs, outputs):
return GpuAdvancedSubtensor1()
@register_opt('fast_compile')
@op_lifter([tensor.AdvancedSubtensor])
def local_gpua_advanced_subtensor_(node, context_name):
    """Lift a CPU AdvancedSubtensor node to GpuAdvancedSubtensor."""
    # op_lifter handles moving the inputs to the GPU context; we only
    # need to return the replacement op.
    return GpuAdvancedSubtensor()
@register_opt('fast_compile')
@op_lifter([tensor.AdvancedIncSubtensor1])
@register_opt2([tensor.AdvancedIncSubtensor1], 'fast_compile')
......
......@@ -472,7 +472,122 @@ if (err != GA_NO_ERROR) {
return (0,)
class GpuAdvancedIncSubtensor1(Op):
class GpuAdvancedSubtensor(HideC, tensor.AdvancedSubtensor):
    """
    AdvancedSubtensor on the GPU.

    Implements NumPy-style advanced (fancy) indexing for GPU arrays by
    grouping the array-index axes together, collapsing them into one
    axis, and resolving all array indices with a single flat ``take``
    (see the numbered steps in ``perform``).
    """

    def make_node(self, x, *inputs):
        # Reuse the CPU op's make_node for index validation and output
        # broadcastable inference, then move x to the GPU and retype the
        # output as a GpuArrayType on the same context.
        ctx_name = infer_context_name(x)
        rval = tensor.AdvancedSubtensor.make_node(self, x, *inputs)
        otype = GpuArrayType(dtype=rval.outputs[0].type.dtype,
                             broadcastable=rval.outputs[0].type.broadcastable,
                             context_name=ctx_name)
        x = as_gpuarray_variable(x, ctx_name)
        return gof.Apply(self, [x] + rval.inputs[1:], [otype()])

    def perform(self, node, inputs, out_):
        # NOTE(review): this method mixes numpy.ndarray isinstance checks
        # with tensor.prod/tensor.reshape/tensor.sum calls -- presumably
        # these evaluate eagerly on the runtime values here; confirm.
        out, = out_
        x = inputs[0]
        idx = inputs[1:]
        assert len(idx) >= x.ndim
        dims = len(idx)
        # step 1: find the first (start) and one-past-last (end)
        # positions of the ndarray (advanced) indices
        for k, i in enumerate(idx):
            if isinstance(i, numpy.ndarray):
                start = k
                break
        for k, i in enumerate(idx[::-1]):
            if isinstance(i, numpy.ndarray):
                end = len(idx) - k
                break

        # step 2: transpose
        def get_indices(a, b, ind):
            """
            Get real indices for a list of indices.

            Returns the dimshuffle pattern that groups the array-index
            axes together, the reordered index list, and the end of the
            array-index span after reordering (``valid_end``).
            """
            dimshuffle_info = []
            new_ind = []
            k = 0
            # axes before the first array index: keep slices; turn a
            # None (newaxis) into a broadcastable 'x' axis
            for i in range(0, a):
                if isinstance(ind[i], slice):
                    dimshuffle_info.append(k)
                    new_ind.append(ind[i])
                    k += 1
                elif ind[i] is None:
                    dimshuffle_info.append('x')
                    new_ind.append(slice(None))
            # the first array index itself
            dimshuffle_info.append(k)
            new_ind.append(ind[a])
            k += 1
            # middle span (between first and last array index): split
            # slices (idx_1 axes / idx_2 indices) from array indices and
            # newaxes (idx_3) so the array indices end up contiguous
            idx_1 = []
            idx_2 = []
            idx_3 = []
            for i in range(a+1, b):
                if isinstance(ind[i], slice):
                    idx_1.append(k)
                    idx_2.append(ind[i])
                    k += 1
                elif ind[i] is None:
                    idx_3.append('x')
                    new_ind.append(slice(None))
                else:
                    idx_3.append(k)
                    new_ind.append(ind[i])
                    k += 1
            valid_end = a + len(idx_3) + 1
            dimshuffle_info.extend(idx_3)
            new_ind += idx_2
            dimshuffle_info.extend(idx_1)
            # axes after the last array index
            for i in range(b, len(ind)):
                if isinstance(ind[i], slice):
                    dimshuffle_info.append(k)
                    new_ind.append(ind[i])
                    k += 1
                elif ind[i] is None:
                    dimshuffle_info.append('x')
                    new_ind.append(slice(None))
            return dimshuffle_info, new_ind, valid_end

        (dimshuffle_idx, new_ind,
         end_) = get_indices(start, end, idx)
        x = x.transpose(*dimshuffle_idx)
        # step 3: partial flattening -- collapse the now-contiguous
        # array-index axes [start_, end_) into a single axis
        start_ = start
        shape = (x.shape[: start_] +
                 (tensor.prod(x.shape[start: end_]),) +
                 x.shape[end_:])
        input_flat = tensor.reshape(x, shape)
        # step 4: build the strides of the collapsed axes, innermost
        # stride (1) first
        strides = [1]
        for i in range(start_, end_-1)[::-1]:
            stride = x.shape[i+1] * strides[-1]
            strides.append(stride)
        # step 5: build the linear indices into input_flat; non-array
        # entries contribute 0 to the offset
        items = [new_ind[i] if isinstance(new_ind[i], numpy.ndarray)
                 else 0 for i in range(start_, end_)]
        new_idx = tensor.sum([i * j for i, j
                              in zip(items, strides[::-1])],
                             axis=0)
        # step 6: advanced slicing
        out_flat = input_flat.take(new_idx.flatten())
        # step 7: reshape into right shape
        out_flat_shp = (x.shape[:start_] +
                        new_idx.shape + x.shape[end_:]).astype('int32')
        o = out_flat.reshape(out_flat_shp,
                             ndim=dims+new_idx.ndim-2)
        # apply any remaining basic (slice) indexing outside the
        # collapsed span
        idx_ = (new_ind[:start_] + [slice(None)] *
                (new_idx.ndim - 2 + end_ - start_) + new_ind[end_:])
        out[0] = o.__getitem__(idx_)
class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1):
"""
Implement AdvancedIncSubtensor1 on the gpu.
......
......@@ -10,9 +10,9 @@ from ..basic_ops import HostFromGpu, GpuFromHost
from ..elemwise import GpuDimShuffle
from ..subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedSubtensor1,
GpuAdvancedSubtensor,
GpuAdvancedIncSubtensor1)
from ..type import gpuarray_shared_constructor
from .config import mode_with_gpu
......@@ -87,3 +87,43 @@ def test_incsub_f16():
rep = xval.copy()
rep[1:] += yval
assert numpy.allclose(rval, rep)
class G_advancedsubtensor(test_subtensor.TestAdvancedSubtensor):
    """Run the generic AdvancedSubtensor test suite against the GPU ops."""

    def shortDescription(self):
        # Make the test runner print the test name rather than a docstring.
        return None

    def __init__(self, name):
        gpu_kwargs = dict(
            shared=gpuarray_shared_constructor,
            sub=GpuAdvancedSubtensor,
            inc_sub=GpuIncSubtensor,
            adv_sub1=GpuAdvancedSubtensor1,
            adv_incsub1=GpuAdvancedIncSubtensor1,
            mode=mode_with_gpu,
            # avoid errors with limited devices
            dtype='float32',
            ignore_topo=(HostFromGpu, GpuFromHost, DeepCopyOp),
        )
        test_subtensor.TestAdvancedSubtensor.__init__(self, name, **gpu_kwargs)
        # GPU opt can't run in fast_compile only.
        self.fast_compile = False
        assert self.sub == GpuAdvancedSubtensor
def test_adv_subtensor():
    """Test the advancedsubtensor on gpu."""
    shp = (2, 3, 4)
    data = numpy.arange(numpy.prod(shp),
                        dtype=theano.config.floatX).reshape(shp)
    i1, i2 = tensor.ivectors('idx1', 'idx2')
    index_list = [i1, slice(0, 2, 1), i2]
    x = gpuarray_shared_constructor(data, name='x')
    fn = theano.function([i1, i2], x[index_list], mode=mode_with_gpu)
    # Exactly one GpuAdvancedSubtensor node must appear in the graph.
    gpu_nodes = [node for node in fn.maker.fgraph.toposort()
                 if isinstance(node.op, GpuAdvancedSubtensor)]
    assert len(gpu_nodes) == 1
    vals1 = [0, 1]
    vals2 = [0, 1]
    got = fn(vals1, vals2)
    expected = data[vals1, slice(0, 2, 1), vals2]
    assert numpy.allclose(got, expected)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论