Commit d2cd02d0 authored by Shawn Tan

Initial additions for `GpuAdvancedIncSubtensor`

- `theano/gpuarray/opt.py` added optimisation to include new op - `theano/gpuarray/subtensor.py` added op - `theano/gpuarray/tests/test_subtensor.py` added simple test case
Parent 3796417a
...@@ -68,6 +68,7 @@ from .elemwise import (GpuElemwise, GpuDimShuffle, GpuCAReduceCuda, ...@@ -68,6 +68,7 @@ from .elemwise import (GpuElemwise, GpuDimShuffle, GpuCAReduceCuda,
from .subtensor import (GpuIncSubtensor, GpuSubtensor, from .subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedSubtensor, GpuAdvancedSubtensor,
GpuAdvancedSubtensor1, GpuAdvancedSubtensor1,
GpuAdvancedIncSubtensor,
GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1,
GpuAdvancedIncSubtensor1_dev20) GpuAdvancedIncSubtensor1_dev20)
from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims
...@@ -1064,32 +1065,36 @@ def local_gpua_advanced_subtensor(op, context_name, inputs, outputs): ...@@ -1064,32 +1065,36 @@ def local_gpua_advanced_subtensor(op, context_name, inputs, outputs):
@register_opt('fast_compile')
@op_lifter([tensor.AdvancedIncSubtensor1, tensor.AdvancedIncSubtensor])
@register_opt2([tensor.AdvancedIncSubtensor1, tensor.AdvancedIncSubtensor], 'fast_compile')
def local_gpua_advanced_incsubtensor(op, context_name, inputs, outputs):
    """
    Lift AdvancedIncSubtensor1/AdvancedIncSubtensor to the GPU.

    For AdvancedIncSubtensor1, picks between GpuAdvancedIncSubtensor1 and
    the dev20 kernel based on compute capability and operand rank; only
    enabled on cuda contexts.  For the generic AdvancedIncSubtensor, the
    GpuAdvancedIncSubtensor op is used unconditionally.
    """
    if isinstance(op, tensor.AdvancedIncSubtensor1):
        context = get_context(context_name)
        # This is disabled on non-cuda contexts
        if context.kind != b'cuda':
            return None

        x, y, ilist = inputs
        set_instead_of_inc = op.set_instead_of_inc
        # NOTE(review): assumes bin_id's second-to-last byte is the major
        # compute capability digit -- confirm against gpuarray context docs.
        compute_capability = int(context.bin_id[-2])

        if compute_capability >= 2 and x.ndim == 1 and y.ndim == 0:
            # The dev20 kernel wants 2d operands: lift to 2d, apply the
            # op, then drop the extra broadcast dimension again.
            x = x.dimshuffle(0, 'x')
            y = y.dimshuffle('x', 'x')
            ret = GpuAdvancedIncSubtensor1_dev20(
                set_instead_of_inc=set_instead_of_inc)(x, y, ilist)
            ret = GpuDimShuffle(ret.type.broadcastable, [0])(ret)
            return ret
        elif compute_capability < 2 or x.ndim != 2 or y.ndim != 2:
            # Fallback op for old devices or non-2d operands.
            return GpuAdvancedIncSubtensor1(
                set_instead_of_inc=set_instead_of_inc)
        else:
            return GpuAdvancedIncSubtensor1_dev20(
                set_instead_of_inc=set_instead_of_inc)
    elif isinstance(op, tensor.AdvancedIncSubtensor):
        # Generic advanced inc_subtensor: works on any gpuarray context.
        return GpuAdvancedIncSubtensor()
@register_inplace() @register_inplace()
......
...@@ -587,6 +587,101 @@ class GpuAdvancedSubtensor(HideC, tensor.AdvancedSubtensor): ...@@ -587,6 +587,101 @@ class GpuAdvancedSubtensor(HideC, tensor.AdvancedSubtensor):
out[0] = o out[0] = o
class GpuAdvancedIncSubtensor(HideC, tensor.AdvancedIncSubtensor):
    """
    Implement AdvancedIncSubtensor on the gpu.

    Computes ``x[idx...] += y`` (or ``= y`` -- see the note on
    ``set_instead_of_inc`` below) for general NumPy-style advanced
    indexing, by looping over the indexed positions with a GPU
    elementwise kernel obtained from ``get_iadd``.
    """

    def make_node(self, x, y, *inputs):
        """
        Build the Apply node.

        Delegates validation/normalization of the indices to the CPU
        parent op, then rebuilds the node with `x` moved to the GPU and a
        GpuArrayType output of the same dtype/broadcastable pattern.
        """
        ctx_name = infer_context_name(x)
        rval = tensor.AdvancedIncSubtensor.make_node(self, x, y, *inputs)
        otype = GpuArrayType(dtype=rval.outputs[0].type.dtype,
                             broadcastable=rval.outputs[0].type.broadcastable,
                             context_name=ctx_name)
        x = as_gpuarray_variable(x, ctx_name)
        return gof.Apply(self, [x] + rval.inputs[1:], [otype()])

    # We can't use the parent version that loops on each index
    # as we also need to loop when set_instead_of_inc is True and the
    # parent doesn't loop in that case.
    def perform(self, node, inp, out_, ctx=None):
        out, = out_
        x = inp[0]
        y = inp[1]
        idx = inp[2:]
        # Copy so the caller's input buffer is never modified in place.
        x = x.copy()
        # detect and transpose array indices
        nidx = []
        nshp = list(x.shape)
        for k, i in enumerate(idx):
            if i is None:
                # np.newaxis: keep a full-slice slot and insert a
                # length-1 dimension at the same position.
                nidx.append(slice(None))
                nshp.insert(k, 1)
            else:
                nidx.append(i)
        x_ = x.reshape(nshp)
        narrays = 0
        transp = list(range(x_.ndim))
        p = 0
        # ap gives the position of the array in case there is only one.
        # if there are more than one (narray > 1) it should be ignored.
        ap = 0
        for k, i in enumerate(list(nidx)):
            if (isinstance(i, np.ndarray) and
                    i.ndim != 0):
                # Move every array-indexed dimension to the front; `p`
                # counts how many have been moved so far.
                transp.remove(k)
                transp.insert(p, k)
                ap += k
                i = nidx.pop(k)
                nidx.insert(p, i)
                p += 1
                narrays += 1
            else:
                if narrays == 0:
                    try:
                        # Scalar (integer-like) index seen before any
                        # array index: it removes a dimension.
                        i.__index__()
                        # We shift back the position of the array by the
                        # number of dimensions that are removed by
                        # indexing. If ap is bigger than 0 it means we
                        # have encountered at least one array.
                        if ap >= 0:
                            ap -= 1
                        # If this index is before the first array then
                        # we will not move the array back to its
                        # position. Mark this by faking that there
                        # are more than two arrays. This is crazy
                        # numpy behaviour so blame them.
                        narrays = 2
                    except Exception:
                        # Not integer-like (e.g. a slice): leave
                        # counters untouched.
                        pass
        x_ = x_.transpose(*transp)
        # Apply the remaining (non-array) indices; the first `p`
        # dimensions are the array-indexed ones and are kept whole.
        idx_ = ([slice(None)] * p + nidx[p:])
        x_ = x_.__getitem__(idx_)
        # flatten the array-indexed dimensions
        shape = ((np.prod(x_.shape[0: p]),) +
                 x_.shape[p:])
        x_flat = x_.reshape(shape)
        # build the strides
        strides = [1]
        for i in range(p - 1, 0, -1):
            stride = x_.shape[i] * strides[0]
            strides.insert(0, stride)
        # build the indices and use it
        # zip truncates to the `p` leading array indices of nidx.
        take_idx = sum((i * s for i, s in zip(nidx, strides)))
        k = get_iadd(node.inputs[0], node.inputs[1])
        y = pygpu.asarray(y, context=x_flat.context)
        # NOTE(review): assumes y's leading dim matches (or broadcasts
        # against) the number of indexed positions -- confirm.  Also
        # appears to always increment; set_instead_of_inc does not seem
        # to be honored here -- verify against the CPU op.
        for j, i in enumerate(take_idx):
            k(x_flat[i], y[j], broadcast=True)
        out[0] = x
class GpuAdvancedIncSubtensor1(Op): class GpuAdvancedIncSubtensor1(Op):
""" """
Implement AdvancedIncSubtensor1 on the gpu. Implement AdvancedIncSubtensor1 on the gpu.
......
...@@ -13,6 +13,7 @@ from ..elemwise import GpuDimShuffle ...@@ -13,6 +13,7 @@ from ..elemwise import GpuDimShuffle
from ..subtensor import (GpuIncSubtensor, GpuSubtensor, from ..subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedSubtensor1, GpuAdvancedSubtensor1,
GpuAdvancedSubtensor, GpuAdvancedSubtensor,
GpuAdvancedIncSubtensor,
GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1,
GpuAdvancedIncSubtensor1_dev20, GpuAdvancedIncSubtensor1_dev20,
GpuExtractDiag, GpuExtractDiag,
...@@ -76,6 +77,27 @@ class G_subtensorF16(test_subtensor.T_subtensor): ...@@ -76,6 +77,27 @@ class G_subtensorF16(test_subtensor.T_subtensor):
assert self.sub == GpuSubtensor assert self.sub == GpuSubtensor
def test_advinc_subtensor():
    # Increment a shared (3, 3, 3) tensor at array-indexed positions and
    # verify both that the optimizer introduced exactly one
    # GpuAdvancedIncSubtensor node and that the numeric result matches
    # the equivalent NumPy in-place update.
    shape = (3, 3, 3)
    x_data = np.arange(np.prod(shape), dtype='float32').reshape(shape) + 1
    y_data = np.arange(np.prod(shape[1:]), dtype='float32').reshape(shape[1:])
    indices = ([0, 1, 2], [0, 1, 2])
    x_shared = gpuarray_shared_constructor(x_data, name='x')
    y_var = tensor.tensor(dtype='float32',
                          broadcastable=(False, False),
                          name='y')
    graph = tensor.advanced_inc_subtensor(x_shared, y_var, *indices)
    fn = theano.function([y_var], graph, mode=mode_with_gpu)
    n_gpu_ops = sum(isinstance(node.op, GpuAdvancedIncSubtensor)
                    for node in fn.maker.fgraph.toposort())
    assert n_gpu_ops == 1
    result = fn(y_data)
    expected = x_data.copy()
    expected[indices] += y_data
    assert np.allclose(result, expected)
def test_advinc_subtensor1(): def test_advinc_subtensor1():
# Test the second case in the opt local_gpu_advanced_incsubtensor1 # Test the second case in the opt local_gpu_advanced_incsubtensor1
for shp in [(3, 3), (3, 3, 3)]: for shp in [(3, 3), (3, 3, 3)]:
...@@ -199,6 +221,7 @@ class G_advancedsubtensor(test_subtensor.TestAdvancedSubtensor): ...@@ -199,6 +221,7 @@ class G_advancedsubtensor(test_subtensor.TestAdvancedSubtensor):
self, name, self, name,
shared=gpuarray_shared_constructor, shared=gpuarray_shared_constructor,
sub=GpuAdvancedSubtensor, sub=GpuAdvancedSubtensor,
inc_sub=GpuAdvancedIncSubtensor,
mode=mode_with_gpu, mode=mode_with_gpu,
# avoid errors with limited devices # avoid errors with limited devices
dtype='float32', # floatX? dtype='float32', # floatX?
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论