Commit 4f3a52a5 authored by abergeron, committed by GitHub

Merge pull request #5881 from shawntan/issue-930

Implementing `GpuAdvancedIncSubtensor`
...@@ -951,6 +951,7 @@ class GpuAlloc(HideC, Alloc): ...@@ -951,6 +951,7 @@ class GpuAlloc(HideC, Alloc):
(subtensor.GpuIncSubtensor, (subtensor.GpuIncSubtensor,
subtensor.GpuAdvancedIncSubtensor1, subtensor.GpuAdvancedIncSubtensor1,
subtensor.GpuAdvancedIncSubtensor1_dev20, subtensor.GpuAdvancedIncSubtensor1_dev20,
subtensor.GpuAdvancedIncSubtensor,
blas.GpuGemm, blas.GpuGemv, blas.GpuGemm, blas.GpuGemv,
blas.GpuGer) blas.GpuGer)
)): )):
......
...@@ -68,6 +68,7 @@ from .elemwise import (GpuElemwise, GpuDimShuffle, GpuCAReduceCuda, ...@@ -68,6 +68,7 @@ from .elemwise import (GpuElemwise, GpuDimShuffle, GpuCAReduceCuda,
from .subtensor import (GpuIncSubtensor, GpuSubtensor, from .subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedSubtensor, GpuAdvancedSubtensor,
GpuAdvancedSubtensor1, GpuAdvancedSubtensor1,
GpuAdvancedIncSubtensor,
GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1,
GpuAdvancedIncSubtensor1_dev20) GpuAdvancedIncSubtensor1_dev20)
from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims
...@@ -1066,7 +1067,7 @@ def local_gpua_advanced_subtensor(op, context_name, inputs, outputs): ...@@ -1066,7 +1067,7 @@ def local_gpua_advanced_subtensor(op, context_name, inputs, outputs):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.AdvancedIncSubtensor1]) @op_lifter([tensor.AdvancedIncSubtensor1])
@register_opt2([tensor.AdvancedIncSubtensor1], 'fast_compile') @register_opt2([tensor.AdvancedIncSubtensor1], 'fast_compile')
def local_gpua_advanced_incsubtensor(op, context_name, inputs, outputs): def local_gpua_advanced_incsubtensor1(op, context_name, inputs, outputs):
context = get_context(context_name) context = get_context(context_name)
# This is disabled on non-cuda contexts # This is disabled on non-cuda contexts
if context.kind != b'cuda': if context.kind != b'cuda':
...@@ -1094,6 +1095,16 @@ def local_gpua_advanced_incsubtensor(op, context_name, inputs, outputs): ...@@ -1094,6 +1095,16 @@ def local_gpua_advanced_incsubtensor(op, context_name, inputs, outputs):
set_instead_of_inc=set_instead_of_inc) set_instead_of_inc=set_instead_of_inc)
@register_opt('fast_compile')
@op_lifter([tensor.AdvancedIncSubtensor])
@register_opt2([tensor.AdvancedIncSubtensor], 'fast_compile')
def local_gpua_advanced_incsubtensor(op, context_name, inputs, outputs):
    """Lift ``AdvancedIncSubtensor`` to its GPU counterpart.

    Only the increment variant is handled; when the op sets instead of
    incrementing, refuse the transfer so the CPU op is kept.
    """
    if op.set_instead_of_inc:
        # "set" semantics are not handled by GpuAdvancedIncSubtensor.
        return False
    return GpuAdvancedIncSubtensor()
@register_inplace() @register_inplace()
@local_optimizer([GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1_dev20]) @local_optimizer([GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1_dev20])
def local_advincsub1_gpua_inplace(node): def local_advincsub1_gpua_inplace(node):
......
...@@ -589,6 +589,140 @@ class GpuAdvancedSubtensor(HideC, tensor.AdvancedSubtensor): ...@@ -589,6 +589,140 @@ class GpuAdvancedSubtensor(HideC, tensor.AdvancedSubtensor):
out[0] = o out[0] = o
class GpuAdvancedIncSubtensor(HideC, tensor.AdvancedIncSubtensor):
    """
    Implement AdvancedIncSubtensor on the gpu.

    The fancy-index bookkeeping (axis shuffling, stride computation) is
    done on the host with NumPy; the actual additions run on the GPU
    through ``pygpu`` elemwise kernels.
    """

    def make_node(self, x, y, *inputs):
        # Delegate to the CPU op's make_node for type/broadcast
        # inference, then rebuild the Apply with GPU-typed x, y and
        # output.  The index inputs (rval.inputs[2:]) are left as-is.
        ctx_name = infer_context_name(x, y)
        rval = tensor.AdvancedIncSubtensor.make_node(self, x, y, *inputs)
        otype = GpuArrayType(dtype=rval.outputs[0].type.dtype,
                             broadcastable=rval.outputs[0].type.broadcastable,
                             context_name=ctx_name)
        x = as_gpuarray_variable(x, ctx_name)
        y = as_gpuarray_variable(y, ctx_name)
        return gof.Apply(self, [x, y] + rval.inputs[2:], [otype()])

    def perform(self, node, inp, out_):
        # Computes x[idx] += y out-of-place: x is copied first, the
        # copy is incremented with '+' elemwise kernels, and the result
        # is stored in out[0].
        out, = out_
        x = inp[0]
        y = inp[1]
        idx = inp[2:]
        # Work on a copy so the input storage is never modified.
        x = x.copy()
        # convert all indices to np.array
        for i in range(len(idx)):
            if isinstance(idx[i], gpuarray.GpuArray):
                idx[i] = np.asarray(idx[i])
        # Insert axes for None indexing
        nidx = []
        nshp = list(x.shape)
        for k, i in enumerate(idx):
            if i is None:
                # None (newaxis) becomes a full slice over a length-1
                # axis inserted into the shape.
                nidx.append(slice(None))
                nshp.insert(k, 1)
            else:
                nidx.append(i)
        x_ = x.reshape(nshp)
        # Bring array indices to front
        transp = []
        nidx_ = []
        p = 0
        # First pass collects the array-indexed axes ...
        for k, i in enumerate(list(nidx)):
            if isinstance(i, np.ndarray) and i.ndim != 0:
                transp.append(k)
                nidx_.append(i)
                p += 1
        # ... second pass appends everything else (slices, scalars).
        for k, i in enumerate(list(nidx)):
            if not (isinstance(i, np.ndarray) and i.ndim != 0):
                transp.append(k)
                nidx_.append(i)
        transp = transp + list(range(len(transp), x_.ndim))
        # rtransp is the inverse permutation, used to undo the
        # transposition at the end.
        rtransp = [i for i, _ in sorted(enumerate(transp), key=lambda x:x[1])]
        nidx = nidx_
        # transp: order to shuffle axes of x so that single dimension
        # subarrays are extracted first
        # p: number of axes with array indexing
        x_ = x_.transpose(*transp)
        idx_ = ([slice(None)] * p + nidx[p:])
        # flatten the array-indexed dimensions
        x_flat = x_.reshape((np.prod(x_.shape[0: p]),) + x_.shape[p:])
        # process y so that last axes are the same
        if y.shape != (1,):
            # Keep the longest suffix of y's shape that matches or
            # broadcasts against x_flat's trailing axes; everything in
            # front of it is collapsed into one leading axis.
            y_shape_reverse = []
            for x_s, y_s in zip(x_flat.shape[::-1], y.shape[::-1]):
                if x_s == y_s or y_s == 1:
                    y_shape_reverse.append(y_s)
                else:
                    break
            if np.prod(y_shape_reverse) < np.prod(y.shape):
                if len(y_shape_reverse) > 0:
                    y_shape_reverse.append(
                        int(np.prod(y.shape[0:-len(y_shape_reverse)])))
                else:
                    y_shape_reverse.append(int(np.prod(y.shape)))
            y_shape = y_shape_reverse[::-1]
            y_flat = y.reshape(y_shape)
        else:
            y_flat = y[0]
        # build the strides
        # C-order strides over the p fronted (array-indexed) axes:
        # strides[-1] == 1, strides[k] == prod(x_.shape[k+1:p]).
        strides = [1]
        for i in range(p - 1, 0, -1):
            stride = x_.shape[i] * strides[0]
            strides.insert(0, stride)
        # build the indices and use it
        index = idx_[p:] + [slice(None)] * (len(x_flat.shape) - len(idx_[p:]) - 1)
        # take_idx: the flat positions along x_flat's first axis that
        # the array indices select (zip truncates strides to p terms).
        take_idx = sum(i * s for i, s in zip(nidx, strides))
        if index == []:
            # No trailing slice/scalar indices: add y element-wise into
            # each selected subarray.
            for j, i in enumerate(take_idx.flatten()):
                if y_flat.shape == ():
                    val = y_flat
                else:
                    val = y_flat[j]
                tmp = pygpu.elemwise.elemwise2(
                    x_flat[i], '+', val, x_flat[i],
                    broadcast=True,
                    convert_f16=True
                )
                x_flat.__setitem__(i, tmp)
        else:
            # Cached in-place-add kernel built from the input dtypes.
            k = get_iadd(node.inputs[0], node.inputs[1])
            if x_flat.shape[-len(y_flat.shape):] == y_flat.shape or y_flat.shape == ():
                # y_flat has to be broadcast over axes of x_flat[i]
                for i in take_idx.flatten():
                    if len(idx_[p:]) > 0:
                        x_flat_sub = x_flat[i].__getitem__(index)
                    else:
                        x_flat_sub = x_flat[i]
                    tmp = pygpu.elemwise.elemwise2(
                        x_flat_sub, '+', y_flat, x_flat_sub,
                        broadcast=True,
                        convert_f16=True
                    )
                    x_flat[i].__setitem__(index, tmp)
            else:
                # y_flat's first axis corresponds to the flattened
                # array-index positions of x_flat (wrapped with % when
                # shorter).
                for j, i in enumerate(take_idx.flatten()):
                    if len(idx_[p:]) > 0:
                        x_flat_sub = x_flat[i].__getitem__(index)
                    else:
                        x_flat_sub = x_flat[i]
                    k(x_flat_sub, y_flat[j % y_flat.shape[0]], broadcast=True)
        # Undo the flattening and the axis shuffle before returning.
        x_ = x_flat.reshape(x_.shape).transpose(*rtransp)
        out[0] = x_
class GpuAdvancedIncSubtensor1(Op): class GpuAdvancedIncSubtensor1(Op):
""" """
Implement AdvancedIncSubtensor1 on the gpu. Implement AdvancedIncSubtensor1 on the gpu.
......
...@@ -13,6 +13,7 @@ from ..elemwise import GpuDimShuffle ...@@ -13,6 +13,7 @@ from ..elemwise import GpuDimShuffle
from ..subtensor import (GpuIncSubtensor, GpuSubtensor, from ..subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedSubtensor1, GpuAdvancedSubtensor1,
GpuAdvancedSubtensor, GpuAdvancedSubtensor,
GpuAdvancedIncSubtensor,
GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1,
GpuAdvancedIncSubtensor1_dev20, GpuAdvancedIncSubtensor1_dev20,
GpuExtractDiag, GpuExtractDiag,
...@@ -159,6 +160,7 @@ def test_advinc_subtensor1_vector_scalar(): ...@@ -159,6 +160,7 @@ def test_advinc_subtensor1_vector_scalar():
name='y') name='y')
expr = tensor.advanced_inc_subtensor1(x, y, [0, 2]) expr = tensor.advanced_inc_subtensor1(x, y, [0, 2])
f = theano.function([y], expr, mode=mode_with_gpu) f = theano.function([y], expr, mode=mode_with_gpu)
assert sum([isinstance(node.op, (GpuAdvancedIncSubtensor1_dev20, assert sum([isinstance(node.op, (GpuAdvancedIncSubtensor1_dev20,
GpuAdvancedIncSubtensor1)) GpuAdvancedIncSubtensor1))
for node in f.maker.fgraph.toposort()]) == 1 for node in f.maker.fgraph.toposort()]) == 1
...@@ -222,6 +224,7 @@ class G_advancedsubtensor(test_subtensor.TestAdvancedSubtensor): ...@@ -222,6 +224,7 @@ class G_advancedsubtensor(test_subtensor.TestAdvancedSubtensor):
self, name, self, name,
shared=gpuarray_shared_constructor, shared=gpuarray_shared_constructor,
sub=GpuAdvancedSubtensor, sub=GpuAdvancedSubtensor,
inc_sub=GpuAdvancedIncSubtensor,
mode=mode_with_gpu, mode=mode_with_gpu,
# avoid errors with limited devices # avoid errors with limited devices
dtype='float32', # floatX? dtype='float32', # floatX?
......
...@@ -2482,16 +2482,12 @@ class TestAlloc(unittest.TestCase): ...@@ -2482,16 +2482,12 @@ class TestAlloc(unittest.TestCase):
grad_derp = theano.grad(derp, some_vector) grad_derp = theano.grad(derp, some_vector)
fgrad = theano.function([some_vector], grad_derp, fgrad = theano.function([some_vector], grad_derp,
mode=self.mode) mode=self.mode)
topo_obj = fobj.maker.fgraph.toposort() topo_obj = fobj.maker.fgraph.toposort()
# <= is needed as the GPU currently don't implement
# AdvancedIncSubtensor. When this is the case it can be
# replaced with ==.
assert np.sum([isinstance(node.op, type(alloc_)) assert np.sum([isinstance(node.op, type(alloc_))
for node in topo_obj]) <= 1 for node in topo_obj]) == 0
topo_grad = fgrad.maker.fgraph.toposort()
# print subtensor topo_grad = fgrad.maker.fgraph.toposort()
# theano.printing.debugprint(fgrad)
assert np.sum([isinstance(node.op, type(alloc_)) assert np.sum([isinstance(node.op, type(alloc_))
for node in topo_grad]) == n_alloc, ( for node in topo_grad]) == n_alloc, (
alloc_, subtensor, n_alloc, topo_grad) alloc_, subtensor, n_alloc, topo_grad)
......
...@@ -1386,6 +1386,44 @@ class TestAdvancedSubtensor(unittest.TestCase): ...@@ -1386,6 +1386,44 @@ class TestAdvancedSubtensor(unittest.TestCase):
self.ix2 = lmatrix() self.ix2 = lmatrix()
self.ixr = lrow() self.ixr = lrow()
def test_advinc_subtensor(self):
x_shp = (20, 15, 10, 5)
def check(idx, y_val, x_val, true):
x = self.shared(x_val, name='x')
y = tensor.tensor(dtype='float32',
broadcastable=(False,) * len(y_val.shape),
name='y')
sym_idx = [tensor.as_tensor_variable(ix) for ix in idx]
expr = tensor.advanced_inc_subtensor(x, y, *sym_idx)
f = theano.function([y], expr, mode=self.mode)
rval = f(y_val)
assert np.allclose(rval, true)
idxs_y_shp_pairs = [
((0, [1, 3, 5], 1), (3, 5)),
(([1, 2, 4, 8],), (4, 15, 10, 5)),
(([0, 1, 2], 0, [0, 1, 2]), (3, 3, 5)),
(([[0, 1], [2, 3]], [[0, 1], [2, 3]]), (2, 2, 10, 5)),
]
for idx, y_shps in idxs_y_shp_pairs:
for i in range(len(y_shps) - 1):
y_shp = y_shps[i:]
x_val = np.arange(np.prod(x_shp), dtype='float32').reshape(x_shp) + 1
y_val = np.arange(np.prod(y_shp), dtype='float32').reshape(y_shp) + 1
rep = x_val.copy()
try:
rep[idx] += y_val
except ValueError:
continue
check(idx, y_val, x_val, rep)
x_val = np.arange(np.prod(x_shp), dtype='float32').reshape(x_shp) + 1
y_val = np.array(1).astype(np.float32)
rep = x_val.copy()
rep[idx] += y_val
check(idx, y_val, x_val, rep)
def eval_output_and_check(self, t): def eval_output_and_check(self, t):
f = inplace_func([], t, mode=self.mode) f = inplace_func([], t, mode=self.mode)
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论