Commit b7b5dd39, authored by Frédéric Bastien, committed via GitHub

Merge pull request #6220 from lamblin/fix_gpuadvidx

Updates in GPU indexing
@@ -1106,9 +1106,11 @@ def local_gpua_advanced_incsubtensor1(op, context_name, inputs, outputs):
                                          set_instead_of_inc=set_instead_of_inc)
 
 
-@register_opt('fast_compile')
-@op_lifter([tensor.AdvancedIncSubtensor])
-@register_opt2([tensor.AdvancedIncSubtensor], 'fast_compile')
+# Do not register this optimization for now, as it slows down the
+# execution by a lot in important cases.
+# @register_opt('fast_compile')
+# @op_lifter([tensor.AdvancedIncSubtensor])
+# @register_opt2([tensor.AdvancedIncSubtensor], 'fast_compile')
 def local_gpua_advanced_incsubtensor(op, context_name, inputs, outputs):
     if not op.set_instead_of_inc:
         return GpuAdvancedIncSubtensor()
...
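This first hunk only comments out the registration, so the graph optimizer no longer lifts AdvancedIncSubtensor to the GPU automatically under fast_compile. To check which op a compiled graph actually ends up with, the usual Theano idiom is to toposort the function's graph; a minimal sketch (the variable names are illustrative):

import theano
import theano.tensor as tensor

x = tensor.matrix('x')
y = tensor.vector('y')
# Repeated indices require the general AdvancedIncSubtensor op,
# the one whose GPU lifting this commit disables.
z = tensor.inc_subtensor(x[[0, 0], [1, 1]], y)

f = theano.function([x, y], z)
# List the ops in the optimized graph: with the lifter unregistered,
# AdvancedIncSubtensor should appear instead of GpuAdvancedIncSubtensor.
print([type(node.op).__name__ for node in f.maker.fgraph.toposort()])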
@@ -621,6 +621,10 @@ class GpuAdvancedIncSubtensor(HideC, tensor.AdvancedIncSubtensor):
         y = inp[1]
         idx = inp[2:]
         x = x.copy()
+        # Get a handle to the GpuElemwise object that will be called.
+        # It is not necessary to have the right number of dimensions,
+        # so we just pass symbolic x and y.
+        iadd = get_iadd(node.inputs[0], node.inputs[1])
 
         # convert all indices to np.array
         for i in range(len(idx)):
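For context, get_iadd builds a pygpu GpuElemwise kernel computing a = a + b, and it is now fetched once before the update loops instead of per element. A rough sketch of such a helper is below; the pygpu argument-construction details (arg, convert_f16) are written from memory and should be treated as assumptions, not as the exact Theano source:

from pygpu.elemwise import GpuElemwise, arg


def get_iadd(a, b):
    # Compile an in-place "a += b" kernel once: `a` is read and written,
    # `b` is only read. convert_f16 allows float16 data to be accumulated
    # in float32. (Sketch; exact signatures assumed.)
    return GpuElemwise(a.type.context, "a = a + b",
                       [arg('a', a.type.dtype, read=True, write=True),
                        arg('b', b.type.dtype, read=True)],
                       convert_f16=True)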
@@ -699,15 +703,10 @@ class GpuAdvancedIncSubtensor(HideC, tensor.AdvancedIncSubtensor):
                 else:
                     val = y_flat[j]
-                tmp = pygpu.elemwise.elemwise2(
-                    x_flat[i], '+', val, x_flat[i],
-                    broadcast=True,
-                    convert_f16=True
-                )
-                x_flat.__setitem__(i, tmp)
+                iadd(x_flat[i], val, broadcast=True)
         else:
-            k = get_iadd(node.inputs[0], node.inputs[1])
-            if x_flat.shape[-len(y_flat.shape):] == y_flat.shape or y_flat.shape == ():
+            if (x_flat.shape[-len(y_flat.shape):] == y_flat.shape or
+                    y_flat.shape == ()):
                 # y_flat has to be broadcast over axes of x_flat[i]
                 for i in take_idx.flatten():
@@ -715,13 +714,7 @@ class GpuAdvancedIncSubtensor(HideC, tensor.AdvancedIncSubtensor):
                         x_flat_sub = x_flat[i].__getitem__(index)
                     else:
                         x_flat_sub = x_flat[i]
-                    tmp = pygpu.elemwise.elemwise2(
-                        x_flat_sub, '+', y_flat, x_flat_sub,
-                        broadcast=True,
-                        convert_f16=True
-                    )
-                    x_flat[i].__setitem__(index, tmp)
+                    iadd(x_flat_sub, y_flat, broadcast=True)
             else:
                 # y_flat's first axis corresponds to the first axis of x_flat
                 for j, i in enumerate(take_idx.flatten()):
@@ -729,7 +722,7 @@ class GpuAdvancedIncSubtensor(HideC, tensor.AdvancedIncSubtensor):
                         x_flat_sub = x_flat[i].__getitem__(index)
                     else:
                         x_flat_sub = x_flat[i]
-                    k(x_flat_sub, y_flat[j % y_flat.shape[0]], broadcast=True)
+                    iadd(x_flat_sub, y_flat[j % y_flat.shape[0]], broadcast=True)
         x_ = x_flat.reshape(x_.shape).transpose(*rtransp)
         out[0] = x_
...
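The loops above implement the unbuffered, accumulating semantics of inc_subtensor under advanced indexing, which is exactly where repeated indices matter. A CPU reference in plain NumPy shows the behaviour the GPU op has to match:

import numpy as np

idx = np.array([0, 1, 1, 3])
y = np.array([1., 2., 3., 4.])

# Buffered update: the write to the repeated index 1 happens once,
# so only the last value survives.
x1 = np.zeros(5)
x1[idx] += y           # x1 == [1., 3., 0., 4., 0.]

# Unbuffered update, the semantics of (Gpu)AdvancedIncSubtensor:
# both contributions to index 1 accumulate.
x2 = np.zeros(5)
np.add.at(x2, idx, y)  # x2 == [1., 5., 0., 4., 0.]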
@@ -340,8 +340,8 @@ class test_gpuallocdiag(unittest.TestCase):
         grad_x = tensor.grad(sum_mtx_x, x)
         grad_mtx_x = tensor.grad(sum_mtx_x, mtx_x)
 
-        fn_grad_x = theano.function([x], grad_x)
-        fn_grad_mtx_x = theano.function([x], grad_mtx_x)
+        fn_grad_x = theano.function([x], grad_x, mode=mode_with_gpu)
+        fn_grad_mtx_x = theano.function([x], grad_mtx_x, mode=mode_with_gpu)
 
         computed_grad_x = fn_grad_x(np_x)
         computed_grad_mtx_x = fn_grad_mtx_x(np_x)
@@ -354,8 +354,8 @@ class test_gpuallocdiag(unittest.TestCase):
         grad_x = tensor.grad(sum_mtx_x, x)
         grad_mtx_x = tensor.grad(sum_mtx_x, mtx_x)
 
-        fn_grad_x = theano.function([x], grad_x)
-        fn_grad_mtx_x = theano.function([x], grad_mtx_x)
+        fn_grad_x = theano.function([x], grad_x, mode=mode_with_gpu)
+        fn_grad_mtx_x = theano.function([x], grad_mtx_x, mode=mode_with_gpu)
 
         computed_grad_x = fn_grad_x(np_x)
         computed_grad_mtx_x = fn_grad_mtx_x(np_x)
@@ -368,8 +368,8 @@ class test_gpuallocdiag(unittest.TestCase):
         grad_x = tensor.grad(sum_mtx_x, x)
         grad_mtx_x = tensor.grad(sum_mtx_x, mtx_x)
 
-        fn_grad_x = theano.function([x], grad_x)
-        fn_grad_mtx_x = theano.function([x], grad_mtx_x)
+        fn_grad_x = theano.function([x], grad_x, mode=mode_with_gpu)
+        fn_grad_mtx_x = theano.function([x], grad_mtx_x, mode=mode_with_gpu)
 
         computed_grad_x = fn_grad_x(np_x)
         computed_grad_mtx_x = fn_grad_mtx_x(np_x)
...
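These tests previously compiled with the default mode, so on a CPU-default configuration they never exercised the GPU path they were meant to cover; passing mode=mode_with_gpu makes the intent explicit. In Theano's gpuarray test suite that mode is constructed roughly as follows (the exact optimizer tags are an assumption, not a quote of the test config):

import theano

# Sketch: enable the gpuarray optimizations on top of the default mode.
# The precise including/excluding tags used by the gpuarray test config
# are assumed here.
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpuarray')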
@@ -396,7 +396,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
         s1 = s[newaxis]
         assert s1.broadcastable == (True,), s1
 
-        vs1, vn3, vn4 = theano.function([s], [s1, n3, n4])(-2.0)
+        vs1, vn3, vn4 = theano.function([s], [s1, n3, n4], mode=self.mode)(-2.0)
         assert np.all(vs1 == [-2.0])
         assert np.all(vn3 ==
@@ -962,12 +962,14 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
         utt.verify_grad(
             inc_slice(slice(1, 2, None), slice(None, None, None)),
             (np.asarray([[0, 1], [2, 3], [4, 5.]]),
-             np.asarray([[9, 9.]]),))
+             np.asarray([[9, 9.]]),),
+            mode=self.mode)
 
         # single element
         utt.verify_grad(
             inc_slice(2, 1),
-            (np.asarray([[0, 1], [2, 3], [4, 5.]]), np.asarray(9.),))
+            (np.asarray([[0, 1], [2, 3], [4, 5.]]), np.asarray(9.),),
+            mode=self.mode)
 
     def test_inc_and_set_subtensor(self):
         """
@@ -1142,7 +1144,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
         m1 = set_subtensor(m[:, i], 0)
         m2 = inc_subtensor(m[:, i], 1)
 
-        f = theano.function([m, i], [m1, m2])
+        f = theano.function([m, i], [m1, m2], mode=self.mode)
         m_val = rand(3, 5)
         i_val = randint_ranged(min=0, max=4, shape=(4,))
@@ -1167,7 +1169,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
         m1 = set_subtensor(m[:, i], 0)
         m2 = inc_subtensor(m[:, i], 1)
 
-        f = theano.function([m, i], [m1, m2])
+        f = theano.function([m, i], [m1, m2], mode=self.mode)
         m_val = rand(5, 7)
         i_val = randint_ranged(min=0, max=6, shape=(4, 2))
@@ -1202,7 +1204,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
         sub_m = m[:, i]
         m1 = set_subtensor(sub_m, np.zeros(shp_v))
         m2 = inc_subtensor(sub_m, np.ones(shp_v))
-        f = theano.function([m, i], [m1, m2])
+        f = theano.function([m, i], [m1, m2], mode=self.mode)
         m_val = rand(3, 5)
         i_val = randint_ranged(min=0, max=4, shape=shp_i)
@@ -1239,7 +1241,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
         sub_m = m[:, i]
         m1 = set_subtensor(sub_m, np.zeros(shp_v))
         m2 = inc_subtensor(sub_m, np.ones(shp_v))
-        f = theano.function([m, i], [m1, m2])
+        f = theano.function([m, i], [m1, m2], mode=self.mode)
         m_val = rand(3, 5)
         i_val = randint_ranged(min=0, max=4, shape=shp_i)
@@ -1261,7 +1263,9 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
     def test_take(self):
         a = tensor.matrix()
-        f = theano.function([a], a.take(0, axis=-1), allow_input_downcast=True)
+        f = theano.function(
+            [a], a.take(0, axis=-1),
+            allow_input_downcast=True, mode=self.mode)
         f(np.random.normal(0, 1, (30, 4)))
...
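Threading mode=self.mode through every theano.function call serves the same purpose at the class level: the CPU tests define a default mode, and a GPU suite can reuse the entire test class by overriding just that attribute. A hypothetical sketch of the pattern (the subclass name and attribute layout are illustrative, not the actual Theano test code):

# Reuse every test in T_subtensor, but compile on the GPU: each
# theano.function(..., mode=self.mode) call now picks up mode_with_gpu.
class T_subtensor_gpu(T_subtensor):
    mode = mode_with_gpu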