Commit a240803c authored by Frederic

[BUG] fix 2 bugs in the new faster GpuAdvancedIncSubtensor1

It was always running in place, and it was always considering the input as 2d.
Parent 32e5fa85
...@@ -2451,13 +2451,14 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp): ...@@ -2451,13 +2451,14 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
out[0] = x out[0] = x
def c_code_cache_version(self): def c_code_cache_version(self):
return (1,) return (2,)
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
active_device_no = theano.sandbox.cuda.active_device_number() active_device_no = theano.sandbox.cuda.active_device_number()
compute_capability = theano.sandbox.cuda.device_properties(active_device_no)['major'] compute_capability = device_properties(active_device_no)['major']
if (self.set_instead_of_inc) or \ if (self.set_instead_of_inc) or \
(node.inputs[0].ndim != node.inputs[1].ndim) or \ (node.inputs[0].ndim != node.inputs[1].ndim) or \
(node.inputs[0].ndim != 2) or \
(compute_capability < 2): (compute_capability < 2):
raise NotImplementedError("This case does not have C code yet.") raise NotImplementedError("This case does not have C code yet.")
...@@ -2477,12 +2478,12 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp): ...@@ -2477,12 +2478,12 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
Py_XINCREF(%(out)s); Py_XINCREF(%(out)s);
} }
CudaNdarray_vector_add_fast(%(x)s, %(y)s, %(ind)s); CudaNdarray_vector_add_fast(%(out)s, %(y)s, %(ind)s);
if (!%(out)s) { if (!%(out)s) {
%(fail)s %(fail)s
} }
""" %locals() """ % locals()
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
return """ return """
......
...@@ -999,20 +999,23 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor): ...@@ -999,20 +999,23 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor):
def test_advinc_subtensor1(): def test_advinc_subtensor1():
""" Test the second case in the opt local_gpu_advanced_incsubtensor1 """ """ Test the second case in the opt local_gpu_advanced_incsubtensor1 """
shared = cuda.shared_constructor for shp in [(3, 3), (3, 3, 3)]:
#shared = tensor.shared shared = cuda.shared_constructor
xval = numpy.asarray([[1, 2, 3], [4, 5, 6], [7, 8, 9]], xval = numpy.arange(numpy.prod(shp), dtype='float32').reshape(shp) + 1
dtype='float32') yval = numpy.empty((2,) + shp[1:], dtype='float32')
yval = numpy.asarray([[10, 10, 10], [10, 10, 10]], yval[:] = 10
dtype='float32') x = shared(xval, name='x')
x = shared(xval, name='x') y = T.tensor(dtype='float32',
y = T.fmatrices('y') broadcastable=(False,) * len(shp),
expr = T.advanced_inc_subtensor1(x, y, [0, 2]) name='y')
f = theano.function([y], expr, mode=mode_with_gpu) expr = T.advanced_inc_subtensor1(x, y, [0, 2])
assert sum([isinstance(node.op, cuda.GpuAdvancedIncSubtensor1) f = theano.function([y], expr, mode=mode_with_gpu)
for node in f.maker.fgraph.toposort()]) == 1 assert sum([isinstance(node.op, cuda.GpuAdvancedIncSubtensor1)
assert numpy.allclose(f(yval), [[11., 12., 13.], [4., 5., 6.], for node in f.maker.fgraph.toposort()]) == 1
[17., 18., 19.]]) rval = f(yval)
rep = xval.copy()
rep[[0, 2]] += yval
assert numpy.allclose(rval, rep)
def test_inc_subtensor(): def test_inc_subtensor():
......
Markdown is supported
0%
You are about to add 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to comment