提交 1780d27f authored 作者: Frederic's avatar Frederic

Finish implementing GPU 010 min/max.

This probably fixes a bug in the not-enabled implementation: 010_inner
上级 2ccb5948
...@@ -1885,7 +1885,10 @@ class GpuCAReduce(GpuOp): ...@@ -1885,7 +1885,10 @@ class GpuCAReduce(GpuOp):
} }
""" % locals() """ % locals()
if self.reduce_mask == (0, 1, 0): if self.reduce_mask == (0, 1, 0):
self._op_guard() if not isinstance(self.scalar_op, (scal.Add,
scal.Maximum,
scal.Minimum)):
raise NotImplementedError()
# #
# This kernel is optimized when the inner most dimensions # This kernel is optimized when the inner most dimensions
# have the smallest stride. # have the smallest stride.
...@@ -1900,9 +1903,15 @@ class GpuCAReduce(GpuOp): ...@@ -1900,9 +1903,15 @@ class GpuCAReduce(GpuOp):
init = self._k_init(node, nodename) init = self._k_init(node, nodename)
decl = self._k_decl(node, nodename, pattern="010_inner") decl = self._k_decl(node, nodename, pattern="010_inner")
reducebuf = self._k_reduce_buf_multiple('Z[i0 * sZ0 + i2*sZ1]', reducebuf = self._k_reduce_buf_multiple('Z[i0 * sZ0 + i2*sZ1]',
node, nodename,
'blockDim.x') 'blockDim.x')
reducebuf = self._k_reduce_buf_multiple('Z[i0 * sZ0 + i2*sZ1]', reduce_fct = self._assign_reduce(node, nodename, "myresult",
'blockDim.x') "A[i0 * sA0 + i1 * sA1 + i2 * sA2]",
{})
if isinstance(self.scalar_op, scal.Add):
reduce_init = "0.f;"
else:
reduce_init = "A[i0 * sA0 + 0 * sA1 + i2 * sA2];"
print >> sio, """ print >> sio, """
%(decl)s %(decl)s
{ {
...@@ -1917,9 +1926,10 @@ class GpuCAReduce(GpuOp): ...@@ -1917,9 +1926,10 @@ class GpuCAReduce(GpuOp):
{ {
for (int i2 = blockIdx.y*blockDim.x+threadIdx.x; i2 < d2; i2 += gridDim.y*blockDim.x) for (int i2 = blockIdx.y*blockDim.x+threadIdx.x; i2 < d2; i2 += gridDim.y*blockDim.x)
{ {
myresult = %(reduce_init)s;
for (int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y) for (int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y)
{ {
myresult += A[i0 * sA0 + i1 * sA1 + i2 * sA2]; %(reduce_fct)s;
} }
%(reducebuf)s %(reducebuf)s
} }
......
...@@ -127,7 +127,9 @@ def test_careduce(): ...@@ -127,7 +127,9 @@ def test_careduce():
#GpuCAReduce{maximum/minimum} support only those patterns #GpuCAReduce{maximum/minimum} support only those patterns
if scalar_op in [theano.scalar.maximum, if scalar_op in [theano.scalar.maximum,
theano.scalar.minimum] and pat not in [ theano.scalar.minimum] and pat not in [
(0, 1), (0, 1, 1), (0, 1, 1), (1, 0)]: (0, 1), (0, 1, 1), (0, 1, 1), (1, 0),
(1, 0, 0, 0), (0, 1, 0, 0),
(0, 0, 1, 0), (0, 0, 0, 1)]:
continue continue
a = tensor.TensorType('float32', (False,) * len(shape))() a = tensor.TensorType('float32', (False,) * len(shape))()
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论