提交 2ccb5948 authored 作者: Frederic's avatar Frederic

Partial enable of GPU max/min for 010 pattern, (and so 1000, 0100, 0010 and 0001) by reshaping.

上级 0bb60e21
...@@ -1026,8 +1026,8 @@ class GpuCAReduce(GpuOp): ...@@ -1026,8 +1026,8 @@ class GpuCAReduce(GpuOp):
#Threads must be organized as: threadNum%nb_reduce correspond to the same sum #Threads must be organized as: threadNum%nb_reduce correspond to the same sum
#nb_reduce<=warpSize #nb_reduce<=warpSize
def _k_reduce_buf_multiple(self, z_pos, nb_reduce): def _k_reduce_buf_multiple(self, z_pos, node, name, nb_reduce):
self._op_guard() reduce_fct = self._assign_reduce(node, name, 'myresult', 'buf[i]', {})
return """ return """
__syncthreads(); // some kernel do multiple reduction. __syncthreads(); // some kernel do multiple reduction.
buf[threadNum] = myresult; buf[threadNum] = myresult;
...@@ -1039,7 +1039,7 @@ class GpuCAReduce(GpuOp): ...@@ -1039,7 +1039,7 @@ class GpuCAReduce(GpuOp):
//round up all the partial sums into the first `nb_reduce` elements //round up all the partial sums into the first `nb_reduce` elements
for (int i = threadNum + %(nb_reduce)s; i < threadCount; i += %(nb_reduce)s) for (int i = threadNum + %(nb_reduce)s; i < threadCount; i += %(nb_reduce)s)
{ {
myresult += buf[i]; %(reduce_fct)s;
} }
%(z_pos)s = myresult; %(z_pos)s = myresult;
} }
...@@ -1242,7 +1242,6 @@ class GpuCAReduce(GpuOp): ...@@ -1242,7 +1242,6 @@ class GpuCAReduce(GpuOp):
""" % locals() """ % locals()
def c_code_reduce_010(self, sio, node, name, x, z, fail): def c_code_reduce_010(self, sio, node, name, x, z, fail):
self._op_guard()
makecall = self._makecall(node, name, x, z, fail) makecall = self._makecall(node, name, x, z, fail)
makecall_inner = self._makecall(node, name, x, z, fail, makecall_inner = self._makecall(node, name, x, z, fail,
pattern="010_inner") pattern="010_inner")
...@@ -1835,7 +1834,17 @@ class GpuCAReduce(GpuOp): ...@@ -1835,7 +1834,17 @@ class GpuCAReduce(GpuOp):
} }
""" % locals() """ % locals()
if self.reduce_mask == (0, 1, 0): if self.reduce_mask == (0, 1, 0):
self._op_guard() if not isinstance(self.scalar_op, (scal.Add,
scal.Maximum,
scal.Minimum)):
raise NotImplementedError()
reduce_fct = self._assign_reduce(node, nodename, "myresult",
"X[a * sX0 + b * sX1 + c * sX2]",
{})
if isinstance(self.scalar_op, scal.Add):
reduce_init = "0.f;"
else:
reduce_init = "X[a * sX0 + 0 * sX1 + c * sX2];"
print >> sio, """ print >> sio, """
static __global__ void kernel_reduce_010_AD_%(nodename)s( static __global__ void kernel_reduce_010_AD_%(nodename)s(
const int A, const int A,
...@@ -1863,10 +1872,10 @@ class GpuCAReduce(GpuOp): ...@@ -1863,10 +1872,10 @@ class GpuCAReduce(GpuOp):
int c = i2_D * 32 + threadIdx.x; int c = i2_D * 32 + threadIdx.x;
if (c < C) if (c < C)
{ {
myresult = 0; myresult = %(reduce_init)s;
for (int b = 0; b < B; ++b) for (int b = 0; b < B; ++b)
{ {
myresult += X[a * sX0 + b * sX1 + c * sX2]; %(reduce_fct)s;
} }
Z[a * sZ0 + c * sZ1] = myresult; Z[a * sZ0 + c * sZ1] = myresult;
} }
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论