提交 ab0d35c6 作者: Frederic

code clean up.

上级 d56c3079
...@@ -1766,23 +1766,12 @@ class GpuCAReduce(GpuOp): ...@@ -1766,23 +1766,12 @@ class GpuCAReduce(GpuOp):
for i in xrange(nd_in)]) for i in xrange(nd_in)])
decl = self._k_decl(node, nodename) decl = self._k_decl(node, nodename)
init = self._k_init(node, nodename) init = self._k_init(node, nodename)
# TODO: ideally this would all be some clean function of scalar_op, reduce_init = self._assign_init("A[%(first_i3)s * %(sA3)s + %(first_i2)s * %(sA2)s + %(first_i1)s * %(sA1)s + i0 * sA0]" % locals())
# but since sum is a special case where it's OK to reduce with an reduce_fct = self._assign_reduce(
# extra 0, I would need to change the behavior of the sum reduction node, nodename, "myresult",
# code to do that. I don't want to benchmark and test changes to the "A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0]",
# sum code so I will leave that for later. {})
# max/min reduction is also a special case that is simple to implement. print >> sio, """
# this is the special case where reduction is idempotent so it doesn't
# matter if we reduce with the first element multiple times.
if True:
# special cased max/min code (special case because visits first
# member of each row twice)
reduce_init = self._assign_init("A[%(first_i3)s * %(sA3)s + %(first_i2)s * %(sA2)s + %(first_i1)s * %(sA1)s + i0 * sA0]" % locals())
reduce_fct = self._assign_reduce(
node, nodename, "myresult",
"A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0]",
{})
print >> sio, """
%(decl)s{ %(decl)s{
%(init)s %(init)s
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){ for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
...@@ -1798,17 +1787,6 @@ class GpuCAReduce(GpuOp): ...@@ -1798,17 +1787,6 @@ class GpuCAReduce(GpuOp):
} }
} }
""" % locals() """ % locals()
else:
# TODO: implement general case and get rid of the two special
# cases above
# it should initialize myresult to element 0,
# and the for loop should begin traversing from element 1
# raise an error if asked to reduce an empty dimension
# (maybe special-case sum to return 0 instead of returning an
# error)
# in both cases, benchmark the general case against the existing
# code to make sure it does not cause a slowdown
raise NotImplementedError()
if self.reduce_mask == (0, 1, 0) or self.reduce_mask == (1, 0): if self.reduce_mask == (0, 1, 0) or self.reduce_mask == (1, 0):
# this kernel uses one block for each column, # this kernel uses one block for each column,
# threads per block for each element per column. # threads per block for each element per column.
......
Markdown 格式
0%
您将添加 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论