提交 bc6dfee6 authored 作者: Frederic's avatar Frederic

More verbose version of the code.

上级 fc66c3fd
......@@ -767,6 +767,10 @@ class GpuCAReduce(GpuOp):
pattern = ''.join(str(c) for c in self.reduce_mask)
ndim = len(self.reduce_mask)
nd_out = ndim - sum(self.reduce_mask)
shapes_format = "shape=(%s)" % ",".join(["%d"] * node.inputs[0].ndim)
shapes_data = ",".join(["CudaNdarray_HOST_DIMS(%s)[%d]" % (x, i)
for i in range(node.inputs[0].ndim)])
print >> sio, """
if (verbose)
printf("running kernel_reduce_%(pattern)s_%(name)s\\n");
......@@ -774,11 +778,11 @@ class GpuCAReduce(GpuOp):
if (verbose>1)
printf("n_threads.x=%%d, n_threads.y=%%d, n_threads.z=%%d,"
" nb_threads=%%d, n_blocks.x=%%d, n_blocks.y=%%d,"
" nb_block=%%d, n_shared=%%d\\n",
" nb_block=%%d, n_shared=%%d, %(shapes_format)s\\n",
n_threads.x,n_threads.y,n_threads.z,
n_threads.x*n_threads.y*n_threads.z,
n_blocks.x,n_blocks.y,
n_blocks.x*n_blocks.y, n_shared);
n_blocks.x*n_blocks.y, n_shared, %(shapes_data)s);
kernel_reduce_%(pattern)s_%(name)s<<<n_blocks, n_threads, n_shared>>>(
""" % locals()
for i in xrange(ndim):
......@@ -800,9 +804,6 @@ class GpuCAReduce(GpuOp):
,CudaNdarray_HOST_STRIDES(%(z)s)[%(i)s]
""" % locals()
shapes_format = "shape=(%s)" % ",".join(["%d"] * node.inputs[0].ndim)
shapes_data = ",".join(["CudaNdarray_HOST_DIMS(%s)[%d]" % (x, i)
for i in range(node.inputs[0].ndim)])
print >> sio, """
);
CNDA_THREAD_SYNC;
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论