提交 4ed822e5 authored 作者: Frederic's avatar Frederic

Implement 101 reduce in the new back-end by reusing 1011 kernelinside the op

上级 c62451be
...@@ -833,7 +833,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -833,7 +833,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
return sio.getvalue() return sio.getvalue()
def _makecall(self, node, name, x, z, fail, pattern=None): def _makecall(self, node, name, x, z, fail, pattern=None, extra_dims=(), extra_strides=()):
""" """
Return a string for making a kernel call. Return a string for making a kernel call.
...@@ -876,6 +876,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -876,6 +876,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
for i in xrange(ndim): for i in xrange(ndim):
params.append("(void *)&PyGpuArray_DIMS(%(x)s)[%(i)s]" % locals()) params.append("(void *)&PyGpuArray_DIMS(%(x)s)[%(i)s]" % locals())
for declaration, value in extra_dims:
print(declaration % locals(), file=sio)
params.append(value)
params.append("(void *)%(x)s->ga.data" % locals()) params.append("(void *)%(x)s->ga.data" % locals())
params.append("(void *)&%(x)s->ga.offset" % locals()) params.append("(void *)&%(x)s->ga.offset" % locals())
for i in xrange(ndim): for i in xrange(ndim):
...@@ -883,6 +886,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -883,6 +886,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
ssize_t stride_A%(i)d = PyGpuArray_STRIDES(%(x)s)[%(i)s]/sizeof(%(in_dtype)s); ssize_t stride_A%(i)d = PyGpuArray_STRIDES(%(x)s)[%(i)s]/sizeof(%(in_dtype)s);
""" % locals(), file=sio) """ % locals(), file=sio)
params.append("(void *)&stride_A%(i)d" % locals()) params.append("(void *)&stride_A%(i)d" % locals())
for declaration, value in extra_strides:
print(declaration % locals(), file=sio)
params.append(value)
params.append("(void *)%(z)s->ga.data" % locals()) params.append("(void *)%(z)s->ga.data" % locals())
params.append("(void *)&%(z)s->ga.offset" % locals()) params.append("(void *)&%(z)s->ga.offset" % locals())
...@@ -1779,6 +1785,34 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1779,6 +1785,34 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
} }
""" % locals(), file=sio) """ % locals(), file=sio)
def c_code_reduce_101(self, sio, node, name, x, z, fail):
makecall = self._makecall(node, name, x, z, fail,
extra_dims=[("size_t one = 1;", "(void *) &one")],
extra_strides=[("ssize_t sone = 1;", "(void *) &sone")],
pattern="1011")
print("""
{
int verbose = 0;
// size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[3],
// (size_t) 256), 1, 1};
size_t n_threads[3] = {1, 1, 1};
while (n_threads[0] * (n_threads[1]+1) <= 256) ++n_threads[1];
if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[2])
n_threads[1] = PyGpuArray_DIMS(%(x)s)[2];
while (n_threads[0] * n_threads[1] * (n_threads[2]+1) <= 256)
++n_threads[2];
if (n_threads[2] > 64)
n_threads[2] = 64;
if (n_threads[2] > PyGpuArray_DIMS(%(x)s)[0])
n_threads[2] = PyGpuArray_DIMS(%(x)s)[0];
size_t n_blocks[3] = {PyGpuArray_DIMS(%(x)s)[1], 1, 1};
%(makecall)s
}
""" % locals(), file=sio)
def c_code_reduce_111(self, sio, node, name, x, z, fail): def c_code_reduce_111(self, sio, node, name, x, z, fail):
makecall = self._makecall(node, name, x, z, fail) makecall = self._makecall(node, name, x, z, fail)
print(""" print("""
...@@ -2572,7 +2606,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2572,7 +2606,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
""" % locals(), file=sio) """ % locals(), file=sio)
kernels.append(Kernel(code=sio.getvalue(), name=kname, kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var)) params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (1, 0, 1, 1): if self.reduce_mask == (1, 0, 1, 1) or self.reduce_mask == (1, 0, 1):
reducebuf = self._k_reduce_buf('Z[blockIdx.x*sZ0]', reducebuf = self._k_reduce_buf('Z[blockIdx.x*sZ0]',
node, nodename, sub={}) node, nodename, sub={})
reduce_fct = self._assign_reduce(node, nodename, "myresult", reduce_fct = self._assign_reduce(node, nodename, "myresult",
......
...@@ -172,16 +172,28 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY): ...@@ -172,16 +172,28 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
((4100, 4, 3), [2]), ((5, 4100, 3), [2]), ((5, 4, 4100), [2]), # 001 ((4100, 4, 3), [2]), ((5, 4100, 3), [2]), ((5, 4, 4100), [2]), # 001
((4100, 4, 3), [0, 1]), ((5, 4100, 3), [0, 1]), ((5, 4, 4100), [0, 1]), # 110 ((4100, 4, 3), [0, 1]), ((5, 4100, 3), [0, 1]), ((5, 4, 4100), [0, 1]), # 110
((4100, 4, 3), [1, 2]), ((5, 4100, 3), [1, 2]), ((5, 4, 4100), [1, 2]), # 011 ((4100, 4, 3), [1, 2]), ((5, 4100, 3), [1, 2]), ((5, 4, 4100), [1, 2]), # 011
# ((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented ((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]), #101
((4100, 4, 3), [0, 1, 2]), ((5, 4100, 3), [0, 1, 2]), ((5, 4, 4100), [0, 1, 2]), # 111 ((4100, 4, 3), [0, 1, 2]), ((5, 4100, 3), [0, 1, 2]), ((5, 4, 4100), [0, 1, 2]), # 111
((65, 4, 3), [0, 1, 2]), ((5, 65, 3), [0, 1, 2]), ((5, 4, 65), [0, 1, 2]), # 111 ((65, 4, 3), [0, 1, 2]), ((5, 65, 3), [0, 1, 2]), ((5, 4, 65), [0, 1, 2]), # 111
# reduce over 2d
((4100, 4, 3, 2), [2, 3]), ((4, 4100, 3, 2), [2, 3]), ((4, 3, 4100, 2), [2, 3]), ((4, 3, 2, 4100), [2, 3]), # 0011 ((4100, 4, 3, 2), [2, 3]), ((4, 4100, 3, 2), [2, 3]), ((4, 3, 4100, 2), [2, 3]), ((4, 3, 2, 4100), [2, 3]), # 0011
((4100, 4, 3, 2), [1, 3]), ((4, 4100, 3, 2), [1, 3]), ((4, 3, 4100, 2), [1, 3]), ((4, 3, 2, 4100), [1, 3]), # 0101 ((4100, 4, 3, 2), [1, 3]), ((4, 4100, 3, 2), [1, 3]), ((4, 3, 4100, 2), [1, 3]), ((4, 3, 2, 4100), [1, 3]), # 0101
#((4100, 4, 3, 2), [1, 2]), ((4, 4100, 3, 2), [1, 2]), ((4, 3, 4100, 2), [1, 2]), ((4, 3, 2, 4100), [1, 2]), # 0110 by reshape
#((4100,4,3,2),[0,3]),((4,4100,3,2),[0,3]),((4,3,4100,2),[0,3]),((4,3,2,4100),[0,3]),#1001 by reshape
#((4100,4,3,2),[0,2]),((4,4100,3,2),[0,2]),((4,3,4100,2),[0,2]),((4,3,2,4100),[0,2]),#1010 not implemented
#((4100, 4, 3, 2), [0, 1]), ((4, 4100, 3, 2), [0, 1]), ((4, 3, 4100, 2), [0, 1]), ((4, 3, 2, 4100), [0, 1]), # 1100 by reshape
# reduce over 3d
# 3d not tested: 1101, 1110, 1111
# ((4100,4,3,2),[0,1,3]),((4,4100,3,2),[0,1,3]),((4,3,4100,2),[0,1,3]),((4,3,2,4100),[0,1,3]),#1101 by reshape
# ((4100, 4, 3, 2), [0, 1, 2]), ((4, 4100, 3, 2), [0, 1, 2]), ((4, 3, 4100, 2), [0, 1, 2]), ((4, 3, 2, 4100), [0, 1, 2]), # 1110 by reshape
((4100, 4, 3, 2), [0, 2, 3]), ((4, 4100, 3, 2), [0, 2, 3]), ((4, 3, 4100, 2), [0, 2, 3]), # ((4,3,2,4100),[0,2,3]),#1011 ((4100, 4, 3, 2), [0, 2, 3]), ((4, 4100, 3, 2), [0, 2, 3]), ((4, 3, 4100, 2), [0, 2, 3]), # ((4,3,2,4100),[0,2,3]),#1011
((4100, 4, 3, 2), [1, 2, 3]), ((4, 4100, 3, 2), [1, 2, 3]), ((4, 3, 4100, 2), [1, 2, 3]), ((4, 3, 2, 4100), [1, 2, 3]), # 0111 ((4100, 4, 3, 2), [1, 2, 3]), ((4, 4100, 3, 2), [1, 2, 3]), ((4, 3, 4100, 2), [1, 2, 3]), ((4, 3, 2, 4100), [1, 2, 3]), # 0111
((65, 4, 3, 2), [1, 2, 3]), ((4, 65, 3, 2), [1, 2, 3]), ((4, 3, 65, 2), [1, 2, 3]), ((4, 3, 2, 65), [1, 2, 3]), # 0111 ((65, 4, 3, 2), [1, 2, 3]), ((4, 65, 3, 2), [1, 2, 3]), ((4, 3, 65, 2), [1, 2, 3]), ((4, 3, 2, 65), [1, 2, 3]), # 0111
((4100, 2, 3, 4), [0, 1, 2, 3]), ((2, 4100, 3, 4), [0, 1, 2, 3]), ((2, 3, 4100, 4), [0, 1, 2, 3]), ((2, 3, 4, 4100), [0, 1, 2, 3]), ((128, 1, 2, 3), [0, 1, 2, 3]), # 1111
# reduce over 4d
((4100, 2, 3, 4), [0, 1, 2, 3]), ((2, 4100, 3, 4), [0, 1, 2, 3]), ((2, 3, 4100, 4), [0, 1, 2, 3]), ((2, 3, 4, 4100), [0, 1, 2, 3]), ((128, 1, 3, 3), [0, 1, 2, 3]), # 1111
# test pattern implemented by reshape # test pattern implemented by reshape
# Skip them as this test the op directly, not the optimization with reshape # Skip them as this test the op directly, not the optimization with reshape
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论