提交 0751f771,作者:lamblin

Merge pull request #1480 from nouiz/crash_gpu_reduce_1111

Crash gpu reduce 1111
...@@ -767,6 +767,10 @@ class GpuCAReduce(GpuOp): ...@@ -767,6 +767,10 @@ class GpuCAReduce(GpuOp):
pattern = ''.join(str(c) for c in self.reduce_mask) pattern = ''.join(str(c) for c in self.reduce_mask)
ndim = len(self.reduce_mask) ndim = len(self.reduce_mask)
nd_out = ndim - sum(self.reduce_mask) nd_out = ndim - sum(self.reduce_mask)
shapes_format = "shape=(%s)" % ",".join(["%d"] * node.inputs[0].ndim)
shapes_data = ",".join(["CudaNdarray_HOST_DIMS(%s)[%d]" % (x, i)
for i in range(node.inputs[0].ndim)])
print >> sio, """ print >> sio, """
if (verbose) if (verbose)
printf("running kernel_reduce_%(pattern)s_%(name)s\\n"); printf("running kernel_reduce_%(pattern)s_%(name)s\\n");
...@@ -774,11 +778,11 @@ class GpuCAReduce(GpuOp): ...@@ -774,11 +778,11 @@ class GpuCAReduce(GpuOp):
if (verbose>1) if (verbose>1)
printf("n_threads.x=%%d, n_threads.y=%%d, n_threads.z=%%d," printf("n_threads.x=%%d, n_threads.y=%%d, n_threads.z=%%d,"
" nb_threads=%%d, n_blocks.x=%%d, n_blocks.y=%%d," " nb_threads=%%d, n_blocks.x=%%d, n_blocks.y=%%d,"
" nb_block=%%d, n_shared=%%d\\n", " nb_block=%%d, n_shared=%%d, %(shapes_format)s\\n",
n_threads.x,n_threads.y,n_threads.z, n_threads.x,n_threads.y,n_threads.z,
n_threads.x*n_threads.y*n_threads.z, n_threads.x*n_threads.y*n_threads.z,
n_blocks.x,n_blocks.y, n_blocks.x,n_blocks.y,
n_blocks.x*n_blocks.y, n_shared); n_blocks.x*n_blocks.y, n_shared, %(shapes_data)s);
kernel_reduce_%(pattern)s_%(name)s<<<n_blocks, n_threads, n_shared>>>( kernel_reduce_%(pattern)s_%(name)s<<<n_blocks, n_threads, n_shared>>>(
""" % locals() """ % locals()
for i in xrange(ndim): for i in xrange(ndim):
...@@ -800,9 +804,6 @@ class GpuCAReduce(GpuOp): ...@@ -800,9 +804,6 @@ class GpuCAReduce(GpuOp):
,CudaNdarray_HOST_STRIDES(%(z)s)[%(i)s] ,CudaNdarray_HOST_STRIDES(%(z)s)[%(i)s]
""" % locals() """ % locals()
shapes_format = "shape=(%s)" % ",".join(["%d"] * node.inputs[0].ndim)
shapes_data = ",".join(["CudaNdarray_HOST_DIMS(%s)[%d]" % (x, i)
for i in range(node.inputs[0].ndim)])
print >> sio, """ print >> sio, """
); );
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
...@@ -1571,6 +1572,9 @@ class GpuCAReduce(GpuOp): ...@@ -1571,6 +1572,9 @@ class GpuCAReduce(GpuOp):
} }
n_threads.z -= 1; n_threads.z -= 1;
//Maximum for Fermi GPU on that dimensions.
n_threads.z = std::min(n_threads.z, (unsigned)64);
dim3 n_blocks(1,1,1); dim3 n_blocks(1,1,1);
%(makecall)s %(makecall)s
} }
...@@ -1601,7 +1605,7 @@ class GpuCAReduce(GpuOp): ...@@ -1601,7 +1605,7 @@ class GpuCAReduce(GpuOp):
""" % locals() """ % locals()
def c_code_cache_version_apply(self, node): def c_code_cache_version_apply(self, node):
version = [7] # the version corresponding to the c code in this Op version = [8] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend... # now we insert versions for the ops on which we depend...
scalar_node = Apply(self.scalar_op, scalar_node = Apply(self.scalar_op,
......
...@@ -114,7 +114,7 @@ def test_careduce(): ...@@ -114,7 +114,7 @@ def test_careduce():
((4100,4,3,2),[1,3]),((4,4100,3,2),[1,3]),((4,3,4100,2),[1,3]),((4,3,2,4100),[1,3]),#0101 ((4100,4,3,2),[1,3]),((4,4100,3,2),[1,3]),((4,3,4100,2),[1,3]),((4,3,2,4100),[1,3]),#0101
((4100,4,3,2),[0,2,3]),((4,4100,3,2),[0,2,3]),((4,3,4100,2),[0,2,3]),#((4,3,2,4100),[0,2,3]),#1011 ((4100,4,3,2),[0,2,3]),((4,4100,3,2),[0,2,3]),((4,3,4100,2),[0,2,3]),#((4,3,2,4100),[0,2,3]),#1011
((4100,4,3,2),[1,2,3]),((4,4100,3,2),[1,2,3]),((4,3,4100,2),[1,2,3]),((4,3,2,4100),[1,2,3]),#0111 ((4100,4,3,2),[1,2,3]),((4,4100,3,2),[1,2,3]),((4,3,4100,2),[1,2,3]),((4,3,2,4100),[1,2,3]),#0111
((4100,2,3,4),[0,1,2,3]),((2,4100,3,4),[0,1,2,3]),((2,3,4100,4),[0,1,2,3]),((2,3,4,4100),[0,1,2,3]),#1111 ((4100,2,3,4),[0,1,2,3]),((2,4100,3,4),[0,1,2,3]),((2,3,4100,4),[0,1,2,3]),((2,3,4,4100),[0,1,2,3]),((128,1,3,3), [0,1,2,3]),#1111
#test pattern implemented by reshape #test pattern implemented by reshape
...@@ -197,7 +197,9 @@ def test_careduce(): ...@@ -197,7 +197,9 @@ def test_careduce():
for shape, pattern in [#((5,),[0]), for shape, pattern in [#((5,),[0]),
((5,4),[0,1]),((5,4),[0]), ((5,4),[0,1]),((5,4),[0]),
((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[0,1,2]), ((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[0,1,2]),
((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]: ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]),
((128,1,3,3),[0,1,2,3]),
]:
op = careduce_op(scalar_op, axis=pattern) op = careduce_op(scalar_op, axis=pattern)
pat = tensor_pattern_to_gpu_pattern(shape, pattern) pat = tensor_pattern_to_gpu_pattern(shape, pattern)
...@@ -229,7 +231,9 @@ def test_careduce(): ...@@ -229,7 +231,9 @@ def test_careduce():
((5,4),[0,1]),((5,4),[0]), ((5,4),[0,1]),((5,4),[0]),
((5,4,3),[0]),((5,4,3),[0,1]), ((5,4,3),[0]),((5,4,3),[0,1]),
((5,4,3),[2]),((5,4,3),[0,1,2]), ((5,4,3),[2]),((5,4,3),[0,1,2]),
((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]: ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]),
((128,1,3,3),[0,1,2,3]),
]:
op = careduce_op(scalar_op, axis=pattern) op = careduce_op(scalar_op, axis=pattern)
pat = tensor_pattern_to_gpu_pattern(shape, pattern) pat = tensor_pattern_to_gpu_pattern(shape, pattern)
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论