提交 b7e9de0a，作者：lamblin

Merge pull request #1143 from nouiz/gpu_sum_crash_fix

Gpu sum crash fix
...@@ -782,6 +782,10 @@ class GpuCAReduce(GpuOp): ...@@ -782,6 +782,10 @@ class GpuCAReduce(GpuOp):
print >> sio, """ print >> sio, """
,CudaNdarray_HOST_STRIDES(%(z)s)[%(i)s] ,CudaNdarray_HOST_STRIDES(%(z)s)[%(i)s]
""" % locals() """ % locals()
shapes_format = "shape=(%s)" % ",".join(["%d"] * node.inputs[0].ndim)
shapes_data = ",".join(["CudaNdarray_HOST_DIMS(%(x)s)[%(i)s]" % locals()
for i in range(node.inputs[0].ndim)])
print >> sio, """ print >> sio, """
); );
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
...@@ -790,14 +794,16 @@ class GpuCAReduce(GpuOp): ...@@ -790,14 +794,16 @@ class GpuCAReduce(GpuOp):
{ {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s." "Cuda error: %%s: %%s."
" (grid: %%i x %%i; block: %%i x %%i x %%i)\\n", " (grid: %%i x %%i; block: %%i x %%i x %%i)"
" %(shapes_format)s \\n",
"kernel_reduce_%(pattern)s_%(name)s", "kernel_reduce_%(pattern)s_%(name)s",
cudaGetErrorString(sts), cudaGetErrorString(sts),
n_blocks.x, n_blocks.x,
n_blocks.y, n_blocks.y,
n_threads.x, n_threads.x,
n_threads.y, n_threads.y,
n_threads.z); n_threads.z,
%(shapes_data)s);
%(fail)s; %(fail)s;
} }
""" % locals() """ % locals()
...@@ -1382,7 +1388,7 @@ class GpuCAReduce(GpuOp): ...@@ -1382,7 +1388,7 @@ class GpuCAReduce(GpuOp):
dim3 n_threads( dim3 n_threads(
std::min(CudaNdarray_HOST_DIMS(%(x)s)[0], std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],
NUM_VECTOR_OP_THREADS_PER_BLOCK)); NUM_VECTOR_OP_THREADS_PER_BLOCK));
dim3 n_blocks(CudaNdarray_HOST_DIMS(%(x)s)[1]); dim3 n_blocks(std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], NUM_VECTOR_OP_BLOCKS));
while (n_blocks.x * (n_blocks.y+1) <= NUM_VECTOR_OP_BLOCKS && n_blocks.y <= CudaNdarray_HOST_DIMS(%(x)s)[2]) while (n_blocks.x * (n_blocks.y+1) <= NUM_VECTOR_OP_BLOCKS && n_blocks.y <= CudaNdarray_HOST_DIMS(%(x)s)[2])
{ {
n_blocks.y += 1; n_blocks.y += 1;
...@@ -1559,7 +1565,7 @@ class GpuCAReduce(GpuOp): ...@@ -1559,7 +1565,7 @@ class GpuCAReduce(GpuOp):
""" % locals() """ % locals()
def c_code_cache_version_apply(self, node): def c_code_cache_version_apply(self, node):
version = [6] # the version corresponding to the c code in this Op version = [7] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend... # now we insert versions for the ops on which we depend...
scalar_node = Apply(self.scalar_op, scalar_node = Apply(self.scalar_op,
......
...@@ -95,7 +95,7 @@ def test_careduce(): ...@@ -95,7 +95,7 @@ def test_careduce():
((1024,33),[0,1]),((33,1024),[0,1]),#01 ((1024,33),[0,1]),((33,1024),[0,1]),#01
((1025,33),[0,1]),((33,1025),[0,1]),#01 ((1025,33),[0,1]),((33,1025),[0,1]),#01
((4100,4,3),[0]),((5,4100,3),[0]),((5,4,4100),[0]),#100 ((4100,4,3),[0]),((5,4100,3),[0]),((5,4,4100),[0]), ((3,65536,1), [0]),#100
((4100,4,3),[1]),((5,4100,3),[1]),((5,4,4100),[1]),#010 ((4100,4,3),[1]),((5,4100,3),[1]),((5,4,4100),[1]),#010
((4100,4,3),[2]),((5,4100,3),[2]),((5,4,4100),[2]),#001 ((4100,4,3),[2]),((5,4100,3),[2]),((5,4,4100),[2]),#001
((4100,4,3),[0,1]),((5,4100,3),[0,1]),((5,4,4100),[0,1]),#110 ((4100,4,3),[0,1]),((5,4100,3),[0,1]),((5,4,4100),[0,1]),#110
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论