提交 85faa716，作者：Frederic

Port sync

上级 c908977c
...@@ -793,7 +793,10 @@ class GpuCAReduce(GpuOp): ...@@ -793,7 +793,10 @@ class GpuCAReduce(GpuOp):
(float *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset), (float *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0]/4 PyGpuArray_STRIDES(%(z)s)[0]/4
); );
CNDA_THREAD_SYNC; [
if config.gpuarray.sync:
code += "GpuArray_sync(&%(zz)s->ga);\n" % dict(zz=zz)
]
if (cudaSuccess != cudaGetLastError()) if (cudaSuccess != cudaGetLastError())
{ {
PyErr_Format(PyExc_RuntimeError, "Cuda error: ... ); PyErr_Format(PyExc_RuntimeError, "Cuda error: ... );
...@@ -841,10 +844,12 @@ class GpuCAReduce(GpuOp): ...@@ -841,10 +844,12 @@ class GpuCAReduce(GpuOp):
print >> sio, """ print >> sio, """
,PyGpuArray_STRIDES(%(z)s)[%(i)s]/4 ,PyGpuArray_STRIDES(%(z)s)[%(i)s]/4
""" % locals() """ % locals()
sync = ""
if config.gpuarray.sync:
sync = """GpuArray_sync(&%(z)s->ga);""" % locals()
print >> sio, """ print >> sio, """
); );
CNDA_THREAD_SYNC; %(sync)s
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts) if (cudaSuccess != sts)
{ {
...@@ -1273,6 +1278,9 @@ class GpuCAReduce(GpuOp): ...@@ -1273,6 +1278,9 @@ class GpuCAReduce(GpuOp):
self.c_code_reduce_01X(sio, node, name, x, z, fail, 3) self.c_code_reduce_01X(sio, node, name, x, z, fail, 3)
def c_code_reduce_10(self, sio, node, name, x, z, fail): def c_code_reduce_10(self, sio, node, name, x, z, fail):
sync = ""
if config.gpuarray.sync:
sync = """GpuArray_sync(&%(z)s->ga);""" % locals()
print >> sio, """ print >> sio, """
{ {
int verbose = 0; int verbose = 0;
...@@ -1302,7 +1310,7 @@ class GpuCAReduce(GpuOp): ...@@ -1302,7 +1310,7 @@ class GpuCAReduce(GpuOp):
1, 1,
PyGpuArray_STRIDES(%(z)s)[0]/4 PyGpuArray_STRIDES(%(z)s)[0]/4
); );
CNDA_THREAD_SYNC; %(sync)s
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts) if (cudaSuccess != sts)
{ {
...@@ -1326,6 +1334,9 @@ class GpuCAReduce(GpuOp): ...@@ -1326,6 +1334,9 @@ class GpuCAReduce(GpuOp):
makecall_inner = self._makecall(node, name, x, z, fail, makecall_inner = self._makecall(node, name, x, z, fail,
pattern="010_inner") pattern="010_inner")
pattern = ''.join(str(i) for i in self.reduce_mask) pattern = ''.join(str(i) for i in self.reduce_mask)
sync = ""
if config.gpuarray.sync:
sync = """GpuArray_sync(&%(z)s->ga);""" % locals()
print >> sio, """ print >> sio, """
{ {
//int n_summations = PyGpuArray_DIMS(%(x)s)[0] * PyGpuArray_DIMS(%(x)s)[2]; //int n_summations = PyGpuArray_DIMS(%(x)s)[0] * PyGpuArray_DIMS(%(x)s)[2];
...@@ -1370,7 +1381,7 @@ class GpuCAReduce(GpuOp): ...@@ -1370,7 +1381,7 @@ class GpuCAReduce(GpuOp):
PyGpuArray_STRIDES(%(z)s)[0]/4, PyGpuArray_STRIDES(%(z)s)[0]/4,
PyGpuArray_STRIDES(%(z)s)[1]/4 PyGpuArray_STRIDES(%(z)s)[1]/4
); );
CNDA_THREAD_SYNC; %(sync)s
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts) if (cudaSuccess != sts)
{ {
...@@ -1425,7 +1436,7 @@ class GpuCAReduce(GpuOp): ...@@ -1425,7 +1436,7 @@ class GpuCAReduce(GpuOp):
); );
%(makecall)s %(makecall)s
} }
CNDA_THREAD_SYNC; %(sync)s
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts) if (cudaSuccess != sts)
{ {
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论