提交 85ee3d3e authored 作者: Ian Goodfellow's avatar Ian Goodfellow

removed 'sum' from name of reduce kernels

上级 1ca702ef
...@@ -687,9 +687,9 @@ class GpuCAReduce(GpuOp): ...@@ -687,9 +687,9 @@ class GpuCAReduce(GpuOp):
.. code-block:: c .. code-block:: c
if (verbose) if (verbose)
printf("running kernel_reduce_sum_10_%(name)s\\n"); printf("running kernel_reduce_10_%(name)s\\n");
int n_shared = sizeof(float) * n_threads.x; int n_shared = sizeof(float) * n_threads.x;
kernel_reduce_sum_10_%(name)s<<<n_blocks, n_threads, kernel_reduce_10_%(name)s<<<n_blocks, n_threads,
n_shared>>>( n_shared>>>(
CudaNdarray_HOST_DIMS(%(x)s)[0], CudaNdarray_HOST_DIMS(%(x)s)[0],
CudaNdarray_HOST_DIMS(%(x)s)[1], CudaNdarray_HOST_DIMS(%(x)s)[1],
...@@ -713,7 +713,7 @@ class GpuCAReduce(GpuOp): ...@@ -713,7 +713,7 @@ class GpuCAReduce(GpuOp):
nd_out = ndim - sum(self.reduce_mask) nd_out = ndim - sum(self.reduce_mask)
print >> sio, """ print >> sio, """
if (verbose) if (verbose)
printf("running kernel_reduce_sum_%(pattern)s_%(name)s\\n"); printf("running kernel_reduce_%(pattern)s_%(name)s\\n");
int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z; int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z;
if (verbose>1) if (verbose>1)
printf("n_threads.x=%%d, n_threads.y=%%d, n_threads.z=%%d," printf("n_threads.x=%%d, n_threads.y=%%d, n_threads.z=%%d,"
...@@ -723,7 +723,7 @@ class GpuCAReduce(GpuOp): ...@@ -723,7 +723,7 @@ class GpuCAReduce(GpuOp):
n_threads.x*n_threads.y*n_threads.z, n_threads.x*n_threads.y*n_threads.z,
n_blocks.x,n_blocks.y, n_blocks.x,n_blocks.y,
n_blocks.x*n_blocks.y, n_shared); n_blocks.x*n_blocks.y, n_shared);
kernel_reduce_sum_%(pattern)s_%(name)s<<<n_blocks, n_threads, n_shared>>>( kernel_reduce_%(pattern)s_%(name)s<<<n_blocks, n_threads, n_shared>>>(
""" % locals() """ % locals()
for i in xrange(ndim): for i in xrange(ndim):
print >> sio, """ print >> sio, """
...@@ -752,7 +752,7 @@ class GpuCAReduce(GpuOp): ...@@ -752,7 +752,7 @@ class GpuCAReduce(GpuOp):
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s." "Cuda error: %%s: %%s."
" (grid: %%i x %%i; block: %%i x %%i x %%i)\\n", " (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_sum_%(pattern)s_%(name)s", "kernel_reduce_%(pattern)s_%(name)s",
cudaGetErrorString(sts), cudaGetErrorString(sts),
n_blocks.x, n_blocks.x,
n_blocks.y, n_blocks.y,
...@@ -769,9 +769,11 @@ class GpuCAReduce(GpuOp): ...@@ -769,9 +769,11 @@ class GpuCAReduce(GpuOp):
self._op_guard() self._op_guard()
"""Return a string to declare a kernel function """Return a string to declare a kernel function
The result will look something like this:
.. code-block:: c .. code-block:: c
static __global__ void kernel_reduce_sum_110_%(nodename)s( static __global__ void kernel_reduce_110_%(nodename)s(
const int d0, const int d0,
const int d1, const int d1,
const int d2, const int d2,
...@@ -792,7 +794,7 @@ class GpuCAReduce(GpuOp): ...@@ -792,7 +794,7 @@ class GpuCAReduce(GpuOp):
sio = StringIO.StringIO() sio = StringIO.StringIO()
print >> sio, """ print >> sio, """
static __global__ void kernel_reduce_sum_%(pattern)s_%(nodename)s( static __global__ void kernel_reduce_%(pattern)s_%(nodename)s(
""" % locals() """ % locals()
for i in xrange(ndim): for i in xrange(ndim):
print >> sio, """ print >> sio, """
...@@ -965,11 +967,11 @@ class GpuCAReduce(GpuOp): ...@@ -965,11 +967,11 @@ class GpuCAReduce(GpuOp):
std::min(CudaNdarray_SIZE(%(x)s), std::min(CudaNdarray_SIZE(%(x)s),
NUM_VECTOR_OP_THREADS_PER_BLOCK)); NUM_VECTOR_OP_THREADS_PER_BLOCK));
dim3 n_blocks(1); dim3 n_blocks(1);
if (verbose) printf("running kernel_reduce_sum_ccontig_%(name)s" if (verbose) printf("running kernel_reduce_ccontig_%(name)s"
" n_threads.x=%%d, size=%%d, ndim=%%d\\n", " n_threads.x=%%d, size=%%d, ndim=%%d\\n",
n_threads.x,CudaNdarray_SIZE(%(x)s),%(x)s->nd); n_threads.x,CudaNdarray_SIZE(%(x)s),%(x)s->nd);
int n_shared = sizeof(float) * n_threads.x; int n_shared = sizeof(float) * n_threads.x;
kernel_reduce_sum_ccontig_%(name)s<<<n_blocks, n_threads, n_shared>>>( kernel_reduce_ccontig_%(name)s<<<n_blocks, n_threads, n_shared>>>(
CudaNdarray_SIZE(%(x)s), CudaNdarray_SIZE(%(x)s),
CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_DEV_DATA(%(z)s)); CudaNdarray_DEV_DATA(%(z)s));
...@@ -980,7 +982,7 @@ class GpuCAReduce(GpuOp): ...@@ -980,7 +982,7 @@ class GpuCAReduce(GpuOp):
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s." "Cuda error: %%s: %%s."
" (grid: %%i x %%i; block: %%i x %%i x %%i)\\n", " (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_sum_ccontig_%(name)s", "kernel_reduce_ccontig_%(name)s",
cudaGetErrorString(sts), cudaGetErrorString(sts),
n_blocks.x, n_blocks.x,
n_blocks.y, n_blocks.y,
...@@ -1031,9 +1033,9 @@ class GpuCAReduce(GpuOp): ...@@ -1031,9 +1033,9 @@ class GpuCAReduce(GpuOp):
:param N: the number of 1 in the pattern N=1 -> 01, N=2 -> 011 N=3 ->0111 :param N: the number of 1 in the pattern N=1 -> 01, N=2 -> 011 N=3 ->0111
Work for N=1,2,3 Work for N=1,2,3
""" """
self._op_guard()
assert N in [1, 2, 3] assert N in [1, 2, 3]
makecall = self._makecall(node, name, x, z, fail) makecall = self._makecall(node, name, x, z, fail)
self._op_guard()
N_pattern = ''.join(['1'] * N) N_pattern = ''.join(['1'] * N)
param_dim = ",".join(["CudaNdarray_HOST_DIMS(%(x)s)[%(i)s]" % locals() param_dim = ",".join(["CudaNdarray_HOST_DIMS(%(x)s)[%(i)s]" % locals()
for i in xrange(N + 1)]) for i in xrange(N + 1)])
...@@ -1100,13 +1102,13 @@ class GpuCAReduce(GpuOp): ...@@ -1100,13 +1102,13 @@ class GpuCAReduce(GpuOp):
NUM_VECTOR_OP_BLOCKS)); NUM_VECTOR_OP_BLOCKS));
if (verbose) { if (verbose) {
fprintf(stderr, fprintf(stderr,
"running kernel_reduce_sum_10_%(name)s n_blocks=(%%i,%%i)\\n", "running kernel_reduce_10_%(name)s n_blocks=(%%i,%%i)\\n",
n_blocks.x, n_blocks.x,
n_blocks.y); n_blocks.y);
} }
assert( CudaNdarray_HOST_DIMS(%(x)s)[1] == CudaNdarray_HOST_DIMS(%(z)s)[0]); assert( CudaNdarray_HOST_DIMS(%(x)s)[1] == CudaNdarray_HOST_DIMS(%(z)s)[0]);
int n_shared = sizeof(float) * n_threads.x; int n_shared = sizeof(float) * n_threads.x;
kernel_reduce_sum_010_%(name)s<<<n_blocks, n_threads, n_shared>>>( kernel_reduce_010_%(name)s<<<n_blocks, n_threads, n_shared>>>(
1, 1,
CudaNdarray_HOST_DIMS(%(x)s)[0], CudaNdarray_HOST_DIMS(%(x)s)[0],
CudaNdarray_HOST_DIMS(%(x)s)[1], CudaNdarray_HOST_DIMS(%(x)s)[1],
...@@ -1125,7 +1127,7 @@ class GpuCAReduce(GpuOp): ...@@ -1125,7 +1127,7 @@ class GpuCAReduce(GpuOp):
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s." "Cuda error: %%s: %%s."
" (grid: %%i x %%i; block: %%i x %%i x %%i)\\n", " (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_sum_010_%(name)s", "kernel_reduce_010_%(name)s",
cudaGetErrorString(sts), cudaGetErrorString(sts),
n_blocks.x, n_blocks.x,
n_blocks.y, n_blocks.y,
...@@ -1177,7 +1179,7 @@ class GpuCAReduce(GpuOp): ...@@ -1177,7 +1179,7 @@ class GpuCAReduce(GpuOp):
if (n_blocks.x > NUM_VECTOR_OP_BLOCKS) n_blocks.x = NUM_VECTOR_OP_BLOCKS; if (n_blocks.x > NUM_VECTOR_OP_BLOCKS) n_blocks.x = NUM_VECTOR_OP_BLOCKS;
if (n_blocks.x*n_blocks.y > NUM_VECTOR_OP_BLOCKS) n_blocks.y = NUM_VECTOR_OP_BLOCKS/n_blocks.x; if (n_blocks.x*n_blocks.y > NUM_VECTOR_OP_BLOCKS) n_blocks.y = NUM_VECTOR_OP_BLOCKS/n_blocks.x;
int n_shared = 0; int n_shared = 0;
kernel_reduce_sum_010_AD_%(name)s<<<n_blocks, n_threads, n_shared>>>( kernel_reduce_010_AD_%(name)s<<<n_blocks, n_threads, n_shared>>>(
A,B,C,D, A,B,C,D,
CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_HOST_STRIDES(%(x)s)[0], CudaNdarray_HOST_STRIDES(%(x)s)[0],
...@@ -1194,7 +1196,7 @@ class GpuCAReduce(GpuOp): ...@@ -1194,7 +1196,7 @@ class GpuCAReduce(GpuOp):
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s." "Cuda error: %%s: %%s."
" (grid: %%i x %%i; block: %%i x %%i x %%i)\\n", " (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_sum_010_%(name)s", "kernel_reduce_010_%(name)s",
cudaGetErrorString(sts), cudaGetErrorString(sts),
n_blocks.x, n_blocks.x,
n_blocks.y, n_blocks.y,
...@@ -1247,7 +1249,7 @@ class GpuCAReduce(GpuOp): ...@@ -1247,7 +1249,7 @@ class GpuCAReduce(GpuOp):
if (cudaSuccess != sts) if (cudaSuccess != sts)
{ {
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i)\\n", PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_sum_%(pattern)s_%(name)s", "kernel_reduce_%(pattern)s_%(name)s",
cudaGetErrorString(sts), cudaGetErrorString(sts),
n_blocks.x, n_blocks.x,
n_blocks.y, n_blocks.y,
...@@ -1487,7 +1489,7 @@ class GpuCAReduce(GpuOp): ...@@ -1487,7 +1489,7 @@ class GpuCAReduce(GpuOp):
# it only runs on ONE multiprocessor # it only runs on ONE multiprocessor
reducebuf = self._k_reduce_buf('Z[0]') reducebuf = self._k_reduce_buf('Z[0]')
print >> sio, """ print >> sio, """
static __global__ void kernel_reduce_sum_ccontig_%(nodename)s( static __global__ void kernel_reduce_ccontig_%(nodename)s(
const unsigned int d0, const unsigned int d0,
const float *A, const float *A,
float * Z) float * Z)
...@@ -1514,7 +1516,7 @@ class GpuCAReduce(GpuOp): ...@@ -1514,7 +1516,7 @@ class GpuCAReduce(GpuOp):
# it only runs on ONE multiprocessor # it only runs on ONE multiprocessor
reducebuf = self._k_reduce_buf('Z[0]') reducebuf = self._k_reduce_buf('Z[0]')
print >> sio, """ print >> sio, """
static __global__ void kernel_reduce_sum_1_%(nodename)s( static __global__ void kernel_reduce_1_%(nodename)s(
const unsigned int d0, const unsigned int d0,
const float *A, const int sA0, const float *A, const int sA0,
float * Z) float * Z)
...@@ -1542,7 +1544,7 @@ class GpuCAReduce(GpuOp): ...@@ -1542,7 +1544,7 @@ class GpuCAReduce(GpuOp):
# it only runs on ONE multiprocessor # it only runs on ONE multiprocessor
reducebuf = self._k_reduce_buf('Z[0]') reducebuf = self._k_reduce_buf('Z[0]')
print >> sio, """ print >> sio, """
static __global__ void kernel_reduce_sum_11_%(nodename)s( static __global__ void kernel_reduce_11_%(nodename)s(
const int d0, const int d0,
const int d1, const int d1,
const float *A, const int sA0, const int sA1, const float *A, const int sA0, const int sA1,
...@@ -1623,7 +1625,7 @@ class GpuCAReduce(GpuOp): ...@@ -1623,7 +1625,7 @@ class GpuCAReduce(GpuOp):
# memory (a segment of a column). # memory (a segment of a column).
reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i2*sZ1]') reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i2*sZ1]')
print >> sio, """ print >> sio, """
static __global__ void kernel_reduce_sum_010_%(nodename)s( static __global__ void kernel_reduce_010_%(nodename)s(
const int d0, const int d0,
const int d1, const int d1,
const int d2, const int d2,
...@@ -1658,7 +1660,7 @@ class GpuCAReduce(GpuOp): ...@@ -1658,7 +1660,7 @@ class GpuCAReduce(GpuOp):
""" % locals() """ % locals()
if self.reduce_mask == (0, 1, 0): if self.reduce_mask == (0, 1, 0):
print >> sio, """ print >> sio, """
static __global__ void kernel_reduce_sum_010_AD_%(nodename)s( static __global__ void kernel_reduce_010_AD_%(nodename)s(
const int A, const int A,
const int B, const int B,
const int C, const int C,
...@@ -1746,7 +1748,7 @@ class GpuCAReduce(GpuOp): ...@@ -1746,7 +1748,7 @@ class GpuCAReduce(GpuOp):
# memory (a segment of a column). # memory (a segment of a column).
reducebuf = self._k_reduce_buf('Z[blockIdx.x * sZ0]') reducebuf = self._k_reduce_buf('Z[blockIdx.x * sZ0]')
print >> sio, """ print >> sio, """
static __global__ void kernel_reduce_sum_110_%(nodename)s( static __global__ void kernel_reduce_110_%(nodename)s(
const int d0, const int d0,
const int d1, const int d1,
const int d2, const int d2,
...@@ -1827,7 +1829,7 @@ class GpuCAReduce(GpuOp): ...@@ -1827,7 +1829,7 @@ class GpuCAReduce(GpuOp):
# threads per block for each element per row. # threads per block for each element per row.
reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i1 * sZ1]') reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i1 * sZ1]')
print >> sio, """ print >> sio, """
static __global__ void kernel_reduce_sum_001_%(nodename)s( static __global__ void kernel_reduce_001_%(nodename)s(
const int d0, const int d0,
const int d1, const int d1,
const int d2, const int d2,
...@@ -1940,7 +1942,7 @@ class GpuCAReduce(GpuOp): ...@@ -1940,7 +1942,7 @@ class GpuCAReduce(GpuOp):
if self.reduce_mask == (1, 0, 1, 1): if self.reduce_mask == (1, 0, 1, 1):
reducebuf = self._k_reduce_buf('Z[blockIdx.x*sZ0]') reducebuf = self._k_reduce_buf('Z[blockIdx.x*sZ0]')
print >> sio, """ print >> sio, """
static __global__ void kernel_reduce_sum_1011_%(nodename)s( static __global__ void kernel_reduce_1011_%(nodename)s(
const unsigned int d0, const unsigned int d0,
const unsigned int d1, const unsigned int d1,
const unsigned int d2, const unsigned int d2,
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论