Commit 3943db3c authored by Frederic

Make GpuCAReduce support more dtypes, but keep the acc_dtype the same as the input/output dtype.

Parent 26d91309
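
For reference, the whole change follows one template pattern: each code-generating method now derives a C type name from the output dtype (`dtype = "npy_" + node.outputs[0].dtype`) and substitutes it through `%(dtype)s` wherever the generated CUDA source previously hard-coded `float`. Below is a minimal, self-contained sketch of that pattern; the helper name and its arguments are illustrative, not Theano's API.

```python
# Hypothetical sketch of the substitution pattern used throughout this diff;
# render_ccontig_kernel and its arguments are made up for illustration.
def render_ccontig_kernel(nodename, out_dtype):
    # "float32" -> "npy_float32"; the generated CUDA source relies on numpy's
    # npy_* typedefs, so the same template serves float32, float64, etc.
    dtype = "npy_" + out_dtype
    return """
        static __global__ void kernel_reduce_ccontig_%(nodename)s(
            const unsigned int d0,
            const %(dtype)s *A,
            %(dtype)s *Z)
        {
            extern __shared__ %(dtype)s buf[];
            // reduction body elided
        }
    """ % locals()

print(render_ccontig_kernel("example_node", "float64"))
```

Because the accumulation dtype matches the input/output dtype, the shared-memory buffer and the `myresult` accumulator inside each kernel are declared with the same `%(dtype)s` as the data pointers.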
@@ -607,7 +607,6 @@ class GpuCAReduceCuda(HideC, CAReduce):
def make_node(self, x):
x = as_gpuarray_variable(x)
assert x.dtype == "float32"
ret = super(GpuCAReduceCuda, self).make_node(x)
self = copy.copy(self)
self.axis = ret.op.axis
@@ -693,7 +692,8 @@ class GpuCAReduceCuda(HideC, CAReduce):
nd_in = node.inputs[0].type.ndim
nd_out = node.outputs[0].type.ndim
dtype = "npy_" + node.outputs[0].dtype
assert node.inputs[0].dtype == node.outputs[0].dtype
assert nd_in - nd_out == sum(self.reduce_mask)
sio = StringIO()
@@ -775,7 +775,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
# \begin bracket the reduction in a check that there is
# actually work to do
if getattr(self.scalar_op, 'identity', None) == 0:
zero_shp = "cudaMemset((float *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset), 0, PyGpuArray_SIZE(%(z)s) * sizeof(float))" % locals()
zero_shp = "cudaMemset((%(dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset), 0, PyGpuArray_SIZE(%(z)s) * sizeof(%(dtype)s))" % locals()
#TODO: elif getattr(self.scalar_op, 'identity', None) == 1:
else:
scalar_op = self.scalar_op
@@ -827,16 +827,16 @@ class GpuCAReduceCuda(HideC, CAReduce):
if (verbose)
printf("running kernel_reduce_10_%(name)s\\n");
int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z;
int n_shared = sizeof(%(dtype)s) * n_threads.x * n_threads.y * n_threads.z;
kernel_reduce_10_%(name)s<<<n_blocks, n_threads,
n_shared>>>(
PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1],
(float *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0]/4,
PyGpuArray_STRIDES(%(x)s)[1]/4,
(float *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0]/4
(%(dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(dtype)s),
PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(dtype)s),
(%(dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(dtype)s)
);
[
if config.gpuarray.sync:
@@ -848,6 +848,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
%(fail)s;
}
"""
dtype = "npy_" + node.outputs[0].dtype
sio = StringIO()
if pattern is None:
pattern = ''.join(str(c) for c in self.reduce_mask)
@@ -860,7 +861,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
print >> sio, """
if (verbose)
printf("running kernel_reduce_%(pattern)s_%(name)s\\n");
int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z;
int n_shared = sizeof(%(dtype)s) * n_threads.x * n_threads.y * n_threads.z;
if (verbose>1)
printf("n_threads.x=%%d, n_threads.y=%%d, n_threads.z=%%d,"
" nb_threads=%%d, n_blocks.x=%%d, n_blocks.y=%%d,"
@@ -876,18 +877,18 @@ class GpuCAReduceCuda(HideC, CAReduce):
PyGpuArray_DIMS(%(x)s)[%(i)s],
""" % locals()
print >> sio, """
(float *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset)
(%(dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset)
""" % locals()
for i in xrange(ndim):
print >> sio, """
,PyGpuArray_STRIDES(%(x)s)[%(i)s]/4
,PyGpuArray_STRIDES(%(x)s)[%(i)s]/sizeof(%(dtype)s)
""" % locals()
print >> sio, """
,(float *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset)
,(%(dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset)
""" % locals()
for i in xrange(nd_out):
print >> sio, """
,PyGpuArray_STRIDES(%(z)s)[%(i)s]/4
,PyGpuArray_STRIDES(%(z)s)[%(i)s]/sizeof(%(dtype)s)
""" % locals()
sync = ""
if config.gpuarray.sync:
@@ -927,17 +928,18 @@ class GpuCAReduceCuda(HideC, CAReduce):
const int d0,
const int d1,
const int d2,
const float *A,
const %(dtype)s *A,
const int sA0,
const int sA1,
const int sA2,
float * Z,
%(dtype)s * Z,
const int sZ0)
Since the nodename is unique, we don't need to put the name
of the scalar_op in here.
"""
dtype = "npy_" + node.outputs[0].dtype
if reduce_mask is None:
reduce_mask = self.reduce_mask
if ndim is None:
@@ -954,14 +956,14 @@ class GpuCAReduceCuda(HideC, CAReduce):
const int d%(i)s,
""" % locals()
print >> sio, """
const float *A,
const %(dtype)s *A,
""" % locals()
for i in xrange(ndim):
print >> sio, """
const int sA%(i)s,
""" % locals()
print >> sio, """
float * Z
%(dtype)s * Z
""" % locals()
for i in xrange(ndim - sum(reduce_mask)):
print >> sio, """
@@ -970,13 +972,14 @@ class GpuCAReduceCuda(HideC, CAReduce):
print >> sio, ")"
return sio.getvalue()
def _k_init(self, *args):
def _k_init(self, node, nodename):
dtype = "npy_" + node.outputs[0].dtype
return """
const int threadCount = blockDim.x * blockDim.y * blockDim.z;
const int threadNum = threadIdx.z * blockDim.x * blockDim.y
+ threadIdx.y * blockDim.x + threadIdx.x;
extern __shared__ float buf[];
float myresult = 0.0f;
extern __shared__ %(dtype)s buf[];
%(dtype)s myresult = 0.0f;
//This is caught in cuda/init.py when we init the gpu. I keep
//it here to ease finding code that rely on this.
@@ -986,7 +989,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
return;
}
"""
""" % locals()
def _assign_init(self, first_item):
"""
@@ -1037,6 +1040,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
node, name, sub: these should be passed through from the original
call to c_code
"""
dtype = "npy_" + node.outputs[0].dtype
# This code (the code in new_version) is currently ignored.
# Code produced later in this function is returned instead.
@@ -1068,7 +1072,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
if (threadNum < halfPoint)
{
// Get the shared value stored by another thread
float temp = buf[threadNum + halfPoint];
%(dtype)s temp = buf[threadNum + halfPoint];
"""
new_version += self._assign_reduce(node, name,
@@ -1175,8 +1179,9 @@ class GpuCAReduceCuda(HideC, CAReduce):
is for the case where we are reducing on all axes and x is
C contiguous.
"""
dtype = "npy_" + node.outputs[0].dtype
if getattr(self.scalar_op, 'identity', None) == 0:
zero_shp = "cudaMemset((float *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset), 0, PyGpuArray_SIZE(%(z)s) * sizeof(float))" % locals()
zero_shp = "cudaMemset((%(dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset), 0, PyGpuArray_SIZE(%(z)s) * sizeof(%(dtype)s))" % locals()
#TODO: elif getattr(self.scalar_op, 'identity', None) == 1:
else:
zero_shp = """
@@ -1185,6 +1190,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
%(fail)s;
""" % locals()
dtype = "npy_" + node.outputs[0].dtype
sync = ""
if config.gpuarray.sync:
sync = """GpuArray_sync(&%(z)s->ga);""" % locals()
@@ -1202,11 +1208,11 @@ class GpuCAReduceCuda(HideC, CAReduce):
" n_threads.x=%%d, size=%%d, ndim=%%d\\n",
n_threads.x,PyGpuArray_SIZE(%(x)s),
PyGpuArray_NDIM(%(x)s));
int n_shared = sizeof(float) * n_threads.x;
int n_shared = sizeof(%(dtype)s) * n_threads.x;
kernel_reduce_ccontig_%(name)s<<<n_blocks, n_threads, n_shared>>>(
PyGpuArray_SIZE(%(x)s),
(float *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
(float *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset));
(%(dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
(%(dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset));
%(sync)s
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
@@ -1265,12 +1271,13 @@ class GpuCAReduceCuda(HideC, CAReduce):
"""
assert N in [1, 2, 3]
dtype = "npy_" + node.outputs[0].dtype
makecall = self._makecall(node, name, x, z, fail)
N_pattern = ''.join(['1'] * N)
param_dim = ",".join(["PyGpuArray_DIMS(%s)[%d]" % (x, i)
for i in xrange(N + 1)])
strides_dim = ",".join(["PyGpuArray_STRIDES(%s)[%d]/4"
% (x, i) for i in xrange(N + 1)])
strides_dim = ",".join(["PyGpuArray_STRIDES(%s)[%d]/sizeof(%s)"
% (x, i, dtype) for i in xrange(N + 1)])
threads_y = """
//get as many y threads as we can fit
@@ -1326,6 +1333,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
self.c_code_reduce_01X(sio, node, name, x, z, fail, 3)
def c_code_reduce_10(self, sio, node, name, x, z, fail):
dtype = "npy_" + node.outputs[0].dtype
sync = ""
if config.gpuarray.sync:
sync = """GpuArray_sync(&%(z)s->ga);""" % locals()
@@ -1345,18 +1353,18 @@ class GpuCAReduceCuda(HideC, CAReduce):
n_blocks.y);
}
assert( PyGpuArray_DIMS(%(x)s)[1] == PyGpuArray_DIMS(%(z)s)[0]);
int n_shared = sizeof(float) * n_threads.x;
int n_shared = sizeof(%(dtype)s) * n_threads.x;
kernel_reduce_010_%(name)s<<<n_blocks, n_threads, n_shared>>>(
1,
PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1],
(float *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
(%(dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
1,
PyGpuArray_STRIDES(%(x)s)[0]/4,
PyGpuArray_STRIDES(%(x)s)[1]/4,
(float *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(dtype)s),
PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(dtype)s),
(%(dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset),
1,
PyGpuArray_STRIDES(%(z)s)[0]/4
PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(dtype)s)
);
%(sync)s
cudaError_t sts = cudaGetLastError();
@@ -1382,6 +1390,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
makecall_inner = self._makecall(node, name, x, z, fail,
pattern="010_inner")
pattern = ''.join(str(i) for i in self.reduce_mask)
dtype = "npy_" + node.outputs[0].dtype
sync = ""
if config.gpuarray.sync:
sync = """GpuArray_sync(&%(z)s->ga);""" % locals()
@@ -1421,13 +1430,13 @@ class GpuCAReduceCuda(HideC, CAReduce):
int n_shared = 0;
kernel_reduce_010_AD_%(name)s<<<n_blocks, n_threads, n_shared>>>(
A,B,C,D,
(float *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0]/4,
PyGpuArray_STRIDES(%(x)s)[1]/4,
PyGpuArray_STRIDES(%(x)s)[2]/4,
(float *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0]/4,
PyGpuArray_STRIDES(%(z)s)[1]/4
(%(dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(dtype)s),
PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(dtype)s),
PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(dtype)s),
(%(dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset),
PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(dtype)s),
PyGpuArray_STRIDES(%(z)s)[1]/sizeof(%(dtype)s)
);
%(sync)s
cudaError_t sts = cudaGetLastError();
@@ -1464,10 +1473,10 @@ class GpuCAReduceCuda(HideC, CAReduce):
(size_t)n_threads.x),
(size_t)(4096 / n_blocks.x)
);
if(std::min(std::min(PyGpuArray_STRIDES(%(x)s)[0]/4,
PyGpuArray_STRIDES(%(x)s)[1]/4),
PyGpuArray_STRIDES(%(x)s)[2]/4)
==PyGpuArray_STRIDES(%(x)s)[2]/4
if(std::min(std::min(PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(dtype)s),
PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(dtype)s)),
PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(dtype)s))
==PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(dtype)s)
&& n_blocks.y==ceil_intdiv(PyGpuArray_DIMS(%(x)s)[2],
(size_t)n_threads.x)){
if(verbose>1)
@@ -1623,6 +1632,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
def c_code_reduce_0011(self, sio, node, name, x, z, fail):
makecall = self._makecall(node, name, x, z, fail)
dtype = "npy_" + node.outputs[0].dtype
print >> sio, """
{
int verbose = 0;
@@ -1642,7 +1652,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
(size_t) 256));
while (n_threads.x * n_threads.y <= 256
&& n_threads.y < PyGpuArray_DIMS(%(x)s)[2]
&& n_threads.x * n_threads.y * sizeof(float) <=(15*1024-200))
&& n_threads.x * n_threads.y * sizeof(%(dtype)s) <=(15*1024-200))
{
n_threads.y += 1;
}
@@ -1711,7 +1721,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
""" % locals()
def c_code_cache_version_apply(self, node):
version = [9] # the version corresponding to the c code in this Op
version = [10] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend...
scalar_node = Apply(self.scalar_op,
@@ -1728,6 +1738,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
def c_support_code_apply(self, node, nodename):
sio = StringIO()
nd_in = len(self.reduce_mask)
dtype = "npy_" + node.outputs[0].dtype
if all(i == 1 for i in self.reduce_mask):
#this kernel is ok for up to a few thousand elements, but
# it only runs on ONE multiprocessor
@@ -1739,13 +1750,13 @@ class GpuCAReduceCuda(HideC, CAReduce):
print >> sio, """
static __global__ void kernel_reduce_ccontig_%(nodename)s(
const unsigned int d0,
const float *A,
float * Z)
const %(dtype)s *A,
%(dtype)s * Z)
{
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
extern __shared__ float buf[];
float myresult = %(reduce_init)s;
extern __shared__ %(dtype)s buf[];
%(dtype)s myresult = %(reduce_init)s;
if (warpSize != 32)
{
@@ -1770,13 +1781,13 @@ class GpuCAReduceCuda(HideC, CAReduce):
print >> sio, """
static __global__ void kernel_reduce_1_%(nodename)s(
const unsigned int d0,
const float *A, const int sA0,
float * Z)
const %(dtype)s *A, const int sA0,
%(dtype)s * Z)
{
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
extern __shared__ float buf[];
float myresult = %(reduce_init)s;
extern __shared__ %(dtype)s buf[];
%(dtype)s myresult = %(reduce_init)s;
if (warpSize != 32)
{
@@ -1803,13 +1814,13 @@ class GpuCAReduceCuda(HideC, CAReduce):
static __global__ void kernel_reduce_11_%(nodename)s(
const int d0,
const int d1,
const float *A, const int sA0, const int sA1,
float * Z)
const %(dtype)s *A, const int sA0, const int sA1,
%(dtype)s * Z)
{
const int threadCount = blockDim.x * blockDim.y;
const int threadNum = threadIdx.y*blockDim.x + threadIdx.x;
extern __shared__ float buf[];
float myresult = %(reduce_init)s;
extern __shared__ %(dtype)s buf[];
%(dtype)s myresult = %(reduce_init)s;
if (warpSize != 32)
{
@@ -1915,13 +1926,13 @@ class GpuCAReduceCuda(HideC, CAReduce):
const int d0,
const int d1,
const int d2,
const float *A, const int sA0,
const %(dtype)s *A, const int sA0,
const int sA1, const int sA2,
float * Z, const int sZ0, const int sZ1)
%(dtype)s * Z, const int sZ0, const int sZ1)
{
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
extern __shared__ float buf[];
extern __shared__ %(dtype)s buf[];
if (warpSize != 32)
{
@@ -1933,7 +1944,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
{
for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
{
float myresult = %(reduce_init)s;
%(dtype)s myresult = %(reduce_init)s;
for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
{
%(reduce_fct)s;
@@ -1956,13 +1967,13 @@ class GpuCAReduceCuda(HideC, CAReduce):
const int C,
const int D,
//const int E, // THIS is 32
const float *X, const int sX0,
const %(dtype)s *X, const int sX0,
const int sX1, const int sX2,
float * Z, const int sZ0, const int sZ1)
%(dtype)s * Z, const int sZ0, const int sZ1)
{
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
float myresult = 0.0f;
%(dtype)s myresult = 0.0f;
if (warpSize != 32)
{
@@ -2050,14 +2061,14 @@ class GpuCAReduceCuda(HideC, CAReduce):
const int d0,
const int d1,
const int d2,
const float *A, const int sA0,
const %(dtype)s *A, const int sA0,
const int sA1, const int sA2,
float * Z, const int sZ0)
%(dtype)s * Z, const int sZ0)
{
const int threadCount = blockDim.x * blockDim.y;
const int threadNum = threadIdx.y * blockDim.x + threadIdx.x;
extern __shared__ float buf[];
float myresult = %(reduce_init)s;
extern __shared__ %(dtype)s buf[];
%(dtype)s myresult = %(reduce_init)s;
if (warpSize != 32)
{
@@ -2145,13 +2156,13 @@ class GpuCAReduceCuda(HideC, CAReduce):
const int d0,
const int d1,
const int d2,
const float *A, const int sA0,
const %(dtype)s *A, const int sA0,
const int sA1, const int sA2,
float * Z, const int sZ0, const int sZ1)
%(dtype)s * Z, const int sZ0, const int sZ1)
{
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
extern __shared__ float buf[];
extern __shared__ %(dtype)s buf[];
if (warpSize != 32)
{
@@ -2162,7 +2173,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
{
for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y)
{
float myresult = %(reduce_init)s;
%(dtype)s myresult = %(reduce_init)s;
for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x)
{
%(reduce_fct)s;
@@ -2192,7 +2203,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
{
for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y)
{
float myresult = %(reduce_init)s;
%(dtype)s myresult = %(reduce_init)s;
for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)
{
for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
@@ -2225,7 +2236,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
{
for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
{
float myresult = %(reduce_init)s;
%(dtype)s myresult = %(reduce_init)s;
for (int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y)
{
for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
@@ -2279,14 +2290,14 @@ class GpuCAReduceCuda(HideC, CAReduce):
const unsigned int d1,
const unsigned int d2,
const unsigned int d3,
const float *A, const int sA0, const int sA1,
const %(dtype)s *A, const int sA0, const int sA1,
const int sA2, const int sA3,
float * Z, const int sZ0)
%(dtype)s * Z, const int sZ0)
{
const int threadCount = blockDim.x * blockDim.y * blockDim.z;
const int threadNum = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
extern __shared__ float buf[];
float myresult = %(reduce_init)s;
extern __shared__ %(dtype)s buf[];
%(dtype)s myresult = %(reduce_init)s;
if (warpSize != 32)
{
......