Commit 1b3c8f1b authored by Frederic

Implement all missing cases of GPU min/max that were already supported by GPU sum.

This means the previously unavailable cases 1010, 1011, and 0111. The following cases were implemented via reshape for min/max, but not for sum: 110, 001, 0011, 100.
Parent 827cb8cc
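For context: a reduce mask is a tuple with one entry per input dimension, where 1 marks an axis that is reduced away and 0 marks an axis that is kept, so case 110 means reducing axes 0 and 1 of a 3-tensor. Below is a minimal NumPy sketch of what the masks name; it is illustrative only, and `apply_reduce_mask` is a hypothetical helper, not Theano code:

    # Illustrative helper: axes flagged 1 in the mask are reduced away.
    import numpy as np

    def apply_reduce_mask(a, mask, ufunc=np.add):
        axes = tuple(i for i, m in enumerate(mask) if m)
        return ufunc.reduce(a, axis=axes)

    a = np.arange(24, dtype='float32').reshape(2, 3, 4)
    assert apply_reduce_mask(a, (1, 1, 0)).shape == (4,)              # case 110, sum
    assert apply_reduce_mask(a, (1, 1, 0), np.maximum).shape == (4,)  # case 110, max

The key difference for min/max is the accumulator's initial value: a sum can start at 0, but a max/min must start from an element of the data being reduced, which is what the `reduce_init` strings added in the diff below provide.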
@@ -1361,7 +1361,6 @@ class GpuCAReduce(GpuOp):
         """ % locals()

     def c_code_reduce_0101(self, sio, node, name, x, z, fail):
-        self._op_guard()
         makecall = self._makecall(node, name, x, z, fail)
         print >> sio, """
         {
@@ -1381,7 +1380,6 @@ class GpuCAReduce(GpuOp):
         """ % locals()

     def c_code_reduce_100(self, sio, node, name, x, z, fail):
-        self._op_guard()
         makecall = self._makecall(node, name, x, z, fail)
         # use threadIdx.x for i0
         # use blockIdx.x for i1
@@ -1402,7 +1400,6 @@ class GpuCAReduce(GpuOp):
         """ % locals()

     def c_code_reduce_110(self, sio, node, name, x, z, fail):
-        self._op_guard()
         makecall = self._makecall(node, name, x, z, fail)
         print >> sio, """
         {
@@ -1424,7 +1421,6 @@ class GpuCAReduce(GpuOp):
         """ % locals()

     def c_code_reduce_001(self, sio, node, name, x, z, fail):
-        self._op_guard()
         makecall = self._makecall(node, name, x, z, fail)
         print >> sio, """
         {
@@ -1479,7 +1475,6 @@ class GpuCAReduce(GpuOp):
         """ % locals()

     def c_code_reduce_0011(self, sio, node, name, x, z, fail):
-        self._op_guard()
         makecall = self._makecall(node, name, x, z, fail)
         print >> sio, """
         {
@@ -1542,7 +1537,6 @@ class GpuCAReduce(GpuOp):
         """ % locals()

     def c_code_reduce_1011(self, sio, node, name, x, z, fail):
-        self._op_guard()
         makecall = self._makecall(node, name, x, z, fail)
         print >> sio, """
         {
@@ -1960,7 +1954,10 @@ class GpuCAReduce(GpuOp):
            }
            """ % locals()

        if self.reduce_mask == (1, 1, 0):
-            self._op_guard()
+            if not isinstance(self.scalar_op, (scal.Add,
+                                               scal.Maximum,
+                                               scal.Minimum)):
+                raise NotImplementedError()
            # this kernel uses one block for each column,
            # threads per block for each element per column.
@@ -1968,6 +1965,13 @@ class GpuCAReduce(GpuOp):
            # c_contiguous (typical case) then each warp is accessing non-contiguous
            # memory (a segment of a column).
            reducebuf = self._k_reduce_buf('Z[blockIdx.x * sZ0]', node, nodename, sub={})
+            reduce_fct = self._assign_reduce(node, nodename, "myresult",
+                                             "A[i0 * sA0 + i1 * sA1 + blockIdx.x * sA2]",
+                                             {})
+            if isinstance(self.scalar_op, scal.Add):
+                reduce_init = "0.f;"
+            else:
+                reduce_init = "A[blockIdx.x * sA2];"
            print >> sio, """
            static __global__ void kernel_reduce_110_%(nodename)s(
                    const int d0,
@@ -1980,7 +1984,7 @@ class GpuCAReduce(GpuOp):
                const int threadCount = blockDim.x * blockDim.y;
                const int threadNum = threadIdx.y * blockDim.x + threadIdx.x;
                extern __shared__ float buf[];
-                float myresult = 0.0f;
+                float myresult = %(reduce_init)s;
                if (warpSize != 32)
                {
@@ -1993,8 +1997,7 @@ class GpuCAReduce(GpuOp):
                {
                    for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
                    {
-                        float Ai = A[i0 * sA0 + i1 * sA1 + blockIdx.x * sA2];
-                        myresult += Ai;
+                        %(reduce_fct)s;
                    }
                }
@@ -2002,11 +2005,21 @@ class GpuCAReduce(GpuOp):
            }
            """ % locals()
        if self.reduce_mask == (1, 0, 0):
-            self._op_guard()
+            if not isinstance(self.scalar_op, (scal.Add,
+                                               scal.Maximum,
+                                               scal.Minimum)):
+                raise NotImplementedError()
            reducebuf = self._k_reduce_buf('Z[i1 * sZ0 + i2 * sZ1]',
                                           node, nodename, sub={})
            decl = self._k_decl(node, nodename)
            init = self._k_init(node, nodename)
+            reduce_fct = self._assign_reduce(node, nodename, "myresult",
+                                             "A[i0 * sA0 + i1 * sA1 + i2 * sA2]",
+                                             {})
+            if isinstance(self.scalar_op, scal.Add):
+                reduce_init = "0.f;"
+            else:
+                reduce_init = "A[i1 * sA1 + i2 * sA2]"
            print >> sio, """
            %(decl)s
            {
@@ -2015,10 +2028,10 @@ class GpuCAReduce(GpuOp):
                {
                    for (int i1 = blockIdx.x; i1 < d1; i1 += gridDim.x)
                    {
-                        myresult = 0;
+                        myresult = %(reduce_init)s;
                        for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
                        {
-                            myresult += A[i0 * sA0 + i1 * sA1 + i2 * sA2];
+                            %(reduce_fct)s
                        }
                        %(reducebuf)s
                    }
@@ -2060,11 +2073,21 @@ class GpuCAReduce(GpuOp):
            }
            """ % locals()
        if self.reduce_mask == (0, 0, 1):
-            self._op_guard()
+            if not isinstance(self.scalar_op, (scal.Add,
+                                               scal.Maximum,
+                                               scal.Minimum)):
+                raise NotImplementedError()
            # this kernel uses one block for each row,
            # threads per block for each element per row.
            reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i1 * sZ1]',
                                           node, nodename, sub = {})
+            reduce_fct = self._assign_reduce(node, nodename, "myresult",
+                                             "A[i0 * sA0 + i1 * sA1 + i2 * sA2]",
+                                             {})
+            if isinstance(self.scalar_op, scal.Add):
+                reduce_init = "0.f;"
+            else:
+                reduce_init = "A[i0 * sA0 + i1 * sA1]"
            print >> sio, """
            static __global__ void kernel_reduce_001_%(nodename)s(
                    const int d0,
@@ -2087,10 +2110,10 @@ class GpuCAReduce(GpuOp):
                {
                    for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y)
                    {
-                        float myresult = 0.0f;
+                        float myresult = %(reduce_init)s;
                        for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x)
                        {
-                            myresult += A[i0 * sA0 + i1 * sA1 + i2 * sA2];
+                            %(reduce_fct)s;
                        }
                        %(reducebuf)s
                    }
@@ -2098,13 +2121,23 @@ class GpuCAReduce(GpuOp):
            }
            """ % locals()
        if self.reduce_mask == (0, 0, 1, 1):
-            self._op_guard()
+            if not isinstance(self.scalar_op, (scal.Add,
+                                               scal.Maximum,
+                                               scal.Minimum)):
+                raise NotImplementedError()
            # this kernel uses one block for each row,
            # threads per block for each element per row.
            reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i1 * sZ1]',
                                           node, nodename, sub = {})
            decl = self._k_decl(node, nodename)
            init = self._k_init(node, nodename)
+            reduce_fct = self._assign_reduce(node, nodename, "myresult",
+                                             "A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3]",
+                                             {})
+            if isinstance(self.scalar_op, scal.Add):
+                reduce_init = "0.f;"
+            else:
+                reduce_init = "A[i0 * sA0 + i1 * sA1]"
            print >> sio, """
            %(decl)s
            {
@@ -2114,12 +2147,12 @@ class GpuCAReduce(GpuOp):
                {
                    for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y)
                    {
-                        float myresult = 0.0f;
+                        float myresult = %(reduce_init)s;
                        for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)
                        {
                            for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
                            {
-                                myresult += A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3];
+                                %(reduce_fct)s;
                            }
                        }
                        %(reducebuf)s
@@ -2128,13 +2161,23 @@ class GpuCAReduce(GpuOp):
            }
            """ % locals()
        if self.reduce_mask == (0, 1, 0, 1):
-            self._op_guard()
+            if not isinstance(self.scalar_op, (scal.Add,
+                                               scal.Maximum,
+                                               scal.Minimum)):
+                raise NotImplementedError()
            # this kernel uses one block for each row,
            # threads per block for each element per row.
            reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i2 * sZ1]',
                                           node, nodename, sub = {})
            decl = self._k_decl(node, nodename)
            init = self._k_init(node, nodename)
+            reduce_fct = self._assign_reduce(node, nodename, "myresult",
+                                             "A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3]",
+                                             {})
+            if isinstance(self.scalar_op, scal.Add):
+                reduce_init = "0.f;"
+            else:
+                reduce_init = "A[i0 * sA0 + i2 * sA2]"
            print >> sio, """
            %(decl)s
            {
@@ -2144,12 +2187,12 @@ class GpuCAReduce(GpuOp):
                {
                    for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
                    {
-                        float myresult = 0.0f;
+                        float myresult = %(reduce_init)s;
                        for (int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y)
                        {
                            for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
                            {
-                                myresult += A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3];
+                                %(reduce_fct)s;
                            }
                        }
                        %(reducebuf)s
@@ -2193,9 +2236,19 @@ class GpuCAReduce(GpuOp):
            }
            """ % locals()
        if self.reduce_mask == (1, 0, 1, 1):
-            self._op_guard()
+            if not isinstance(self.scalar_op, (scal.Add,
+                                               scal.Maximum,
+                                               scal.Minimum)):
+                raise NotImplementedError()
            reducebuf = self._k_reduce_buf('Z[blockIdx.x*sZ0]',
                                           node, nodename, sub = {})
+            reduce_fct = self._assign_reduce(node, nodename, "myresult",
+                                             "A[i0 * sA0 + blockIdx.x * sA1 + i2 * sA2 + i3 * sA3]",
+                                             {})
+            if isinstance(self.scalar_op, scal.Add):
+                reduce_init = "0.f;"
+            else:
+                reduce_init = "A[blockIdx.x * sA1]"
            print >> sio, """
            static __global__ void kernel_reduce_1011_%(nodename)s(
                    const unsigned int d0,
@@ -2209,7 +2262,7 @@ class GpuCAReduce(GpuOp):
                const int threadCount = blockDim.x * blockDim.y * blockDim.z;
                const int threadNum = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
                extern __shared__ float buf[];
-                float myresult = 0.0f;
+                float myresult = %(reduce_init)s;
                if (warpSize != 32)
                {
@@ -2222,8 +2275,7 @@ class GpuCAReduce(GpuOp):
                {
                    for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
                    {
-                        float Ai = A[i0 * sA0 + blockIdx.x * sA1 + i2 * sA2 + i3 * sA3];
-                        myresult += Ai;
+                        %(reduce_fct)s;
                    }
                }
            }
...
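A note on the pattern used throughout the hunks above: `_assign_reduce` emits the scalar op's C code to fold one element into the accumulator, so per-op differences reduce to that one statement plus the initializer. Below is a hedged sketch of the idea; `assign_reduce_sketch` is a hypothetical stand-in, not Theano's actual `_assign_reduce`:

    # Hypothetical stand-in for the code generator: returns the CUDA
    # statement that folds one element (src) into the accumulator (dst).
    def assign_reduce_sketch(dst, src, scalar_op):
        if scalar_op == 'add':
            return "%s = %s + %s;" % (dst, dst, src)
        if scalar_op == 'maximum':
            return "%s = fmaxf(%s, %s);" % (dst, dst, src)
        if scalar_op == 'minimum':
            return "%s = fminf(%s, %s);" % (dst, dst, src)
        raise NotImplementedError(scalar_op)

    # What would substitute for %(reduce_fct)s in the 110 kernel above:
    print assign_reduce_sketch("myresult",
                               "A[i0 * sA0 + i1 * sA1 + blockIdx.x * sA2]",
                               "maximum")
    # myresult = fmaxf(myresult, A[i0 * sA0 + i1 * sA1 + blockIdx.x * sA2]);

Initializing `myresult` from an element the block will visit anyway (e.g. `A[blockIdx.x * sA2]`) is safe for max/min because folding the same element twice is idempotent, whereas `0.f` would be wrong whenever every value is negative (max) or positive (min).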
@@ -124,21 +124,6 @@ def test_careduce():
            op = careduce_op(scalar_op, axis=pattern)
            pat = tensor_pattern_to_gpu_pattern(shape, pattern)
-            # GpuCAReduce{maximum/minimum} support only those patterns
-            if scalar_op in [theano.scalar.maximum,
-                             theano.scalar.minimum] and pat not in [
-                    (1,), (1, 1), (0, 1), (1, 0),
-                    (0, 1, 0), (0, 1, 1), (1, 1, 1),
-                    (1, 0, 0, 0), (0, 1, 0, 0),
-                    (0, 0, 1, 0), (0, 0, 0, 1),
-                    (1, 1, 1, 1), (1, 1, 1, 1, 1),
-                    (0, 0, 1), (0, 1, 0), (1, 0, 0), (1, 1, 0),
-                    (0, 0, 1, 1),  # by reshape
-                    # (0, 1, 0, 1),  # not supported for max/min
-                    (0, 1, 1, 1),  # by reshape
-                    # (1, 0, 1, 1),  # not supported for max/min
-                    ]:
-                continue

            a = tensor.TensorType('float32', (False,) * len(shape))()
            b = op(a)
@@ -206,11 +191,6 @@ def test_careduce():
                          ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]:
            op = careduce_op(scalar_op, axis=pattern)
            pat = tensor_pattern_to_gpu_pattern(shape, pattern)
-            # GpuCAReduce{maximum/minimum} support only those patterns
-            if scalar_op in [theano.scalar.maximum,
-                             theano.scalar.minimum] and pat not in [
-                    (0, 1), (0, 1, 1), (0, 1, 1), (1, 0)]:
-                continue

            a = tensor.TensorType('float32', (False,) * len(shape))()
            dim_pattern = range(len(shape))
@@ -243,11 +223,6 @@ def test_careduce():
                          ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]:
            op = careduce_op(scalar_op, axis=pattern)
            pat = tensor_pattern_to_gpu_pattern(shape, pattern)
-            # GpuCAReduce{maximum/minimum} support only those patterns
-            if scalar_op in [theano.scalar.maximum,
-                             theano.scalar.minimum] and pat not in [
-                    (0, 1), (0, 1, 1), (0, 1, 1), (1, 0)]:
-                continue

            shape = numpy.asarray(shape) * 2
            a = tensor.TensorType('float32', (False,) * len(shape))()
...
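With these kernels in place, the test whitelists above can be dropped: every pattern that GpuCAReduce handles for sum now also works for maximum/minimum. A hedged usage sketch for one of the newly enabled cases, reduce mask 1011 (assumes a CUDA-enabled Theano setup of the same era as the code above):

    import numpy
    import theano
    import theano.tensor as tensor

    a = tensor.tensor4('a')
    b = tensor.max(a, axis=[0, 2, 3])   # reduce mask (1, 0, 1, 1)
    f = theano.function([a], b)

    v = numpy.random.rand(5, 4, 3, 2).astype('float32')
    assert numpy.allclose(f(v), v.max(axis=(0, 2, 3)))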