提交 01b8c32e authored 作者: Frederic's avatar Frederic

add gpu max for pattern (0, 1) and added all gpu max pattern for gpu min.

上级 9a4181b6
...@@ -1189,7 +1189,9 @@ class GpuCAReduce(GpuOp): ...@@ -1189,7 +1189,9 @@ class GpuCAReduce(GpuOp):
self.c_code_reduce_01X(sio, node, name, x, z, fail, 3) self.c_code_reduce_01X(sio, node, name, x, z, fail, 3)
def c_code_reduce_10(self, sio, node, name, x, z, fail): def c_code_reduce_10(self, sio, node, name, x, z, fail):
self._op_guard() if not isinstance(self.scalar_op, (scal.Add, scal.Minimum,
scal.Maximum)):
raise NotImplementedError()
print >> sio, """ print >> sio, """
{ {
int verbose = 0; int verbose = 0;
...@@ -1759,9 +1761,12 @@ class GpuCAReduce(GpuOp): ...@@ -1759,9 +1761,12 @@ class GpuCAReduce(GpuOp):
} }
} }
""" % locals() """ % locals()
elif isinstance(self.scalar_op, scal.Maximum): elif isinstance(self.scalar_op, (scal.Maximum, scal.Minimum)):
# special cased max code (special case because visits first # special cased max code (special case because visits first
# member of each row twice) # member of each row twice)
reduce_fct = self._assign_reduce(node, nodename, "myresult",
"A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0]",
{})
print >> sio, """ print >> sio, """
%(decl)s{ %(decl)s{
%(init)s %(init)s
...@@ -1770,8 +1775,7 @@ class GpuCAReduce(GpuOp): ...@@ -1770,8 +1775,7 @@ class GpuCAReduce(GpuOp):
%(for_i1)s{ %(for_i1)s{
%(for_i2)s{ %(for_i2)s{
%(for_i3)s{ %(for_i3)s{
float Ai = A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0]; %(reduce_fct)s;
myresult = max(myresult, Ai);
} }
} }
} }
...@@ -1791,14 +1795,24 @@ class GpuCAReduce(GpuOp): ...@@ -1791,14 +1795,24 @@ class GpuCAReduce(GpuOp):
# code to make sure it does not cause a slowdown # code to make sure it does not cause a slowdown
raise NotImplementedError() raise NotImplementedError()
if self.reduce_mask == (0, 1, 0) or self.reduce_mask == (1, 0): if self.reduce_mask == (0, 1, 0) or self.reduce_mask == (1, 0):
self._op_guard() if not isinstance(self.scalar_op, (scal.Add, scal.Minimum,
scal.Maximum)):
raise NotImplementedError()
# this kernel uses one block for each column, # this kernel uses one block for each column,
# threads per block for each element per column. # threads per block for each element per column.
#TODO: This kernel is pretty inefficient in terms of reading, because if A is #TODO: This kernel is pretty inefficient in terms of reading, because if A is
# c_contiguous (typical case) then each warp is accessing non-contigous # c_contiguous (typical case) then each warp is accessing non-contigous
# memory (a segment of a column). # memory (a segment of a column).
reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i2*sZ1]', node, nodename, sub = {}) reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i2*sZ1]',
node, nodename, sub={})
reduce_fct = self._assign_reduce(node, nodename, "myresult",
"A[i0 * sA0 + i1 * sA1 + i2 * sA2]",
{})
if isinstance(self.scalar_op, scal.Add):
reduce_init = "0.f;"
else:
reduce_init = "A[i0 * sA0 + threadIdx.x * sA1 + i2 * sA2];"
print >> sio, """ print >> sio, """
static __global__ void kernel_reduce_010_%(nodename)s( static __global__ void kernel_reduce_010_%(nodename)s(
const int d0, const int d0,
...@@ -1822,10 +1836,10 @@ class GpuCAReduce(GpuOp): ...@@ -1822,10 +1836,10 @@ class GpuCAReduce(GpuOp):
{ {
for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y) for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
{ {
float myresult = 0.0f; float myresult = %(reduce_init)s;
for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x) for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
{ {
myresult += A[i0 * sA0 + i1 * sA1 + i2 * sA2]; %(reduce_fct)s;
} }
%(reducebuf)s %(reducebuf)s
} }
...@@ -2307,6 +2321,7 @@ class GpuSubtensor(GpuOp, tensor.Subtensor): ...@@ -2307,6 +2321,7 @@ class GpuSubtensor(GpuOp, tensor.Subtensor):
return () return ()
return (3, hv) return (3, hv)
class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp): class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
""" """
Implement AdvancedSubtensor1 on the gpu. Implement AdvancedSubtensor1 on the gpu.
......
...@@ -597,7 +597,7 @@ def local_gpu_careduce(node): ...@@ -597,7 +597,7 @@ def local_gpu_careduce(node):
scalar_op = node.op.scalar_op scalar_op = node.op.scalar_op
# currently, only these two ops are supported at all, # currently, only these two ops are supported at all,
# and max does not support all combinations of axes # and max does not support all combinations of axes
if node.op.scalar_op in [scal.add, scal.maximum]: if node.op.scalar_op in [scal.add, scal.maximum, scal.minimum]:
x, = node.inputs x, = node.inputs
if x.owner and x.owner.op == host_from_gpu: if x.owner and x.owner.op == host_from_gpu:
if node.op.axis is None: if node.op.axis is None:
......
...@@ -66,7 +66,8 @@ def test_careduce(): ...@@ -66,7 +66,8 @@ def test_careduce():
""" """
for scalar_op, careduce_op in [ for scalar_op, careduce_op in [
(theano.scalar.add, tensor.elemwise.CAReduceDtype), (theano.scalar.add, tensor.elemwise.CAReduceDtype),
(theano.scalar.maximum, tensor.CAReduce)]: (theano.scalar.maximum, tensor.CAReduce),
(theano.scalar.minimum, tensor.CAReduce)]:
for shape, pattern in [((1,1),(1,)), for shape, pattern in [((1,1),(1,)),
((1,0),(1,)), ((1,0),(1,)),
((0,1),(1,)), ((0,1),(1,)),
...@@ -123,9 +124,10 @@ def test_careduce(): ...@@ -123,9 +124,10 @@ def test_careduce():
op = careduce_op(scalar_op, axis=pattern) op = careduce_op(scalar_op, axis=pattern)
pat = tensor_pattern_to_gpu_pattern(shape, pattern) pat = tensor_pattern_to_gpu_pattern(shape, pattern)
#GpuCAReduce{maximum} support only those patterns #GpuCAReduce{maximum/minimum} support only those patterns
if scalar_op is theano.scalar.maximum and pat not in [ if scalar_op in [theano.scalar.maximum,
(0, 1), (0, 1, 1), (0, 1, 1)]: theano.scalar.minimum] and pat not in [
(0, 1), (0, 1, 1), (0, 1, 1), (1, 0)]:
continue continue
a = tensor.TensorType('float32', (False,) * len(shape))() a = tensor.TensorType('float32', (False,) * len(shape))()
...@@ -191,10 +193,12 @@ def test_careduce(): ...@@ -191,10 +193,12 @@ def test_careduce():
((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]: ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]:
op = careduce_op(scalar_op, axis=pattern) op = careduce_op(scalar_op, axis=pattern)
pat = tensor_pattern_to_gpu_pattern(shape, pattern) pat = tensor_pattern_to_gpu_pattern(shape, pattern)
#GpuCAReduce{maximum} support only those patterns #GpuCAReduce{maximum/minimum} support only those patterns
if scalar_op is theano.scalar.maximum and pat not in [ if scalar_op in [theano.scalar.maximum,
(0, 1), (0, 1, 1), (0, 1, 1)]: theano.scalar.minimum] and pat not in [
(0, 1), (0, 1, 1), (0, 1, 1), (1, 0)]:
continue continue
a = tensor.TensorType('float32', (False,) * len(shape))() a = tensor.TensorType('float32', (False,) * len(shape))()
dim_pattern = range(len(shape)) dim_pattern = range(len(shape))
dim_pattern[0] = 1 dim_pattern[0] = 1
...@@ -223,10 +227,12 @@ def test_careduce(): ...@@ -223,10 +227,12 @@ def test_careduce():
((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]: ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]:
op = careduce_op(scalar_op, axis=pattern) op = careduce_op(scalar_op, axis=pattern)
pat = tensor_pattern_to_gpu_pattern(shape, pattern) pat = tensor_pattern_to_gpu_pattern(shape, pattern)
#GpuCAReduce{maximum} support only those patterns #GpuCAReduce{maximum/minimum} support only those patterns
if scalar_op is theano.scalar.maximum and pat not in [ if scalar_op in [theano.scalar.maximum,
(0, 1), (0, 1, 1), (0, 1, 1)]: theano.scalar.minimum] and pat not in [
(0, 1), (0, 1, 1), (0, 1, 1), (1, 0)]:
continue continue
shape = numpy.asarray(shape) * 2 shape = numpy.asarray(shape) * 2
a = tensor.TensorType('float32', (False,) * len(shape))() a = tensor.TensorType('float32', (False,) * len(shape))()
a2 = tcn.CudaNdarrayType((False,) * len(shape))() a2 = tcn.CudaNdarrayType((False,) * len(shape))()
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论