提交 91c8fcf3 authored 作者: nouiz's avatar nouiz

Merge pull request #934 from goodfeli/gpu_max

Gpu max
...@@ -33,6 +33,11 @@ There are less methods to define for an Op than for a Type: ...@@ -33,6 +33,11 @@ There are less methods to define for an Op than for a Type:
This must return C code that carries the computation we want to do. This must return C code that carries the computation we want to do.
sub is a dictionary of strings for you to substitute into your code.
It's not clear if it ever contains anything other than 'fail'.
sub['fail'] is a string of code that you should execute (after calling
PyErr_Format) if your C code needs to raise an exception.
.. method:: c_code_cleanup(node, name, input_names, output_names, sub) .. method:: c_code_cleanup(node, name, input_names, output_names, sub)
This must return C code that cleans up whatever c_code allocated and This must return C code that cleans up whatever c_code allocated and
......
...@@ -270,7 +270,7 @@ if cuda_available: ...@@ -270,7 +270,7 @@ if cuda_available:
import basic_ops import basic_ops
from basic_ops import (GpuFromHost, HostFromGpu, GpuElemwise, from basic_ops import (GpuFromHost, HostFromGpu, GpuElemwise,
GpuDimShuffle, GpuSum, GpuReshape, GpuContiguous, GpuDimShuffle, GpuCAReduce, GpuReshape, GpuContiguous,
GpuSubtensor, GpuIncSubtensor, GpuSubtensor, GpuIncSubtensor,
GpuAdvancedSubtensor1, GpuAdvancedIncSubtensor1, GpuAdvancedSubtensor1, GpuAdvancedIncSubtensor1,
GpuFlatten, GpuShape, GpuAlloc, GpuFlatten, GpuShape, GpuAlloc,
......
...@@ -582,9 +582,12 @@ def local_gpu_gemm(node): ...@@ -582,9 +582,12 @@ def local_gpu_gemm(node):
@register_opt() @register_opt()
@local_optimizer([]) @local_optimizer([])
def local_gpu_sum(node): def local_gpu_careduce(node):
if isinstance(node.op, tensor.elemwise.CAReduce): if isinstance(node.op, tensor.elemwise.CAReduce):
if node.op.scalar_op == scal.add: scalar_op = node.op.scalar_op
# currently, only these two ops are supported at all,
# and max does not support all combinations of axes
if node.op.scalar_op in [scal.add, scal.maximum]:
x, = node.inputs x, = node.inputs
if x.owner and x.owner.op == host_from_gpu: if x.owner and x.owner.op == host_from_gpu:
if node.op.axis is None: if node.op.axis is None:
...@@ -594,22 +597,21 @@ def local_gpu_sum(node): ...@@ -594,22 +597,21 @@ def local_gpu_sum(node):
for a in node.op.axis: for a in node.op.axis:
assert reduce_mask[a] == 0 assert reduce_mask[a] == 0
reduce_mask[a] = 1 reduce_mask[a] = 1
gsum = GpuSum(reduce_mask) greduce = GpuCAReduce(reduce_mask, scalar_op)
pattern = (''.join(str(i) for i in reduce_mask)) if greduce.supports_c_code([gpu_from_host(x)]):
if hasattr(gsum, 'c_code_reduce_%s' % pattern): rval = host_from_gpu(greduce(gpu_from_host(x)))
rval = host_from_gpu(gsum(gpu_from_host(x)))
if rval.type == node.outputs[0].type: if rval.type == node.outputs[0].type:
return [rval] return [rval]
else: else:
print >> sys.stderr, \ print >> sys.stderr, \
"WARNING: local_gpu_sum got type wrong" "WARNING: local_gpu_careduce got type wrong"
return None return None
else: else:
# Try to make a simpler pattern based on reshaping # Try to make a simpler pattern based on reshaping
# The principle is that if two adjacent dimensions have # The principle is that if two adjacent dimensions have
# the same value in the reduce_mask, then we can reshape # the same value in the reduce_mask, then we can reshape
# to make them a single dimension, do the sum, and then # to make them a single dimension, do the reduction, and then
# reshape to get them back. # reshape to get them back.
shape_of = node.fgraph.shape_feature.shape_of shape_of = node.fgraph.shape_feature.shape_of
...@@ -625,27 +627,28 @@ def local_gpu_sum(node): ...@@ -625,27 +627,28 @@ def local_gpu_sum(node):
new_mask.append(reduce_mask[i]) new_mask.append(reduce_mask[i])
new_in_shp.append(x_shape[i]) new_in_shp.append(x_shape[i])
pattern = (''.join(str(i) for i in new_mask)) new_greduce = GpuCAReduce(new_mask, scalar_op)
new_gsum = GpuSum(new_mask) reshaped_x = x.reshape(tensor.stack(*new_in_shp))
if hasattr(new_gsum, 'c_code_reduce_%s' % pattern): gpu_reshaped_x = gpu_from_host(reshaped_x)
reshaped_x = x.reshape(tensor.stack(*new_in_shp)) reshaped_gpu_inputs = [ gpu_reshaped_x ]
sum_reshaped_x = host_from_gpu( if new_greduce.supports_c_code(reshaped_gpu_inputs):
new_gsum(gpu_from_host(reshaped_x))) reduce_reshaped_x = host_from_gpu(
new_greduce(gpu_reshaped_x))
if sum_reshaped_x.ndim != node.outputs[0].ndim: if reduce_reshaped_x.ndim != node.outputs[0].ndim:
unreshaped_sum = sum_reshaped_x.reshape( unreshaped_reduce = reduce_reshaped_x.reshape(
tensor.stack(*shape_of[node.outputs[0]])) tensor.stack(*shape_of[node.outputs[0]]))
else: else:
unreshaped_sum = sum_reshaped_x unreshaped_reduce = reduce_reshaped_x
if unreshaped_sum.type == node.outputs[0].type: if unreshaped_reduce.type == node.outputs[0].type:
return [unreshaped_sum] return [unreshaped_reduce]
else: else:
print >> sys.stderr, \ print >> sys.stderr, \
"WARNING: local_gpu_sum got type wrong" "WARNING: local_gpu_careduce got type wrong"
return None return None
raise Exception( raise Exception(
"GpuSum don't have implemented the pattern", "GpuCAReduce does not yet implement this pattern:",
pattern) pattern)
return False return False
......
...@@ -28,7 +28,10 @@ def test_nvidia_driver1(): ...@@ -28,7 +28,10 @@ def test_nvidia_driver1():
profile=False) profile=False)
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert len(topo) == 2 assert len(topo) == 2
assert sum(isinstance(node.op, B.GpuSum) for node in topo) == 1 if sum(isinstance(node.op, B.GpuCAReduce) for node in topo) != 1:
msg = '\n\t'.join(['Expected exactly one occurrence of GpuCAReduce ' +
'but got:']+[str(app) for app in topo])
raise AssertionError(msg)
if not numpy.allclose(f(), a.sum()): if not numpy.allclose(f(), a.sum()):
raise Exception("The nvidia driver version installed with this OS " raise Exception("The nvidia driver version installed with this OS "
"does not give good results for reduction." "does not give good results for reduction."
......
...@@ -44,11 +44,11 @@ def test_int_pow(): ...@@ -44,11 +44,11 @@ def test_int_pow():
f = theano.function([a], (a*4).sum(), mode=mode_with_gpu) f = theano.function([a], (a*4).sum(), mode=mode_with_gpu)
op_names = [n.op.__class__.__name__ for n in f.maker.fgraph.toposort()] op_names = [n.op.__class__.__name__ for n in f.maker.fgraph.toposort()]
assert op_names == ['GpuSum', 'GpuElemwise', 'HostFromGpu'] assert op_names == ['GpuCAReduce', 'GpuElemwise', 'HostFromGpu']
f = theano.function([a], tensor.pow(a,4).sum(), mode=mode_with_gpu) f = theano.function([a], tensor.pow(a,4).sum(), mode=mode_with_gpu)
op_names = [n.op.__class__.__name__ for n in f.maker.fgraph.toposort()] op_names = [n.op.__class__.__name__ for n in f.maker.fgraph.toposort()]
assert op_names == ['GpuElemwise', 'GpuSum', 'HostFromGpu'] assert op_names == ['GpuElemwise', 'GpuCAReduce', 'HostFromGpu']
#theano.printing.debugprint(f) #theano.printing.debugprint(f)
......
...@@ -1159,6 +1159,7 @@ class Maximum(BinaryScalarOp): ...@@ -1159,6 +1159,7 @@ class Maximum(BinaryScalarOp):
gx = eq(output, x) * gz gx = eq(output, x) * gz
gy = eq(output, y) * gz gy = eq(output, y) * gz
return (gx, gy) return (gx, gy)
maximum = Maximum(upcast_out, name='maximum') maximum = Maximum(upcast_out, name='maximum')
...@@ -1187,7 +1188,6 @@ class Minimum(BinaryScalarOp): ...@@ -1187,7 +1188,6 @@ class Minimum(BinaryScalarOp):
gx = eq(output, x) * gz gx = eq(output, x) * gz
gy = eq(output, y) * gz gy = eq(output, y) * gz
return (gx, gy) return (gx, gy)
minimum = Minimum(upcast_out, name='minimum') minimum = Minimum(upcast_out, name='minimum')
...@@ -1222,6 +1222,8 @@ class Add(ScalarOp): ...@@ -1222,6 +1222,8 @@ class Add(ScalarOp):
for i in inputs: for i in inputs:
retval += [gz] retval += [gz]
return retval return retval
add = Add(upcast_out, name='add') add = Add(upcast_out, name='add')
......
...@@ -1082,14 +1082,16 @@ class Elemwise(Op): ...@@ -1082,14 +1082,16 @@ class Elemwise(Op):
class CAReduce(Op): class CAReduce(Op):
""" """
CAReduce = Commutative Associative Reduce
Reduces a scalar operation along the specified axis(es). Reduces a scalar operation along the specified axis(es).
(The scalar op should be both commutative and associative)
The output will have the same shape as the input minus the reduced The output will have the same shape as the input minus the reduced
dimensions. It will contain the result of accumulating all values dimensions. It will contain the result of accumulating all values
over the reduced dimensions using the specified scalar op. over the reduced dimensions using the specified scalar op.
Examples: Examples:
CAReduce(add) -> sum CAReduce(add) -> sum (i.e., acts like the numpy sum operation)
CAReduce(mul) -> product CAReduce(mul) -> product
CAReduce(maximum) -> max CAReduce(maximum) -> max
CAReduce(minimum) -> min CAReduce(minimum) -> min
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论