提交 91c8fcf3 authored 作者: nouiz's avatar nouiz

Merge pull request #934 from goodfeli/gpu_max

Gpu max
......@@ -33,6 +33,11 @@ There are less methods to define for an Op than for a Type:
This must return C code that carries the computation we want to do.
sub is a dictionary of strings for you to substitute into your code.
It's not clear if it ever contains anything other than 'fail'.
sub['fail'] is a string of code that you should execute (after calling
PyErr_Format) if your C code needs to raise an exception.
.. method:: c_code_cleanup(node, name, input_names, output_names, sub)
This must return C code that cleans up whatever c_code allocated and
......
......@@ -270,7 +270,7 @@ if cuda_available:
import basic_ops
from basic_ops import (GpuFromHost, HostFromGpu, GpuElemwise,
GpuDimShuffle, GpuSum, GpuReshape, GpuContiguous,
GpuDimShuffle, GpuCAReduce, GpuReshape, GpuContiguous,
GpuSubtensor, GpuIncSubtensor,
GpuAdvancedSubtensor1, GpuAdvancedIncSubtensor1,
GpuFlatten, GpuShape, GpuAlloc,
......
......@@ -582,9 +582,12 @@ def local_gpu_gemm(node):
@register_opt()
@local_optimizer([])
def local_gpu_sum(node):
def local_gpu_careduce(node):
if isinstance(node.op, tensor.elemwise.CAReduce):
if node.op.scalar_op == scal.add:
scalar_op = node.op.scalar_op
# currently, only these two ops are supported at all,
# and max does not support all combinations of axes
if node.op.scalar_op in [scal.add, scal.maximum]:
x, = node.inputs
if x.owner and x.owner.op == host_from_gpu:
if node.op.axis is None:
......@@ -594,22 +597,21 @@ def local_gpu_sum(node):
for a in node.op.axis:
assert reduce_mask[a] == 0
reduce_mask[a] = 1
gsum = GpuSum(reduce_mask)
pattern = (''.join(str(i) for i in reduce_mask))
if hasattr(gsum, 'c_code_reduce_%s' % pattern):
rval = host_from_gpu(gsum(gpu_from_host(x)))
greduce = GpuCAReduce(reduce_mask, scalar_op)
if greduce.supports_c_code([gpu_from_host(x)]):
rval = host_from_gpu(greduce(gpu_from_host(x)))
if rval.type == node.outputs[0].type:
return [rval]
else:
print >> sys.stderr, \
"WARNING: local_gpu_sum got type wrong"
"WARNING: local_gpu_careduce got type wrong"
return None
else:
# Try to make a simpler pattern based on reshaping
# The principle is that if two adjacent dimensions have
# the same value in the reduce_mask, then we can reshape
# to make them a single dimension, do the sum, and then
# to make them a single dimension, do the reduction, and then
# reshape to get them back.
shape_of = node.fgraph.shape_feature.shape_of
......@@ -625,27 +627,28 @@ def local_gpu_sum(node):
new_mask.append(reduce_mask[i])
new_in_shp.append(x_shape[i])
pattern = (''.join(str(i) for i in new_mask))
new_gsum = GpuSum(new_mask)
if hasattr(new_gsum, 'c_code_reduce_%s' % pattern):
new_greduce = GpuCAReduce(new_mask, scalar_op)
reshaped_x = x.reshape(tensor.stack(*new_in_shp))
sum_reshaped_x = host_from_gpu(
new_gsum(gpu_from_host(reshaped_x)))
if sum_reshaped_x.ndim != node.outputs[0].ndim:
unreshaped_sum = sum_reshaped_x.reshape(
gpu_reshaped_x = gpu_from_host(reshaped_x)
reshaped_gpu_inputs = [ gpu_reshaped_x ]
if new_greduce.supports_c_code(reshaped_gpu_inputs):
reduce_reshaped_x = host_from_gpu(
new_greduce(gpu_reshaped_x))
if reduce_reshaped_x.ndim != node.outputs[0].ndim:
unreshaped_reduce = reduce_reshaped_x.reshape(
tensor.stack(*shape_of[node.outputs[0]]))
else:
unreshaped_sum = sum_reshaped_x
if unreshaped_sum.type == node.outputs[0].type:
return [unreshaped_sum]
unreshaped_reduce = reduce_reshaped_x
if unreshaped_reduce.type == node.outputs[0].type:
return [unreshaped_reduce]
else:
print >> sys.stderr, \
"WARNING: local_gpu_sum got type wrong"
"WARNING: local_gpu_careduce got type wrong"
return None
raise Exception(
"GpuSum don't have implemented the pattern",
"GpuCAReduce does not yet implement this pattern:",
pattern)
return False
......
......@@ -28,7 +28,10 @@ def test_nvidia_driver1():
profile=False)
topo = f.maker.fgraph.toposort()
assert len(topo) == 2
assert sum(isinstance(node.op, B.GpuSum) for node in topo) == 1
if sum(isinstance(node.op, B.GpuCAReduce) for node in topo) != 1:
msg = '\n\t'.join(['Expected exactly one occurrence of GpuCAReduce ' +
'but got:']+[str(app) for app in topo])
raise AssertionError(msg)
if not numpy.allclose(f(), a.sum()):
raise Exception("The nvidia driver version installed with this OS "
"does not give good results for reduction."
......
......@@ -44,11 +44,11 @@ def test_int_pow():
f = theano.function([a], (a*4).sum(), mode=mode_with_gpu)
op_names = [n.op.__class__.__name__ for n in f.maker.fgraph.toposort()]
assert op_names == ['GpuSum', 'GpuElemwise', 'HostFromGpu']
assert op_names == ['GpuCAReduce', 'GpuElemwise', 'HostFromGpu']
f = theano.function([a], tensor.pow(a,4).sum(), mode=mode_with_gpu)
op_names = [n.op.__class__.__name__ for n in f.maker.fgraph.toposort()]
assert op_names == ['GpuElemwise', 'GpuSum', 'HostFromGpu']
assert op_names == ['GpuElemwise', 'GpuCAReduce', 'HostFromGpu']
#theano.printing.debugprint(f)
......
......@@ -1159,6 +1159,7 @@ class Maximum(BinaryScalarOp):
gx = eq(output, x) * gz
gy = eq(output, y) * gz
return (gx, gy)
maximum = Maximum(upcast_out, name='maximum')
......@@ -1187,7 +1188,6 @@ class Minimum(BinaryScalarOp):
gx = eq(output, x) * gz
gy = eq(output, y) * gz
return (gx, gy)
minimum = Minimum(upcast_out, name='minimum')
......@@ -1222,6 +1222,8 @@ class Add(ScalarOp):
for i in inputs:
retval += [gz]
return retval
add = Add(upcast_out, name='add')
......
......@@ -1082,14 +1082,16 @@ class Elemwise(Op):
class CAReduce(Op):
"""
CAReduce = Commutative Associative Reduce
Reduces a scalar operation along the specified axis(es).
(The scalar op should be both commutative and associative)
The output will have the same shape as the input minus the reduced
dimensions. It will contain the result of accumulating all values
over the reduced dimensions using the specified scalar op.
Examples:
CAReduce(add) -> sum
CAReduce(add) -> sum (ie, acts like the numpy sum operation)
CAReduce(mul) -> product
CAReduce(maximum) -> max
CAReduce(minimum) -> min
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论