Commit aa68b41e authored by Ian Goodfellow

Updated the local_gpu_careduce optimization to add a GpuCAReduce iff it
supports C code; bug fix.
Parent 580e1614
@@ -1453,9 +1453,12 @@ class GpuCAReduce(GpuOp):
         """ % locals()
 
     def c_code_cache_version(self):
-        return (1,)
+        # TODO: make this act like tensor.elemwise.CAReduce
+        # needs to include scalar op's cache version in the returned tuple
+        return ()
 
     def _op_guard(self):
+        """ Raises NotImplementedError if op is not Add """
         if not isinstance(self.scalar_op, theano.scalar.basic.Add):
             raise NotImplementedError()
 
...
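Returning an empty tuple from c_code_cache_version tells Theano's cache that the generated C code has no stable version, so it is recompiled rather than reused; that is the conservative choice while the kernel code is still changing. The TODO asks for the behavior of tensor.elemwise.CAReduce, where the reduce op's own version is combined with the scalar op's. A minimal sketch of that combination, written as a standalone helper rather than code from this commit (careduce_cache_version and own_version are illustrative names):

    def careduce_cache_version(scalar_op, own_version=(1,)):
        """Combine the reduce op's C-code version with the scalar op's."""
        scalar_version = scalar_op.c_code_cache_version()
        if not scalar_version:
            # the scalar op opts out of caching, so the reduction must too
            return ()
        return own_version + scalar_version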
@@ -581,10 +581,15 @@ def local_gpu_gemm(node):
 
 @register_opt()
 @local_optimizer([])
-def local_gpu_sum(node):
+def local_gpu_careduce(node):
     if isinstance(node.op, tensor.elemwise.CAReduce):
-        if node.op.scalar_op == scal.add:
+        scalar_op = node.op.scalar_op
+        # currently, only these two ops are supported at all,
+        # and max does not support all combinations of axes
+        if node.op.scalar_op in [scal.add, scal.maximum]:
             x, = node.inputs
+            gpu_x = gpu_from_host(x)
+            gpu_inputs = [gpu_x]
             if x.owner and x.owner.op == host_from_gpu:
                 if node.op.axis is None:
                     reduce_mask = [1] * x.type.ndim
@@ -593,22 +598,21 @@ def local_gpu_sum(node):
                 for a in node.op.axis:
                     assert reduce_mask[a] == 0
                     reduce_mask[a] = 1
-                gsum = GpuCAReduce(reduce_mask, theano.scalar.basic.add)
-                pattern = (''.join(str(i) for i in reduce_mask))
-                if hasattr(gsum, 'c_code_reduce_%s' % pattern):
-                    rval = host_from_gpu(gsum(gpu_from_host(x)))
+                greduce = GpuCAReduce(reduce_mask, scalar_op)
+                if greduce.supports_c_code(gpu_inputs):
+                    rval = host_from_gpu(greduce(gpu_x))
                     if rval.type == node.outputs[0].type:
                         return [rval]
                     else:
                         print >> sys.stderr, \
-                                "WARNING: local_gpu_sum got type wrong"
+                                "WARNING: local_gpu_careduce got type wrong"
                         return None
                 else:
 
                     # Try to make a simpler pattern based on reshaping
                     # The principle is that if two adjacent dimensions have
                     # the same value in the reduce_mask, then we can reshape
-                    # to make them a single dimension, do the sum, and then
+                    # to make them a single dimension, do the reduction, and then
                     # reshape to get them back.
 
                     shape_of = node.fgraph.shape_feature.shape_of
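The key change in this hunk: instead of inferring C-code support by probing for a c_code_reduce_<pattern> method with hasattr, the decision is delegated to GpuCAReduce.supports_c_code, checked against the actual GPU inputs. The contract one would expect from such a method, sketched here as an illustration rather than Theano's actual implementation, is "try to generate the C code and report whether that succeeds":

    def supports_c_code(self, inputs):
        """Return True if this op can generate C code for these inputs."""
        try:
            # hypothetical probe: build a dummy apply node over the inputs
            # and attempt C-code generation with placeholder names
            node = self.make_node(*inputs)
            self.c_code(node, 'fake_name',
                        ['fake_input_%d' % i for i in range(len(inputs))],
                        ['fake_output'], {'fail': '{}'})
        except NotImplementedError:
            return False
        return True

This keeps the optimizer correct even when support depends on more than the reduce pattern, e.g. maximum is only implemented for some axis combinations.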
@@ -624,27 +628,28 @@ def local_gpu_sum(node):
                         new_mask.append(reduce_mask[i])
                         new_in_shp.append(x_shape[i])
 
-                    pattern = (''.join(str(i) for i in new_mask))
-                    new_gsum = GpuCAReduce(new_mask, theano.scalar.basic.add)
-                    if hasattr(new_gsum, 'c_code_reduce_%s' % pattern):
-                        reshaped_x = x.reshape(tensor.stack(*new_in_shp))
-                        sum_reshaped_x = host_from_gpu(
-                            new_gsum(gpu_from_host(reshaped_x)))
+                    new_greduce = GpuCAReduce(new_mask, scalar_op)
+                    reshaped_x = x.reshape(tensor.stack(*new_in_shp))
+                    gpu_reshaped_x = gpu_from_host(reshaped_x)
+                    reshaped_gpu_inputs = [gpu_reshaped_x]
+                    if new_greduce.supports_c_code(reshaped_gpu_inputs):
+                        reduce_reshaped_x = host_from_gpu(
+                            new_greduce(gpu_reshaped_x))
 
-                        if sum_reshaped_x.ndim != node.outputs[0].ndim:
-                            unreshaped_sum = sum_reshaped_x.reshape(
+                        if reduce_reshaped_x.ndim != node.outputs[0].ndim:
+                            unreshaped_reduce = reduce_reshaped_x.reshape(
                                 tensor.stack(*shape_of[node.outputs[0]]))
                         else:
-                            unreshaped_sum = sum_reshaped_x
-                        if unreshaped_sum.type == node.outputs[0].type:
-                            return [unreshaped_sum]
+                            unreshaped_reduce = reduce_reshaped_x
+                        if unreshaped_reduce.type == node.outputs[0].type:
+                            return [unreshaped_reduce]
                         else:
                             print >> sys.stderr, \
-                                    "WARNING: local_gpu_sum got type wrong"
+                                    "WARNING: local_gpu_careduce got type wrong"
                             return None
 
                     raise Exception(
-                        "GpuCAReduce don't have implemented the pattern",
+                        "GpuCAReduce does not yet implement this pattern:",
                         pattern)
     return False
...
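One caveat worth noting: the final raise still passes pattern, which the rewritten branch no longer appears to define, so reaching it would raise a NameError rather than the intended Exception. The reshape fallback itself follows the comments in the previous hunk: adjacent dimensions sharing the same reduce_mask value are folded together so a simpler kernel pattern can be used. A self-contained sketch of that folding (the helper name is mine, not code from this commit):

    def merge_reduce_mask(reduce_mask, x_shape):
        """Collapse runs of dims that share a reduce_mask value.

        E.g. mask (1, 1, 0) over shape (a, b, c) becomes mask (1, 0)
        over shape (a*b, c), a pattern the GPU kernels are more
        likely to implement.
        """
        new_mask, new_in_shp = [], []
        for i in range(len(reduce_mask)):
            if new_mask and new_mask[-1] == reduce_mask[i]:
                # same mask value as the previous dim: fold them together
                new_in_shp[-1] = new_in_shp[-1] * x_shape[i]
            else:
                new_mask.append(reduce_mask[i])
                new_in_shp.append(x_shape[i])
        return new_mask, new_in_shp

    # merge_reduce_mask([1, 1, 0], [2, 3, 4]) -> ([1, 0], [6, 4])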