提交 312da2df authored 作者: Frederic's avatar Frederic

Move more reduction to the GPU.

上级 e52009c7
......@@ -86,6 +86,29 @@ register_opt()(theano.tensor.opt.local_track_shape_i)
register_opt(name='gpu_constant_folding')(
tensor.opt.constant_folding)
# This is a partial list of CPU ops that can, in some circumstances,
# be moved to the GPU. This list is used by an optimization
# (it is consulted by local_gpu_careduce as a heuristic for whether
# the producer of a reduction's input is likely to be moved to the
# GPU as well). Hopefully, we can keep this list up to date.
import theano.tensor.signal.downsample
import theano.sandbox.neighbours
# NOTE: these are op *classes*, not op instances; callers test
# membership with `op.__class__ in cpu_ops_moved_to_gpu`.
cpu_ops_moved_to_gpu = [
    tensor.blas.Dot22, tensor.blas.Dot22Scalar, tensor.blas.Gemm,
    tensor.blas.Gemv, tensor.blas.Ger, tensor.nnet.conv.ConvOp,
    tensor.signal.downsample.DownsampleFactorMax,
    tensor.signal.downsample.DownsampleFactorMaxGrad,
    theano.sandbox.neighbours.Images2Neibs,
    tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias,
    tensor.nnet.CrossentropySoftmax1HotWithBiasDx,
    tensor.nnet.Softmax, tensor.nnet.SoftmaxWithBias,
    tensor.Elemwise, tensor.DimShuffle, tensor.CAReduce,
    tensor.elemwise.All, tensor.elemwise.Any,
    tensor.elemwise.CAReduceDtype, tensor.elemwise.Sum,
    tensor.elemwise.Prod, tensor.elemwise.ProdWithoutZeros,
    tensor.Reshape, tensor.Flatten, tensor.Subtensor,
    tensor.AdvancedSubtensor1, tensor.AdvancedIncSubtensor1,
    tensor.IncSubtensor, tensor.Shape, tensor.Join,
    tensor.Alloc, tensor.Eye]
class InputToGpuOptimizer(Optimizer):
"""
......@@ -617,7 +640,33 @@ def local_gpu_careduce(node):
if isinstance(node.op.scalar_op, (scal.Add, scal.Mul,
scal.Maximum, scal.Minimum)):
x, = node.inputs
replace = False
if x.owner and isinstance(x.owner.op, HostFromGpu):
replace = True
elif (all([c != "output" and isinstance(c.op, GpuFromHost)
for c, i in node.outputs[0].clients])
and x.owner and x.owner.op.__class__ in
cpu_ops_moved_to_gpu):
# It is not always good to transfer the reduction to
# the GPU when its clients are on the GPU but the
# reduction input is not. It means we would transfer
# the (bigger) input to the GPU instead of the
# (smaller) output if we stopped optimizing here.
# Most of the time, though, whatever created the
# reduction's input will also get moved to the GPU,
# in which case we do not introduce a bigger
# transfer. It is hard to know whether, after all
# optimizations, the bigger transfer will happen or
# not, so we use a heuristic: if the input of the
# reduction is generated by an op that can in some
# cases be moved to the GPU, assume it will be moved.
# If some CPU ops are supported on the GPU only in
# certain cases, this may move the reduction to the
# GPU when it was not a good idea.
replace = True
if replace:
if node.op.axis is None:
reduce_mask = [1] * x.type.ndim
else:
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论