提交 312da2df authored 作者: Frederic's avatar Frederic

Move more reduction to the GPU.

上级 e52009c7
......@@ -86,6 +86,29 @@ register_opt()(theano.tensor.opt.local_track_shape_i)
register_opt(name='gpu_constant_folding')(
tensor.opt.constant_folding)
# This is a partial list of CPU ops that can, in some circumstances,
# be moved to the GPU. This list is used by an optimization
# (it is consulted by local_gpu_careduce as a heuristic for whether
# the producer of a reduction's input is likely to be moved to the
# GPU as well). Hopefully, we can keep this list up to date.
import theano.tensor.signal.downsample
import theano.sandbox.neighbours
# NOTE: these are op *classes*, not op instances; callers test
# membership with `op.__class__ in cpu_ops_moved_to_gpu`.
cpu_ops_moved_to_gpu = [
    tensor.blas.Dot22, tensor.blas.Dot22Scalar, tensor.blas.Gemm,
    tensor.blas.Gemv, tensor.blas.Ger, tensor.nnet.conv.ConvOp,
    tensor.signal.downsample.DownsampleFactorMax,
    tensor.signal.downsample.DownsampleFactorMaxGrad,
    theano.sandbox.neighbours.Images2Neibs,
    tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias,
    tensor.nnet.CrossentropySoftmax1HotWithBiasDx,
    tensor.nnet.Softmax, tensor.nnet.SoftmaxWithBias,
    tensor.Elemwise, tensor.DimShuffle, tensor.CAReduce,
    tensor.elemwise.All, tensor.elemwise.Any,
    tensor.elemwise.CAReduceDtype, tensor.elemwise.Sum,
    tensor.elemwise.Prod, tensor.elemwise.ProdWithoutZeros,
    tensor.Reshape, tensor.Flatten, tensor.Subtensor,
    tensor.AdvancedSubtensor1, tensor.AdvancedIncSubtensor1,
    tensor.IncSubtensor, tensor.Shape, tensor.Join,
    tensor.Alloc, tensor.Eye]
class InputToGpuOptimizer(Optimizer):
"""
......@@ -617,7 +640,33 @@ def local_gpu_careduce(node):
if isinstance(node.op.scalar_op, (scal.Add, scal.Mul,
scal.Maximum, scal.Minimum)):
x, = node.inputs
replace = False
if x.owner and isinstance(x.owner.op, HostFromGpu):
replace = True
elif (all([c != "output" and isinstance(c.op, GpuFromHost)
for c, i in node.outputs[0].clients])
and x.owner and x.owner.op.__class__ in
cpu_ops_moved_to_gpu):
# It is not always good to transfer the reduction to
# the GPU when its clients are on the GPU but the
# reduction input is not. It means we would transfer
# the (bigger) input to the GPU instead of the
# (smaller) output if we stopped optimizing here.
# Most of the time, though, whatever created the
# reduction's input will also get moved to the GPU,
# in which case we do not introduce a bigger
# transfer. It is hard to know whether, after all
# optimizations, the bigger transfer will happen or
# not, so we use a heuristic: if the input of the
# reduction is generated by an op that can in some
# cases be moved to the GPU, assume it will be moved.
# If some CPU ops are supported on the GPU only in
# certain cases, this may move the reduction to the
# GPU when it was not a good idea.
replace = True
if replace:
if node.op.axis is None:
reduce_mask = [1] * x.type.ndim
else:
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论