[ENH] Make the careduce opt use the OpenCL version of the op on OpenCL devices

0ac9ec62 · Frederic · 58e9b7fb · 0ac9ec62 · 0ac9ec62
--- a/theano/sandbox/gpuarray/opt.py
+++ b/theano/sandbox/gpuarray/opt.py
@@ -31,7 +31,8 @@ from theano.sandbox.gpuarray.nnet import (
    GpuSoftmaxWithBias, GpuSoftmax
 )
 from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar,
-                                              GpuDimShuffle, GpuCAReduceCuda)
+                                              GpuDimShuffle, GpuCAReduceCuda,
+                                              GpuCAReduceCPY)
 from theano.sandbox.gpuarray.subtensor import (GpuIncSubtensor, GpuSubtensor,
                                               GpuAdvancedIncSubtensor1,
                                               GpuAdvancedIncSubtensor1_dev20)
@@ -366,15 +367,25 @@ def local_gpua_advanced_incsubtensor(node):
 def local_gpua_careduce(node):
    if isinstance(node.op.scalar_op, (scalar.Add, scalar.Mul,
                                      scalar.Maximum, scalar.Minimum)):
+        dev = theano.sandbox.gpuarray.init_dev.device
+        if dev.startswith('opencl'):
+            op = GpuCAReduceCPY
+            if node.op.scalar_op not in [scalar.add, scalar.mul]:
+                # We don't yet support all reductions with the cpy code.
+                return
+        else:
+            op = GpuCAReduceCuda
        x, = node.inputs
-        greduce = GpuCAReduceCuda(
+        greduce = op(
            node.op.scalar_op, axis=node.op.axis,
            dtype=getattr(node.op, 'dtype', None),
            acc_dtype=getattr(node.op, 'acc_dtype', None))
        gvar = greduce(x)
        # We need to have the make node called, otherwise the mask can
        # be None
-        if gvar.owner.op.supports_c_code([gpu_from_host(x)]):
+        if (op is GpuCAReduceCPY or
+            gvar.owner.op.supports_c_code([gpu_from_host(x)])):
            return greduce
        else:
            # Try to make a simpler pattern based on reshaping
@@ -407,7 +418,7 @@ def local_gpua_careduce(node):
            for idx, m in enumerate(new_mask):
                if m == 1:
                    new_axis.append(idx)
-            greduce = GpuCAReduceCuda(
+            greduce = op(
                node.op.scalar_op,
                axis=new_axis, reduce_mask=new_mask,
                dtype=getattr(node.op, 'dtype', None),

--- a/theano/sandbox/gpuarray/tests/test_opt.py
+++ b/theano/sandbox/gpuarray/tests/test_opt.py
@@ -7,7 +7,8 @@ import theano.sandbox.gpuarray
 from theano.sandbox.gpuarray.type import GpuArrayType
 from theano.sandbox.gpuarray.basic_ops import (
    GpuAlloc, GpuReshape, gpu_alloc, gpu_from_host, host_from_gpu)
-from theano.sandbox.gpuarray.elemwise import GpuCAReduceCuda, GpuElemwise
+from theano.sandbox.gpuarray.elemwise import (
+    GpuCAReduceCuda, GpuCAReduceCPY, GpuElemwise)
 from theano.sandbox.gpuarray.tests.test_basic_ops import (
    rand_gpuarray, mode_with_gpu, mode_without_gpu
    )
@@ -50,17 +51,26 @@ def test_flatten():
 def test_reduce():
-    for method in ['sum', 'prod', 'max', 'min']:
+    dev = theano.sandbox.gpuarray.init_dev.device
+    for method, param in [('sum', dict(acc_dtype='float32')),
+                          ('prod', dict(acc_dtype='float32')),
+                          ('max', {}), ('min', {})]:
        m = theano.tensor.fmatrix()
-        f = theano.function([m], getattr(m, method)(axis=0),
+        f = theano.function([m], getattr(m, method)(axis=0,
+                                                    **param),
                            mode=mode_with_gpu)
        val = numpy.random.rand(10, 11).astype("float32")
        res = f(val)
        utt.assert_allclose(res, getattr(val, method)(axis=0))
        assert res.shape == (11,)
        topo = f.maker.fgraph.toposort()
-        assert GpuCAReduceCuda in [type(node.op)
+        ops = [type(node.op) for node in topo]
-                                   for node in topo], topo
+        if dev.startswith('opencl') and method in ["max", "min"]:
+            assert not(GpuCAReduceCuda in ops or GpuCAReduceCPY in ops)
+        else:
+            assert GpuCAReduceCuda in ops or GpuCAReduceCPY in ops
 def test_local_gpualloc_memset_0():