Commit 9ad79667 authored by abergeron

Merge pull request #1835 from nouiz/gpureduce

Gpureduce: support multiple dtype, prod, max and min
@@ -364,8 +364,7 @@ def pfunc(params, outputs=None, mode=None, updates=None, givens=None,
         that are neither in "updates" nor in "no_default_updates".
 
     :type name: None or string
-    :param name: attaches a name to the Profiling result of this function when
-        using ProfileMode (will be deprecated).
+    :param name: attaches a name to the profiling result of this function.
 
     :type allow_input_downcast: Boolean
     :param allow_input_downcast: True means that the values passed as
......
@@ -258,7 +258,7 @@ class Container(object):
     """WRITEME
    :Parameters:
-    `r`: a variable
+    `r`: a Variable or a Type
     `storage`: a list of length 1, whose element is the value for `r`
     `readonly`: True indicates that this should not be setable by Function[r] = val
     `strict`: if True, we don't allow type casting.
......
@@ -215,7 +215,7 @@ if __name__ == "__main__":
     C1060             0.46s
     GTX Titan(D15U-50)0.06s  0.06s  don't work
-    GTX 680           0.12s         0.154s 0.218s
+    GTX 680           0.11s  0.12s  0.154s 0.218s
     GTX 580           0.16s  0.16s  0.164s 0.203s
     GTX 480           0.19s  0.19s  0.192s 0.237s 0.27s
     GTX 470           0.23s  0.23s  0.238s 0.297s 0.34s
......
@@ -442,7 +442,7 @@ def local_gpu_lazy_ifelse(node):
 @register_opt()
-@local_optimizer([gpu_from_host, tensor.blas._dot22])
+@local_optimizer([gpu_from_host, tensor.blas.Dot22])
 def local_gpu_dot22(node):
     """
     gpu_from_host(dot22) -> gpudot(gpu_from_host)
@@ -465,7 +465,7 @@ def local_gpu_dot22(node):
 @register_opt()
-@local_optimizer([gpu_from_host, tensor.blas._dot22scalar])
+@local_optimizer([gpu_from_host, tensor.blas.Dot22Scalar])
 def local_gpu_dot22scalar(node):
     """
     gpu_from_host(dot22scalar) -> gpudot(gpu_from_host)
@@ -571,7 +571,7 @@ def local_gpu_ger(node):
 @register_opt()
-@local_optimizer([tensor.blas.gemm_no_inplace, gpu_from_host])
+@local_optimizer([tensor.blas.Gemm, gpu_from_host])
 def local_gpu_gemm(node):
     """
     gpu_from_host(gemm) -> gpu_gemm(gpu_from_host)
......
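Note: the three BLAS hunks above only retarget the @local_optimizer registrations from specific op instances (tensor.blas._dot22, tensor.blas._dot22scalar, tensor.blas.gemm_no_inplace) to the corresponding op classes (Dot22, Dot22Scalar, Gemm); the lifting logic described in the docstrings, gpu_from_host(op) -> gpu_op(gpu_from_host), is unchanged. As a rough, Theano-independent sketch of that lifting pattern (the helper name and its parameters below are hypothetical, not Theano's actual API):

    def lift_through_transfer(node, cpu_op, gpu_op, gpu_from_host):
        # Sketch: if `node` moves the result of `cpu_op` to the GPU,
        # compute on the GPU instead and move the *inputs* there.
        # Returns the replacement outputs, or None if the pattern does not match.
        if node.op == gpu_from_host:
            producer = node.inputs[0].owner
            if producer is not None and producer.op == cpu_op:
                gpu_inputs = [gpu_from_host(i) for i in producer.inputs]
                return [gpu_op(*gpu_inputs)]
        return None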
@@ -344,14 +344,15 @@ def local_gpua_advanced_incsubtensor(node):
 @register_opt()
-@op_lifter([tensor.CAReduce, tensor.Sum])
+@op_lifter([tensor.CAReduce, tensor.Sum, tensor.elemwise.Prod])
 def local_gpua_careduce(node):
-    if (isinstance(node.op.scalar_op, scalar.basic.Add) or
-        isinstance(node.op.scalar_op, scalar.basic.Mul)):
+    if isinstance(node.op.scalar_op, (scalar.Add, scalar.Mul,
+                                      scalar.Maximum, scalar.Minimum)):
         x, = node.inputs
-        greduce = GpuCAReduceCuda(node.op.scalar_op, axis=node.op.axis)
-        if x.dtype != "float32":
-            return
+        greduce = GpuCAReduceCuda(
+            node.op.scalar_op, axis=node.op.axis,
+            dtype=getattr(node.op, 'dtype', None),
+            acc_dtype=getattr(node.op, 'acc_dtype', None))
         gvar = greduce(x)
         #We need to have the make node called, otherwise the mask can
         #be None
@@ -384,10 +385,21 @@ def local_gpua_careduce(node):
                 else:
                     new_mask.append(reduce_mask[i])
                     new_in_shp.append(x_shape[i])
-            new_greduce = GpuCAReduceCuda(new_mask, scalar_op)
+            new_axis = []
+            for idx, m in enumerate(new_mask):
+                if m == 1:
+                    new_axis.append(idx)
+            new_greduce = GpuCAReduceCuda(
+                node.op.scalar_op,
+                axis=new_axis, reduce_mask=new_mask,
+                dtype=getattr(node.op, 'dtype', None),
+                acc_dtype=getattr(node.op, 'acc_dtype', None))
 
             reshaped_x = x.reshape(tensor.stack(*new_in_shp))
             gpu_reshaped_x = gpu_from_host(reshaped_x)
+            gvar = greduce(gpu_reshaped_x)
+            #We need to have the make node called, otherwise the mask can
+            #be None
             reshaped_gpu_inputs = [gpu_reshaped_x]
             if new_greduce.supports_c_code(reshaped_gpu_inputs):
                 reduce_reshaped_x = host_from_gpu(
......
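In the second careduce hunk, the reduction axes are recomputed from the merged reduce mask before the op is rebuilt with explicit dtype/acc_dtype keywords. The mask-to-axes step is simply "collect the indices flagged 1"; a minimal standalone illustration (plain Python, with names chosen to mirror the diff):

    def mask_to_axes(reduce_mask):
        # A CAReduce-style mask has one entry per input dimension:
        # 1 means "reduce this axis", 0 means "keep it".
        return [idx for idx, m in enumerate(reduce_mask) if m == 1]

    assert mask_to_axes((1, 0, 1)) == [0, 2]   # reduce axes 0 and 2
    assert mask_to_axes((0, 0)) == []          # nothing to reduce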
@@ -2,9 +2,10 @@ from theano import scalar, gof
 from theano.gof.python25 import all, any
 from theano.tensor.tests.test_elemwise import (test_Broadcast, test_DimShuffle,
-                                               test_CAReduce)
-from theano.sandbox.gpuarray.tests.test_basic_ops import rand_gpuarray
+                                               test_CAReduce, T_reduce_dtype)
+from theano.sandbox.gpuarray.tests.test_basic_ops import (mode_with_gpu,
+                                                          rand_gpuarray)
 from theano.sandbox.gpuarray.elemwise import (GpuElemwise, GpuDimShuffle,
                                               GpuCAReduceCuda, GpuCAReduceCPY)
 from theano.sandbox.gpuarray.type import GpuArrayType
@@ -47,6 +48,8 @@ class test_GpuCAReduceCPY(test_CAReduce):
     def test_perform_nan(self):
         for dtype in self.dtypes:
+            if not dtype.startswith('float'):
+                continue
             for op in self.reds:
                 self.with_linker(gof.PerformLinker(), op, dtype=dtype,
                                  test_nan=True)
@@ -58,6 +61,8 @@ class test_GpuCAReduceCPY(test_CAReduce):
     def test_c_nan(self):
         for dtype in self.dtypes:
+            if not dtype.startswith('float'):
+                continue
             for op in self.reds:
                 self.with_linker(gof.CLinker(), op, dtype=dtype,
                                  test_nan=True)
@@ -68,9 +73,9 @@ class test_GpuCAReduceCPY(test_CAReduce):
 class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
-    dtypes = ["float32"]
+    dtypes = ["float32", "int64"]
     bin_dtypes = ["uint8", "int8"]
+    bin_dtypes = []
 
     cases = [((5, 6), None),
              ((5, 6), (0, 1)),
              ((5, 6), (0, )),
@@ -129,9 +134,10 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
             ((4100,4,3,2),[0,2,3]),((4,4100,3,2),[0,2,3]),((4,3,4100,2),[0,2,3]),#((4,3,2,4100),[0,2,3]),#1011
             ((4100,4,3,2),[1,2,3]),((4,4100,3,2),[1,2,3]),((4,3,4100,2),[1,2,3]),((4,3,2,4100),[1,2,3]),#0111
             ((65,4,3,2),[1,2,3]),((4,65,3,2),[1,2,3]),((4,3,65,2),[1,2,3]),((4,3,2,65),[1,2,3]),#0111
-            ((4100,2,3,4),[0,1,2,3]),((2,4100,3,4),[0,1,2,3]),((2,3,4100,4),[0,1,2,3]),((2,3,4,4100),[0,1,2,3]),((128,1,3,3), [0,1,2,3]),#1111
+            ((4100,2,3,4),[0,1,2,3]),((2,4100,3,4),[0,1,2,3]),((2,3,4100,4),[0,1,2,3]),((2,3,4,4100),[0,1,2,3]),((128,1,2,3), [0,1,2,3]),#1111
             #test pattern implemented by reshape
+            #Skip them as this test the op directly, not the optimization with reshape
             # ((4100,4,3,2),[0]),((4,4100,3,2),[0]),((4,3,4100,2),[0]),((4,3,2,4100),[0]),#1000
             # ((4100,4,3,2),[1]),((4,4100,3,2),[1]),((4,3,4100,2),[1]),((4,3,2,4100),[1]),#0100
             # ((4100,4,3,2),[2]),((4,4100,3,2),[2]),((4,3,4100,2),[2]),((4,3,2,4100),[2]),#0010
@@ -140,10 +146,18 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
             # ((5,4,3,10,11),[1,2]),
             ]
     op = GpuCAReduceCuda
-    reds = [scalar.add, scalar.mul]
+    reds = [scalar.add, scalar.mul,
+            scalar.maximum, scalar.minimum]
 
     def test_perform(self):
         return
 
     def test_perform_nan(self):
         return
+
+
+class T_gpureduce_dtype(T_reduce_dtype):
+    mode = mode_with_gpu.excluding('local_cut_useless_reduce')
+    op = GpuCAReduceCuda
+    #Currently we don't support reduction on 0 axis
+    axes = [None, 0, 1, 1, [0], [1], [0, 1]]
@@ -46,16 +46,18 @@ def test_flatten():
             for node in f.maker.fgraph.toposort()]
 
 
-def test_sum_prod():
-    for method in ['sum']:
+def test_reduce():
+    for method in ['sum', 'prod', 'max', 'min']:
         m = theano.tensor.fmatrix()
-        f = theano.function([m], getattr(m, method)(), mode=mode_with_gpu)
+        f = theano.function([m], getattr(m, method)(axis=0),
+                            mode=mode_with_gpu)
         val = numpy.random.rand(10, 11).astype("float32")
         res = f(val)
-        utt.assert_allclose(res, val.sum())
-        assert res.shape == ()
+        utt.assert_allclose(res, getattr(val, method)(axis=0))
+        assert res.shape == (11,)
+        topo = f.maker.fgraph.toposort()
         assert GpuCAReduceCuda in [type(node.op)
-                                   for node in f.maker.fgraph.toposort()]
+                                   for node in topo], topo
 
 
 def test_local_gpualloc_memset_0():
......
@@ -2335,7 +2335,10 @@ class Expm1(UnaryScalarOp):
     def c_code(self, node, name, (x, ), (z, ), sub):
         if node.inputs[0].type in complex_types:
             raise NotImplementedError('type not supported', type)
-        return "%(z)s = exp(%(x)s) - 1;" % locals()
+        return "%(z)s = expm1(%(x)s);" % locals()
+
+    def c_code_cache_version(self):
+        return (5,)
 
 expm1 = Expm1(upgrade_to_float, name='expm1')
......
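The Expm1.c_code change swaps the literal exp(x) - 1 for the C library's expm1, which avoids the cancellation error of subtracting two nearly equal numbers when x is close to zero (bumping c_code_cache_version forces the cached compiled code to be rebuilt). A quick NumPy check of the numerical difference, outside of Theano:

    import numpy as np

    x = 1e-12
    naive = np.exp(x) - 1.0   # cancellation: roughly 1.0000889e-12 here
    better = np.expm1(x)      # accurate to machine precision: roughly 1.0e-12
    print(naive, better)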