Commit 6657d35b authored by goodfeli

Merge pull request #947 from nouiz/gpu_reduce

Gpu reduce small change.
@@ -373,7 +373,12 @@ def use(device,
if test_driver:
import theano.sandbox.cuda.tests.test_driver
theano.sandbox.cuda.tests.test_driver.test_nvidia_driver1()
if device_properties(use.device_number)["warpSize"] != 32:
raise ValueError("Your GPU have a warpSize of 32. Currently"
" we have code that depend on this. Email"
" Theano mailing list to tell us about"
" this new GPU as we don't know any with"
" this properties")
if move_shared_float32_to_gpu:
handle_shared_float32(True)
......
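For context, the new guard relies on device_properties exposing the CUDA warpSize attribute of the selected device. A minimal standalone sketch of the same check using PyCUDA (PyCUDA is an assumption here; the diff itself goes through Theano's own device_properties):

import pycuda.driver as drv

drv.init()
dev = drv.Device(0)  # the same device number that use() would select
warp_size = dev.get_attribute(drv.device_attribute.WARP_SIZE)
if warp_size != 32:
    # Mirrors the ValueError raised above in cuda/__init__.py.
    raise ValueError("warpSize is %d, but the GPU code assumes 32" % warp_size)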
@@ -624,8 +624,8 @@ class GpuCAReduce(GpuOp):
# but tensor.elemwise.CAReduce has this exact same check so I guess
# this is OK to do
if self.scalar_op in [scal.minimum, scal.maximum]:
conds = []
for i in xrange(nd_in):
conds = []
if self.reduce_mask[i]:
conds.append("(CudaNdarray_HOST_DIMS(%(x)s)[%(i)s] == 0)" % locals())
assert len(conds) > 0
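The visible change in this hunk appears to hoist conds = [] out of the xrange(nd_in) loop; re-initializing the list on every iteration would keep only the zero-size check for the last reduced dimension, defeating the assert above. A reduced sketch of the before/after behaviour (function and variable names are illustrative, not from the diff):

def conds_buggy(nd_in, reduce_mask):
    # Reset inside the loop: only the last reduced dim's check survives.
    for i in range(nd_in):
        conds = []
        if reduce_mask[i]:
            conds.append("(dim[%d] == 0)" % i)
    return conds

def conds_fixed(nd_in, reduce_mask):
    # Initialized once: one check accumulates per reduced dim.
    conds = []
    for i in range(nd_in):
        if reduce_mask[i]:
            conds.append("(dim[%d] == 0)" % i)
    return conds

assert conds_buggy(3, (1, 0, 1)) == ["(dim[2] == 0)"]
assert conds_fixed(3, (1, 0, 1)) == ["(dim[0] == 0)", "(dim[2] == 0)"]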
@@ -723,7 +723,7 @@ class GpuCAReduce(GpuOp):
if (verbose)
printf("running kernel_reduce_10_%(name)s\\n");
int n_shared = sizeof(float) * n_threads.x;
int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z;
kernel_reduce_10_%(name)s<<<n_blocks, n_threads,
n_shared>>>(
CudaNdarray_HOST_DIMS(%(x)s)[0],
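This hunk enlarges the dynamic shared-memory allocation for kernel_reduce_10: every thread in the block writes one float into the extern shared buffer, so the byte count must cover all three block dimensions, not just x. A small illustration with a hypothetical 16x16x1 thread block (sizes are illustrative):

sizeof_float = 4
n_threads_x, n_threads_y, n_threads_z = 16, 16, 1

n_shared_old = sizeof_float * n_threads_x                               # 64 bytes
n_shared_new = sizeof_float * n_threads_x * n_threads_y * n_threads_z  # 1024 bytes

# With the old formula, 256 threads would share a 64-byte buffer and the
# reduction would read and write past the end of buf[].
assert n_shared_new == 256 * sizeof_float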
@@ -862,11 +862,10 @@ class GpuCAReduce(GpuOp):
extern __shared__ float buf[];
float myresult = 0.0f;
//This is caught in cuda/init.py when we init the gpu. I keep
//it here to ease finding code that relies on this.
if (warpSize != 32)
{
// TODO: set error code
// 2012-09-20 IG: as of today, Fred says he will check
// this elsewhere, in a different PR
Z[0] = -666;
return;
}
......
@@ -42,7 +42,15 @@ def tes_use():
tcn.use()
def test_sum():
def tensor_pattern_to_gpu_pattern(shape, pattern):
gpu_pattern = [0 for elem in shape]
for idx in pattern:
gpu_pattern[idx] = 1
gpu_pattern = tuple(gpu_pattern)
return gpu_pattern
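The new module-level helper turns an axis list into the 0/1 reduce mask that the GPU op's patterns refer to (1 = reduced dimension, 0 = kept). For example, worked out from the function body:

assert tensor_pattern_to_gpu_pattern((5, 4, 3), [1, 2]) == (0, 1, 1)   # mask 011
assert tensor_pattern_to_gpu_pattern((4100, 3), [0]) == (1, 0)         # mask 10
assert tensor_pattern_to_gpu_pattern((33, 31), [0, 1]) == (1, 1)       # mask 11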
def test_careduce():
"""
test sum pattern 1, 11, 10, 01, 001, 010, 100, 110, 011, 111,
0011, 0101, 0111, 1011, 1111
@@ -56,363 +64,198 @@ def test_sum():
TODO: test with broadcast
"""
for shape, pattern in [((1,1),(1,)),
((1,0),(1,)),
((0,1),(1,)),
((0,0),(1,)),
((0,0,0),(1,2)),
((0,0,0,0),(1,2,3)),
((2,1),(1,)),
((1,2),(1,)),
((100,3,1300),[1]),
((0,),[0]),((5,),[0]),
((0,0),[0,1]),((1,0),[0,1]),((5,4),[0,1]),((33,31),[0,1]),((5,4),[1]),((5,4),[0]),#need something bigger than 32 for some opt tests.
((5,4,3),[0]),((5,4,3),[1]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[1,2]),((5,4,3),[0,1,2]),
((0,0,0,0),[0,1,2,3]),
((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]),((5,4,3,2),[1,2,3]),
((5,4,3,10,11),[1,2]),
((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]),((5,4,3,2),[1,2,3]),
#test shapes bigger than 4096 on each dimension to make sure that we work correctly when we don't have enough threads/blocks in each dimension
((4100,3),[0]),((3,4101),[0]),#10
((1024,33),[0]),((33,1024),[0]),#10
((1025,33),[0]),((33,1025),[0]),#10
((4100,3),[1]),((3,4101),[1]),#01
((1024,33),[1]),((33,1024),[1]),#01
((1025,33),[1]),((33,1025),[1]),#01
((4100,3),[0,1]),((3,4101),[0,1]),#11
((1024,33),[0,1]),((33,1024),[0,1]),#01
((1025,33),[0,1]),((33,1025),[0,1]),#01
((4100,4,3),[0]),((5,4100,3),[0]),((5,4,4100),[0]),#100
((4100,4,3),[1]),((5,4100,3),[1]),((5,4,4100),[1]),#010
((4100,4,3),[2]),((5,4100,3),[2]),((5,4,4100),[2]),#001
((4100,4,3),[0,1]),((5,4100,3),[0,1]),((5,4,4100),[0,1]),#110
((4100,4,3),[1,2]),((5,4100,3),[1,2]),((5,4,4100),[1,2]),#011
#((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented
((4100,4,3),[0,1,2]),((5,4100,3),[0,1,2]),((5,4,4100),[0,1,2]),#111
((4100,4,3,2),[2,3]),((4,4100,3,2),[2,3]),((4,3,4100,2),[2,3]),((4,3,2,4100),[2,3]),#0011
((4100,4,3,2),[1,3]),((4,4100,3,2),[1,3]),((4,3,4100,2),[1,3]),((4,3,2,4100),[1,3]),#0101
((4100,4,3,2),[0,2,3]),((4,4100,3,2),[0,2,3]),((4,3,4100,2),[0,2,3]),#((4,3,2,4100),[0,2,3]),#1011
((4100,4,3,2),[1,2,3]),((4,4100,3,2),[1,2,3]),((4,3,4100,2),[1,2,3]),((4,3,2,4100),[1,2,3]),#0111
((4100,2,3,4),[0,1,2,3]),((2,4100,3,4),[0,1,2,3]),((2,3,4100,4),[0,1,2,3]),((2,3,4,4100),[0,1,2,3]),#1111
#test pattern implemented by reshape
((4100,4,3,2),[0]),((4,4100,3,2),[0]),((4,3,4100,2),[0]),((4,3,2,4100),[0]),#1000
((4100,4,3,2),[1]),((4,4100,3,2),[1]),((4,3,4100,2),[1]),((4,3,2,4100),[1]),#0100
((4100,4,3,2),[2]),((4,4100,3,2),[2]),((4,3,4100,2),[2]),((4,3,2,4100),[2]),#0010
((4100,4,3,2),[3]),((4,4100,3,2),[3]),((4,3,4100,2),[3]),((4,3,2,4100),[3]),#0001
((1100,2,3,4,5),[0,1,2,3,4]),((2,1100,3,4,5),[0,1,2,3,4]),((2,3,1100,4,5),[0,1,2,3,4]),((2,3,4,1100,5),[0,1,2,3,4]),((2,3,4,5,1100),[0,1,2,3,4]),#11111
]:
a = tensor.TensorType('float32', (False,) * len(shape))()
b = T.Sum(pattern)(a)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
val = theano._asarray(val, dtype='float32')
f = theano.function([a], b, mode=mode_with_gpu)
f2 = theano.function([a], b, mode=mode_without_gpu)
assert tcn.GpuCAReduce in [x.op.__class__ for x in f.maker.fgraph.toposort()]
assert T.Sum in [x.op.__class__ for x in f2.maker.fgraph.toposort()]
if val.size == 0:
assert _allclose(f2(val), f(val)), ('shape', shape, 'pattern', pattern)
else:
try:
#We raise the error threshold as we sum big matrices,
#which causes small rounding differences with some seeds,
#for example in debug mode with unittests.rseed=9275
orig_rtol = theano.tensor.basic.float32_rtol
theano.tensor.basic.float32_rtol = 2e-5
assert _allclose(f2(val), f(val)), ('shape', shape,
'pattern', pattern,
sum([shape[i] for i in pattern]),
f2(val), f(val), val)
finally:
theano.tensor.basic.float32_rtol = orig_rtol
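The try/finally around float32_rtol is the test's way of widening the comparison tolerance for this one assert only. The same idea as a hypothetical context manager (not part of the diff; relaxed_float32_rtol is an invented name):

from contextlib import contextmanager

@contextmanager
def relaxed_float32_rtol(rtol=2e-5):
    # Widen the tolerance for the duration of a block, then always
    # restore the original value, even if the assert fails.
    orig_rtol = theano.tensor.basic.float32_rtol
    theano.tensor.basic.float32_rtol = rtol
    try:
        yield
    finally:
        theano.tensor.basic.float32_rtol = orig_rtol

# usage sketch:
# with relaxed_float32_rtol():
#     assert _allclose(f2(val), f(val))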
#test with dimshuffle
#we shuffle the 2 outer dims.
for shape, pattern in [#((5,),[0]),
((5,4),[0,1]),((5,4),[0]),
((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[0,1,2]),
((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]:
a = tensor.TensorType('float32', (False,) * len(shape))()
dim_pattern = range(len(shape))
dim_pattern[0] = 1
dim_pattern[1] = 0
a = a.dimshuffle(dim_pattern)
b = T.Sum(pattern)(a)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
val = theano._asarray(val, dtype='float32')
f = theano.function([a], b, mode=mode_with_gpu)
f2 = theano.function([a], b, mode=mode_without_gpu)
assert tcn.GpuCAReduce in [x.op.__class__ for x in f.maker.fgraph.toposort()]
assert T.Sum in [x.op.__class__ for x in f2.maker.fgraph.toposort()]
assert _allclose(f2(val), f(val)), ('shape', shape,
'pattern', pattern,
sum([shape[i] for i in pattern]))
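The dim_pattern built above is just range(len(shape)) with its first two entries swapped, so the dimshuffle presents the input with the two outer dimensions exchanged. Concretely, for a rank-3 shape:

shape = (5, 4, 3)
dim_pattern = list(range(len(shape)))   # [0, 1, 2]
dim_pattern[0] = 1
dim_pattern[1] = 0
assert dim_pattern == [1, 0, 2]
# a.dimshuffle([1, 0, 2]) then has shape (4, 5, 3).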
#test with broadcast
for shape, pattern in [((5,),[0]),
((5,4),[0,1]),((5,4),[0]),
((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[0,1,2]),
((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]:
shape = numpy.asarray(shape) * 2
a = tensor.TensorType('float32', (False,) * len(shape))()
a2 = tcn.CudaNdarrayType((False,) * len(shape))()
b = T.Sum(pattern)(a)
b2 = T.Sum(pattern)(a2)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
val = theano._asarray(val, dtype='float32')
val2 = cuda.CudaNdarray(val)
if len(shape) == 1:
val = val[::2]
val2 = val2[::2]
elif len(shape) == 2:
val = val[::2, ::2]
val2 = val2[::2, ::2]
elif len(shape) == 3:
val = val[::2, ::2, ::2]
val2 = val2[::2, ::2, ::2]
elif len(shape) == 4:
val = val[::2, ::2, ::2, ::2]
val2 = val2[::2, ::2, ::2, ::2]
f = theano.function([a], b, mode=mode_without_gpu)
f2 = theano.function([a2], b2, mode=mode_with_gpu)
assert tcn.GpuCAReduce in [x.op.__class__ for x in f2.maker.fgraph.toposort()]
assert T.Sum in [x.op.__class__ for x in f.maker.fgraph.toposort()]
assert _allclose(f2(val2), f(val)), ('shape', shape,
'pattern', pattern,
sum([shape[i] for i in pattern]))
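The [::2] slicing in this loop is what makes the inputs non-contiguous, so the GPU reduction is exercised on strided views rather than C-contiguous buffers (the shapes were doubled beforehand so the strided view gets back the original size). A numpy-only sketch of the effect:

v = numpy.arange(100, dtype='float32').reshape(10, 10)
w = v[::2, ::2]   # shape (5, 5), but the strides skip every other element
assert w.shape == (5, 5)
assert not w.flags['C_CONTIGUOUS']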
def test_max():
"""
test GpuMax pattern 01, 011, 0111 (tensor.max pattern (1,), (1,2), (1,2,3) )
for scalar_op in [theano.scalar.add, theano.scalar.maximum]:
for shape, pattern in [((1,1),(1,)),
((1,0),(1,)),
((0,1),(1,)),
((0,0),(1,)),
((0,0,0),(1,2)),
((0,0,0,0),(1,2,3)),
((2,1),(1,)),
((1,2),(1,)),
((100,3,1300),[1]),
((0,),[0]),((5,),[0]),
((0,0),[0,1]),((1,0),[0,1]),((5,4),[0,1]),((33,31),[0,1]),((5,4),[1]),((5,4),[0]),#need something bigger than 32 for some opt tests.
((5,4,3),[0]),((5,4,3),[1]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[1,2]),((5,4,3),[0,1,2]),
((0,0,0,0),[0,1,2,3]),
((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]),((5,4,3,2),[1,2,3]),
((5,4,3,10,11),[1,2]),
((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]),((5,4,3,2),[1,2,3]),
#test shapes bigger than 4096 on each dimension to make sure that we work correctly when we don't have enough threads/blocks in each dimension
((4100,3),[0]),((3,4101),[0]),#10
((1024,33),[0]),((33,1024),[0]),#10
((1025,33),[0]),((33,1025),[0]),#10
((4100,3),[1]),((3,4101),[1]),#01
((1024,33),[1]),((33,1024),[1]),#01
((1025,33),[1]),((33,1025),[1]),#01
((4100,3),[0,1]),((3,4101),[0,1]),#11
((1024,33),[0,1]),((33,1024),[0,1]),#01
((1025,33),[0,1]),((33,1025),[0,1]),#01
((4100,4,3),[0]),((5,4100,3),[0]),((5,4,4100),[0]),#100
((4100,4,3),[1]),((5,4100,3),[1]),((5,4,4100),[1]),#010
((4100,4,3),[2]),((5,4100,3),[2]),((5,4,4100),[2]),#001
((4100,4,3),[0,1]),((5,4100,3),[0,1]),((5,4,4100),[0,1]),#110
((4100,4,3),[1,2]),((5,4100,3),[1,2]),((5,4,4100),[1,2]),#011
#((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented
((4100,4,3),[0,1,2]),((5,4100,3),[0,1,2]),((5,4,4100),[0,1,2]),#111
((4100,4,3,2),[2,3]),((4,4100,3,2),[2,3]),((4,3,4100,2),[2,3]),((4,3,2,4100),[2,3]),#0011
((4100,4,3,2),[1,3]),((4,4100,3,2),[1,3]),((4,3,4100,2),[1,3]),((4,3,2,4100),[1,3]),#0101
((4100,4,3,2),[0,2,3]),((4,4100,3,2),[0,2,3]),((4,3,4100,2),[0,2,3]),#((4,3,2,4100),[0,2,3]),#1011
((4100,4,3,2),[1,2,3]),((4,4100,3,2),[1,2,3]),((4,3,4100,2),[1,2,3]),((4,3,2,4100),[1,2,3]),#0111
((4100,2,3,4),[0,1,2,3]),((2,4100,3,4),[0,1,2,3]),((2,3,4100,4),[0,1,2,3]),((2,3,4,4100),[0,1,2,3]),#1111
#test pattern implemented by reshape
((4100,4,3,2),[0]),((4,4100,3,2),[0]),((4,3,4100,2),[0]),((4,3,2,4100),[0]),#1000
((4100,4,3,2),[1]),((4,4100,3,2),[1]),((4,3,4100,2),[1]),((4,3,2,4100),[1]),#0100
((4100,4,3,2),[2]),((4,4100,3,2),[2]),((4,3,4100,2),[2]),((4,3,2,4100),[2]),#0010
((4100,4,3,2),[3]),((4,4100,3,2),[3]),((4,3,4100,2),[3]),((4,3,2,4100),[3]),#0001
((1100,2,3,4,5),[0,1,2,3,4]),((2,1100,3,4,5),[0,1,2,3,4]),((2,3,1100,4,5),[0,1,2,3,4]),((2,3,4,1100,5),[0,1,2,3,4]),((2,3,4,5,1100),[0,1,2,3,4]),#11111
]:
op = tensor.CAReduce(scalar_op, axis=pattern)
pat = tensor_pattern_to_gpu_pattern(shape, pattern)
#GpuCAReduce{maximum} supports only these patterns
if scalar_op is theano.scalar.maximum and pat not in [
(0, 1), (0, 1, 1), (0, 1, 1, 1)]:
continue
TODO: are others currently implemented by reshape?
"""
a = tensor.TensorType('float32', (False,) * len(shape))()
b = op(a)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
val = theano._asarray(val, dtype='float32')
f = theano.function([a], b, mode=mode_with_gpu)
f2 = theano.function([a], b, mode=mode_without_gpu)
assert tcn.GpuCAReduce in [x.op.__class__
for x in f.maker.fgraph.toposort()]
assert op.__class__ in [x.op.__class__
for x in f2.maker.fgraph.toposort()]
f_caused_value_error = False
try:
f_out = f(val)
except ValueError, e:
exc = e
f_caused_value_error = True
def tensor_pattern_to_gpu_pattern(shape, pattern):
gpu_pattern = [ 0 for elem in shape ]
for idx in pattern:
gpu_pattern[idx] = 1
gpu_pattern = tuple(gpu_pattern)
return gpu_pattern
known_fail = False
for shape, pattern in [((1,1),(1,)),
((1,0),(1,)),
((0,1),(1,)),
((0,0),(1,)),
((0,0,0),(1,2)),
((0,0,0,0),(1,2,3)),
((2,1),(1,)),
((1,2),(1,)),
((100,3,1300),[1]),
((0,),[0]),((5,),[0]),
((0,0),[0,1]),((1,0),[0,1]),((5,4),[0,1]),((33,31),[0,1]),((5,4),[1]),((5,4),[0]),#need something bigger than 32 for some opt tests.
((5,4,3),[0]),((5,4,3),[1]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[1,2]),((5,4,3),[0,1,2]),
((0,0,0,0),[0,1,2,3]),
((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]),((5,4,3,2),[1,2,3]),
((5,4,3,10,11),[1,2]),
((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]),((5,4,3,2),[1,2,3]),
#test shapes bigger than 4096 on each dimension to make sure that we work correctly when we don't have enough threads/blocks in each dimension
((4100,3),[0]),((3,4101),[0]),#10
((1024,33),[0]),((33,1024),[0]),#10
((1025,33),[0]),((33,1025),[0]),#10
((4100,3),[1]),((3,4101),[1]),#01
((1024,33),[1]),((33,1024),[1]),#01
((1025,33),[1]),((33,1025),[1]),#01
((4100,3),[0,1]),((3,4101),[0,1]),#11
((1024,33),[0,1]),((33,1024),[0,1]),#01
((1025,33),[0,1]),((33,1025),[0,1]),#01
((4100,4,3),[0]),((5,4100,3),[0]),((5,4,4100),[0]),#100
((4100,4,3),[1]),((5,4100,3),[1]),((5,4,4100),[1]),#010
((4100,4,3),[2]),((5,4100,3),[2]),((5,4,4100),[2]),#001
((4100,4,3),[0,1]),((5,4100,3),[0,1]),((5,4,4100),[0,1]),#110
((4100,4,3),[1,2]),((5,4100,3),[1,2]),((5,4,4100),[1,2]),#011
#((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented
((4100,4,3),[0,1,2]),((5,4100,3),[0,1,2]),((5,4,4100),[0,1,2]),#111
((4100,4,3,2),[2,3]),((4,4100,3,2),[2,3]),((4,3,4100,2),[2,3]),((4,3,2,4100),[2,3]),#0011
((4100,4,3,2),[1,3]),((4,4100,3,2),[1,3]),((4,3,4100,2),[1,3]),((4,3,2,4100),[1,3]),#0101
((4100,4,3,2),[0,2,3]),((4,4100,3,2),[0,2,3]),((4,3,4100,2),[0,2,3]),#((4,3,2,4100),[0,2,3]),#1011
((4100,4,3,2),[1,2,3]),((4,4100,3,2),[1,2,3]),((4,3,4100,2),[1,2,3]),((4,3,2,4100),[1,2,3]),#0111
((4100,2,3,4),[0,1,2,3]),((2,4100,3,4),[0,1,2,3]),((2,3,4100,4),[0,1,2,3]),((2,3,4,4100),[0,1,2,3]),#1111
#test pattern implemented by reshape
((4100,4,3,2),[0]),((4,4100,3,2),[0]),((4,3,4100,2),[0]),((4,3,2,4100),[0]),#1000
((4100,4,3,2),[1]),((4,4100,3,2),[1]),((4,3,4100,2),[1]),((4,3,2,4100),[1]),#0100
((4100,4,3,2),[2]),((4,4100,3,2),[2]),((4,3,4100,2),[2]),((4,3,2,4100),[2]),#0010
((4100,4,3,2),[3]),((4,4100,3,2),[3]),((4,3,4100,2),[3]),((4,3,2,4100),[3]),#0001
((1100,2,3,4,5),[0,1,2,3,4]),((2,1100,3,4,5),[0,1,2,3,4]),((2,3,1100,4,5),[0,1,2,3,4]),((2,3,4,1100,5),[0,1,2,3,4]),((2,3,4,5,1100),[0,1,2,3,4]),#11111
]:
# Don't test patterns that aren't implemented for max yet
if tensor_pattern_to_gpu_pattern(shape, pattern) not in \
[ (0,1), (0,1,1), (0,1,1,1) ]:
continue
a = tensor.TensorType('float32', (False,) * len(shape))()
b = T.max(a, pattern)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
val = theano._asarray(val, dtype='float32')
f = theano.function([a], b, mode=mode_with_gpu)
f_caused_value_error = False
try:
f_out = f(val)
except ValueError, e:
exc = e
f_caused_value_error = True
f2 = theano.function([a], b, mode=mode_without_gpu)
try:
f2_out = f2(val)
f2_caused_value_error = False
except ValueError, e:
exc2 = e
f2_caused_value_error = True
assert tcn.GpuCAReduce in [x.op.__class__ for x in f.maker.fgraph.toposort()]
assert T.CAReduce in [x.op.__class__ for x in f2.maker.fgraph.toposort()]
try:
f2_out = f2(val)
except ValueError, e:
exc2 = e
f2_caused_value_error = True
if f_caused_value_error != f2_caused_value_error:
if f_caused_value_error:
print 'f caused this value error:'
print exc
else:
print 'f did not raise a value error, but should have'
if f2_caused_value_error:
print 'f2 caused this value error:'
print exc2
else:
print 'f should not have raised a value error'
print 'shape was: ', shape
print 'pattern was: ', pattern
assert False
# Check that 0 shape matrices are invalid in the same cases
if f_caused_value_error != f2_caused_value_error:
if f_caused_value_error:
print 'f caused this value error:'
print exc
else:
print 'f did not raise a value error, but should have'
if f2_caused_value_error:
print 'f2 caused this value error:'
print exc2
else:
print 'f should not have raised a value error'
print 'shape was: ',shape
print 'pattern was: ',pattern
assert False
if f_caused_value_error:
continue
if val.size == 0:
assert f2(val).size == f(val).size
assert f2(val).shape == f(val).shape
else:
try:
#We raise the error threshold as we sum big matrices,
#which causes small rounding differences with some seeds,
#for example in debug mode with unittests.rseed=9275
orig_rtol = theano.tensor.basic.float32_rtol
theano.tensor.basic.float32_rtol = 2e-5
f2_val = f2(val)
f_val = f(val)
if not _allclose(f2_val, f_val):
print 'failed for the following arguments: '
print 'shape:',shape
print 'pattern: ',pattern
print 'input:'
print val
print 'correct output: '
print f2_val
print 'actual output: '
print f_val
assert False
assert _allclose(f_out, f2_out), ('shape', shape,
'pattern', pattern,
sum([shape[i] for i in pattern]),
f2(val), f(val), val)
finally:
theano.tensor.basic.float32_rtol = orig_rtol
#test with dimshuffle
#we shuffle the 2 outer dims.
for shape, pattern in [#((5,),[0]),
((5,4),(0,1)),((5,4),[0]),
((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[0,1,2]),
((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]:
# Don't test patterns that aren't implemented for max yet
if tensor_pattern_to_gpu_pattern(shape, pattern) not in \
[ (0,1), (0,1,1), (0,1,1,1) ]:
continue
a = tensor.TensorType('float32', (False,) * len(shape))()
dim_pattern = range(len(shape))
dim_pattern[0] = 1
dim_pattern[1] = 0
a = a.dimshuffle(dim_pattern)
b = T.max(a, pattern)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
val = theano._asarray(val, dtype='float32')
f = theano.function([a], b, mode=mode_with_gpu)
f2 = theano.function([a], b, mode=mode_without_gpu)
assert tcn.GpuCAReduce in [x.op.__class__ for x in f.maker.fgraph.toposort()]
assert T.CAReduce in [x.op.__class__ for x in f2.maker.fgraph.toposort()]
assert _allclose(f2(val), f(val)), ('shape', shape,
'pattern', pattern,
sum([shape[i] for i in pattern]))
#test with broadcast
for shape, pattern in [((5,),(0,)),
((5,4),(0,1)),
((5,4),(0,)),
((5,4,3),(0,)),
((5,4,3),(0,1)),
((5,4,3),(2,)),
((5,4,3),(0,1,2)),
((5,4,3,2),(0,1,2,3)),
((5,4,3,2),(0,2,3))]:
# Don't test patterns that aren't implemented for max yet
if tensor_pattern_to_gpu_pattern(shape, pattern) not in \
[ (0,1), (0,1,1), (0,1,1,1) ]:
continue
shape = numpy.asarray(shape) * 2
a = tensor.TensorType('float32', (False,) * len(shape))()
a2 = tcn.CudaNdarrayType((False,) * len(shape))()
b = T.max(a, pattern)
b2 = T.max(a2, pattern)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
val = theano._asarray(val, dtype='float32')
val2 = cuda.CudaNdarray(val)
if len(shape) == 1:
val = val[::2]
val2 = val2[::2]
elif len(shape) == 2:
val = val[::2, ::2]
val2 = val2[::2, ::2]
elif len(shape) == 3:
val = val[::2, ::2, ::2]
val2 = val2[::2, ::2, ::2]
elif len(shape) == 4:
val = val[::2, ::2, ::2, ::2]
val2 = val2[::2, ::2, ::2, ::2]
f = theano.function([a], b, mode=mode_without_gpu)
f2 = theano.function([a2], b2, mode=mode_with_gpu)
assert tcn.GpuCAReduce in [x.op.__class__ for x in f2.maker.fgraph.toposort()]
assert T.CAReduce in [x.op.__class__ for x in f.maker.fgraph.toposort()]
assert _allclose(f2(val2), f(val)), ('shape', shape,
'pattern', pattern,
sum([shape[i] for i in pattern]))
#test with dimshuffle
#we shuffle the 2 outer dims.
for shape, pattern in [#((5,),[0]),
((5,4),[0,1]),((5,4),[0]),
((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[0,1,2]),
((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]:
op = tensor.CAReduce(scalar_op, axis=pattern)
pat = tensor_pattern_to_gpu_pattern(shape, pattern)
#GpuCAReduce{maximum} supports only these patterns
if scalar_op is theano.scalar.maximum and pat not in [
(0, 1), (0, 1, 1), (0, 1, 1, 1)]:
continue
a = tensor.TensorType('float32', (False,) * len(shape))()
dim_pattern = range(len(shape))
dim_pattern[0] = 1
dim_pattern[1] = 0
a = a.dimshuffle(dim_pattern)
b = op(a)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
val = theano._asarray(val, dtype='float32')
f = theano.function([a], b, mode=mode_with_gpu)
f2 = theano.function([a], b, mode=mode_without_gpu)
assert tcn.GpuCAReduce in [x.op.__class__
for x in f.maker.fgraph.toposort()]
assert op.__class__ in [x.op.__class__
for x in f2.maker.fgraph.toposort()]
assert _allclose(f2(val), f(val)), ('shape', shape,
'pattern', pattern,
sum([shape[i] for i in pattern]))
#test with broadcast
for shape, pattern in [((5,),[0]),
((5,4),[0,1]),((5,4),[0]),
((5,4,3),[0]),((5,4,3),[0,1]),
((5,4,3),[2]),((5,4,3),[0,1,2]),
((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]:
op = tensor.CAReduce(scalar_op, axis=pattern)
pat = tensor_pattern_to_gpu_pattern(shape, pattern)
#GpuCAReduce{maximum} supports only these patterns
if scalar_op is theano.scalar.maximum and pat not in [
(0, 1), (0, 1, 1), (0, 1, 1, 1)]:
continue
shape = numpy.asarray(shape) * 2
a = tensor.TensorType('float32', (False,) * len(shape))()
a2 = tcn.CudaNdarrayType((False,) * len(shape))()
b = op(a)
b2 = op(a2)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
val = theano._asarray(val, dtype='float32')
val2 = cuda.CudaNdarray(val)
if len(shape) == 1:
val = val[::2]
val2 = val2[::2]
elif len(shape) == 2:
val = val[::2, ::2]
val2 = val2[::2, ::2]
elif len(shape) == 3:
val = val[::2, ::2, ::2]
val2 = val2[::2, ::2, ::2]
elif len(shape) == 4:
val = val[::2, ::2, ::2, ::2]
val2 = val2[::2, ::2, ::2, ::2]
f = theano.function([a], b, mode=mode_without_gpu)
f2 = theano.function([a2], b2, mode=mode_with_gpu)
assert tcn.GpuCAReduce in [x.op.__class__
for x in f2.maker.fgraph.toposort()]
assert op.__class__ in [x.op.__class__
for x in f.maker.fgraph.toposort()]
assert _allclose(f2(val2), f(val)), ('shape', shape,
'pattern', pattern,
sum([shape[i] for i in pattern]))
def test_flatten():
......