提交 2196f4a4 authored 作者: goodfeli

Merge pull request #945 from nouiz/gpu_reduce_shape0

Gpu reduce shape0
......@@ -684,7 +684,10 @@ class GpuCAReduce(GpuOp):
# \begin bracket the reduction in a check that there is
# actually work to do
print >> sio, """
if (CudaNdarray_SIZE(%(z)s))
if (CudaNdarray_SIZE(%(z)s) && ! CudaNdarray_SIZE(%(x)s)){
cudaMemset(%(z)s->devdata, 0, CudaNdarray_SIZE(%(z)s) * sizeof(float));
}
else if (CudaNdarray_SIZE(%(z)s))
{
""" % locals()
......@@ -1553,13 +1556,13 @@ class GpuCAReduce(GpuOp):
""" % locals()
def c_code_cache_version_apply(self, node):
version = [5] # the version corresponding to the c code in this Op
version = [6] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend...
scalar_node = Apply(self.scalar_op,
[Scalar(dtype=input.type.dtype)() for input in node.inputs],
[Scalar(dtype=output.type.dtype)() for output in node.outputs])
version.extend(self.scalar_op.c_code_cache_version(scalar_node))
version.extend(self.scalar_op.c_code_cache_version())
for i in node.inputs + node.outputs:
version.extend(Scalar(dtype=i.type.dtype).c_code_cache_version())
if all(version):
......
......@@ -56,7 +56,15 @@ def test_sum():
TODO: test with broadcast
"""
for shape, pattern in [((100,3,1300),[1]),
for shape, pattern in [((1,1),(1,)),
((1,0),(1,)),
((0,1),(1,)),
((0,0),(1,)),
((0,0,0),(1,2)),
((0,0,0,0),(1,2,3)),
((2,1),(1,)),
((1,2),(1,)),
((100,3,1300),[1]),
((0,),[0]),((5,),[0]),
((0,0),[0,1]),((1,0),[0,1]),((5,4),[0,1]),((33,31),[0,1]),((5,4),[1]),((5,4),[0]),#need something bigger then 32 for some opt test.
((5,4,3),[0]),((5,4,3),[1]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[1,2]),((5,4,3),[0,1,2]),
......@@ -112,7 +120,7 @@ def test_sum():
assert tcn.GpuCAReduce in [x.op.__class__ for x in f.maker.fgraph.toposort()]
assert T.Sum in [x.op.__class__ for x in f2.maker.fgraph.toposort()]
if val.size == 0:
assert f2(val) == f(val), ('shape', shape, 'pattern', pattern)
assert _allclose(f2(val), f(val)), ('shape', shape, 'pattern', pattern)
else:
try:
#We raise the error threashold as we sum big matrix
......@@ -275,16 +283,6 @@ def test_max():
except ValueError, e:
exc = e
f_caused_value_error = True
except RuntimeError:
if (shape, pattern) in [((1,0),(1,)),
((0,1),(1,)),
((0,0),(1,)),
((0,0,0),(1,2)),
((0,0,0,0),(1,2,3))]:
known_fail = True
continue
else:
raise
f2 = theano.function([a], b, mode=mode_without_gpu)
try:
......@@ -372,7 +370,6 @@ def test_max():
'pattern', pattern,
sum([shape[i] for i in pattern]))
#test with broadcast
for shape, pattern in [((5,),(0,)),
((5,4),(0,1)),
......@@ -417,9 +414,6 @@ def test_max():
'pattern', pattern,
sum([shape[i] for i in pattern]))
if known_fail:
raise KnownFailureTest("GpuCAReduce does not handle some shapes"
" with 0s in them correctly.")
def test_flatten():
x = cuda.fmatrix('x')
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论