add gpu max for pattern (0, 1) and added all gpu max pattern for gpu min.

01b8c32e · Frederic · 9a4181b6 · 01b8c32e · 01b8c32e · 01b8c32e
--- a/theano/sandbox/cuda/basic_ops.py
+++ b/theano/sandbox/cuda/basic_ops.py
@@ -1189,7 +1189,9 @@ class GpuCAReduce(GpuOp):
        self.c_code_reduce_01X(sio, node, name, x, z, fail, 3)
    def c_code_reduce_10(self, sio, node, name, x, z, fail):
-        self._op_guard()
+        if not isinstance(self.scalar_op, (scal.Add, scal.Minimum,
+                                           scal.Maximum)):
+            raise NotImplementedError()
        print >> sio, """
        {
            int verbose = 0;
@@ -1759,9 +1761,12 @@ class GpuCAReduce(GpuOp):
                    }
                }
                """ % locals()
-            elif isinstance(self.scalar_op, scal.Maximum):
+            elif isinstance(self.scalar_op, (scal.Maximum, scal.Minimum)):
                # special cased max code (special case because visits first
                # member of each row twice)
+                reduce_fct = self._assign_reduce(node, nodename, "myresult",
+                                                 "A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0]",
+                                                 {})
                print >> sio, """
                %(decl)s{
                    %(init)s
@@ -1770,8 +1775,7 @@ class GpuCAReduce(GpuOp):
                      %(for_i1)s{
                        %(for_i2)s{
                          %(for_i3)s{
-                            float Ai = A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0];
+                            %(reduce_fct)s;
-                            myresult = max(myresult, Ai);
                          }
                        }
                      }
@@ -1791,14 +1795,24 @@ class GpuCAReduce(GpuOp):
                # code to make sure it does not cause a slowdown
                raise NotImplementedError()
        if self.reduce_mask == (0, 1, 0) or self.reduce_mask == (1, 0):
-            self._op_guard()
+            if not isinstance(self.scalar_op, (scal.Add, scal.Minimum,
+                                               scal.Maximum)):
+                raise NotImplementedError()
            # this kernel uses one block for each column,
            # threads per block for each element per column.
            #TODO: This kernel is pretty inefficient in terms of reading, because if A is
            #      c_contiguous (typical case) then each warp is accessing non-contigous
            #      memory (a segment of a column).
-            reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i2*sZ1]', node, nodename, sub = {})
+            reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i2*sZ1]',
+                                           node, nodename, sub={})
+            reduce_fct = self._assign_reduce(node, nodename, "myresult",
+                                             "A[i0 * sA0 + i1 * sA1 + i2 * sA2]",
+                                             {})
+            if isinstance(self.scalar_op, scal.Add):
+                reduce_init = "0.f;"
+            else:
+                reduce_init = "A[i0 * sA0 + threadIdx.x * sA1 + i2 * sA2];"
            print >> sio, """
            static __global__ void kernel_reduce_010_%(nodename)s(
                    const int d0,
@@ -1822,10 +1836,10 @@ class GpuCAReduce(GpuOp):
                {
                    for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
                    {
-                        float myresult = 0.0f;
+                        float myresult = %(reduce_init)s;
                        for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
                        {
-                            myresult += A[i0 * sA0 + i1 * sA1 + i2 * sA2];
+                            %(reduce_fct)s;
                        }
                        %(reducebuf)s
                    }
@@ -2307,6 +2321,7 @@ class GpuSubtensor(GpuOp, tensor.Subtensor):
            return ()
        return (3, hv)
 class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
    """
    Implement AdvancedSubtensor1 on the gpu.

--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -597,7 +597,7 @@ def local_gpu_careduce(node):
        scalar_op = node.op.scalar_op
        # currently, only these two ops are supported at all,
        # and max does not support all combinations of axes
-        if node.op.scalar_op in [scal.add, scal.maximum]:
+        if node.op.scalar_op in [scal.add, scal.maximum, scal.minimum]:
            x, = node.inputs
            if x.owner and x.owner.op == host_from_gpu:
                if node.op.axis is None:

--- a/theano/sandbox/cuda/tests/test_basic_ops.py
+++ b/theano/sandbox/cuda/tests/test_basic_ops.py
@@ -66,7 +66,8 @@ def test_careduce():
    """
    for scalar_op, careduce_op in [
            (theano.scalar.add, tensor.elemwise.CAReduceDtype),
-            (theano.scalar.maximum, tensor.CAReduce)]:
+            (theano.scalar.maximum, tensor.CAReduce),
+            (theano.scalar.minimum, tensor.CAReduce)]:
        for shape, pattern in [((1,1),(1,)),
                               ((1,0),(1,)),
                               ((0,1),(1,)),
@@ -123,9 +124,10 @@ def test_careduce():
            op = careduce_op(scalar_op, axis=pattern)
            pat = tensor_pattern_to_gpu_pattern(shape, pattern)
-            #GpuCAReduce{maximum} support only those patterns
+            #GpuCAReduce{maximum/minimum} support only those patterns
-            if scalar_op is theano.scalar.maximum and pat not in [
+            if scalar_op in [theano.scalar.maximum,
-                (0, 1), (0, 1, 1), (0, 1, 1)]:
+                             theano.scalar.minimum] and pat not in [
+                                 (0, 1), (0, 1, 1), (0, 1, 1), (1, 0)]:
                continue
            a = tensor.TensorType('float32', (False,) * len(shape))()
@@ -191,10 +193,12 @@ def test_careduce():
                               ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]:
            op = careduce_op(scalar_op, axis=pattern)
            pat = tensor_pattern_to_gpu_pattern(shape, pattern)
-            #GpuCAReduce{maximum} support only those patterns
+            #GpuCAReduce{maximum/minimum} support only those patterns
-            if scalar_op is theano.scalar.maximum and pat not in [
+            if scalar_op in [theano.scalar.maximum,
-                (0, 1), (0, 1, 1), (0, 1, 1)]:
+                             theano.scalar.minimum] and pat not in [
+                                 (0, 1), (0, 1, 1), (0, 1, 1), (1, 0)]:
                continue
            a = tensor.TensorType('float32', (False,) * len(shape))()
            dim_pattern = range(len(shape))
            dim_pattern[0] = 1
@@ -223,10 +227,12 @@ def test_careduce():
                               ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]:
            op = careduce_op(scalar_op, axis=pattern)
            pat = tensor_pattern_to_gpu_pattern(shape, pattern)
-            #GpuCAReduce{maximum} support only those patterns
+            #GpuCAReduce{maximum/minimum} support only those patterns
-            if scalar_op is theano.scalar.maximum and pat not in [
+            if scalar_op in [theano.scalar.maximum,
-                (0, 1), (0, 1, 1), (0, 1, 1)]:
+                             theano.scalar.minimum] and pat not in [
+                                 (0, 1), (0, 1, 1), (0, 1, 1), (1, 0)]:
                continue
            shape = numpy.asarray(shape) * 2
            a = tensor.TensorType('float32', (False,) * len(shape))()
            a2 = tcn.CudaNdarrayType((False,) * len(shape))()