Merge pull request #934 from goodfeli/gpu_max

Gpu max

Merge pull request #934 from goodfeli/gpu_max
91c8fcf3 · nouiz · a5965456 · c3d358c5 · 91c8fcf3 · 91c8fcf3
--- a/doc/extending/cop.txt
+++ b/doc/extending/cop.txt
@@ -33,6 +33,11 @@ There are less methods to define for an Op than for a Type:
      This must return C code that carries the computation we want to do.
+      sub is a dictionary of strings for you to substitute into your code.
+      It's not clear if it ever contains anything other than 'fail'.
+      sub['fail'] is a string of code that you should execute (after calling
+      PyErr_Format) if your C code needs to raise an exception.
    .. method:: c_code_cleanup(node, name, input_names, output_names, sub)
      This must return C code that cleans up whatever c_code allocated and

--- a/theano/sandbox/cuda/__init__.py
+++ b/theano/sandbox/cuda/__init__.py
@@ -270,7 +270,7 @@ if cuda_available:
    import basic_ops
    from basic_ops import (GpuFromHost, HostFromGpu, GpuElemwise,
-                           GpuDimShuffle, GpuSum, GpuReshape, GpuContiguous,
+                           GpuDimShuffle, GpuCAReduce, GpuReshape, GpuContiguous,
                           GpuSubtensor, GpuIncSubtensor,
                           GpuAdvancedSubtensor1, GpuAdvancedIncSubtensor1,
                           GpuFlatten, GpuShape, GpuAlloc,

--- a/theano/sandbox/cuda/basic_ops.py
+++ b/theano/sandbox/cuda/basic_ops.py
--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -582,9 +582,12 @@ def local_gpu_gemm(node):
 @register_opt()
 @local_optimizer([])
-def local_gpu_sum(node):
+def local_gpu_careduce(node):
    if isinstance(node.op, tensor.elemwise.CAReduce):
-        if node.op.scalar_op == scal.add:
+        scalar_op = node.op.scalar_op
+        # currently, only these two ops are supported at all,
+        # and max does not support all combinations of axes
+        if node.op.scalar_op in [scal.add, scal.maximum]:
            x, = node.inputs
            if x.owner and x.owner.op == host_from_gpu:
                if node.op.axis is None:
@@ -594,22 +597,21 @@ def local_gpu_sum(node):
                    for a in node.op.axis:
                        assert reduce_mask[a] == 0
                        reduce_mask[a] = 1
-                gsum = GpuSum(reduce_mask)
+                greduce = GpuCAReduce(reduce_mask, scalar_op)
-                pattern = (''.join(str(i) for i in reduce_mask))
+                if greduce.supports_c_code([gpu_from_host(x)]):
-                if hasattr(gsum, 'c_code_reduce_%s' % pattern):
+                    rval = host_from_gpu(greduce(gpu_from_host(x)))
-                    rval = host_from_gpu(gsum(gpu_from_host(x)))
                    if rval.type == node.outputs[0].type:
                        return [rval]
                    else:
                        print >> sys.stderr, \
-                                "WARNING: local_gpu_sum got type wrong"
+                                "WARNING: local_gpu_careduce got type wrong"
                        return None
                else:
                    # Try to make a simpler pattern based on reshaping
                    # The principle is that if two adjacent dimensions have
                    # the same value in the reduce_mask, then we can reshape
-                    # to make them a single dimension, do the sum, and then
+                    # to make them a single dimension, do the reduction, and then
                    # reshape to get them back.
                    shape_of = node.fgraph.shape_feature.shape_of
@@ -625,27 +627,28 @@ def local_gpu_sum(node):
                            new_mask.append(reduce_mask[i])
                            new_in_shp.append(x_shape[i])
-                    pattern = (''.join(str(i) for i in new_mask))
+                    new_greduce = GpuCAReduce(new_mask, scalar_op)
-                    new_gsum = GpuSum(new_mask)
+                    reshaped_x = x.reshape(tensor.stack(*new_in_shp))
-                    if hasattr(new_gsum, 'c_code_reduce_%s' % pattern):
+                    gpu_reshaped_x = gpu_from_host(reshaped_x)
-                        reshaped_x = x.reshape(tensor.stack(*new_in_shp))
+                    reshaped_gpu_inputs = [ gpu_reshaped_x ]
-                        sum_reshaped_x = host_from_gpu(
+                    if new_greduce.supports_c_code(reshaped_gpu_inputs):
-                            new_gsum(gpu_from_host(reshaped_x)))
+                        reduce_reshaped_x = host_from_gpu(
+                            new_greduce(gpu_reshaped_x))
-                        if sum_reshaped_x.ndim != node.outputs[0].ndim:
+                        if reduce_reshaped_x.ndim != node.outputs[0].ndim:
-                            unreshaped_sum = sum_reshaped_x.reshape(
+                            unreshaped_reduce = reduce_reshaped_x.reshape(
                                tensor.stack(*shape_of[node.outputs[0]]))
                        else:
-                            unreshaped_sum = sum_reshaped_x
+                            unreshaped_reduce = reduce_reshaped_x
-                        if unreshaped_sum.type == node.outputs[0].type:
+                        if unreshaped_reduce.type == node.outputs[0].type:
-                            return [unreshaped_sum]
+                            return [unreshaped_reduce]
                        else:
                            print >> sys.stderr, \
-                                    "WARNING: local_gpu_sum got type wrong"
+                                    "WARNING: local_gpu_careduce got type wrong"
                            return None
                        raise Exception(
-                            "GpuSum don't have implemented the pattern",
+                                "GpuCAReduce does not yet implement this pattern:",
                            pattern)
    return False

--- a/theano/sandbox/cuda/tests/test_basic_ops.py
+++ b/theano/sandbox/cuda/tests/test_basic_ops.py
--- a/theano/sandbox/cuda/tests/test_driver.py
+++ b/theano/sandbox/cuda/tests/test_driver.py
@@ -28,7 +28,10 @@ def test_nvidia_driver1():
                        profile=False)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 2
-    assert sum(isinstance(node.op, B.GpuSum) for node in topo) == 1
+    if sum(isinstance(node.op, B.GpuCAReduce) for node in topo) != 1:
+        msg = '\n\t'.join(['Expected exactly one occurrence of GpuCAReduce ' +
+            'but got:']+[str(app) for app in topo])
+        raise AssertionError(msg)
    if not numpy.allclose(f(), a.sum()):
        raise Exception("The nvidia driver version installed with this OS "
                        "does not give good results for reduction."

--- a/theano/sandbox/cuda/tests/test_opt.py
+++ b/theano/sandbox/cuda/tests/test_opt.py
@@ -44,11 +44,11 @@ def test_int_pow():
    f = theano.function([a], (a*4).sum(), mode=mode_with_gpu)
    op_names = [n.op.__class__.__name__ for n in f.maker.fgraph.toposort()]
-    assert op_names == ['GpuSum', 'GpuElemwise', 'HostFromGpu']
+    assert op_names == ['GpuCAReduce', 'GpuElemwise', 'HostFromGpu']
    f = theano.function([a], tensor.pow(a,4).sum(), mode=mode_with_gpu)
    op_names = [n.op.__class__.__name__ for n in f.maker.fgraph.toposort()]
-    assert op_names == ['GpuElemwise', 'GpuSum', 'HostFromGpu']
+    assert op_names == ['GpuElemwise', 'GpuCAReduce', 'HostFromGpu']
    #theano.printing.debugprint(f)

--- a/theano/scalar/basic.py
+++ b/theano/scalar/basic.py
@@ -1159,6 +1159,7 @@ class Maximum(BinaryScalarOp):
        gx = eq(output, x) * gz
        gy = eq(output, y) * gz
        return (gx, gy)
 maximum = Maximum(upcast_out, name='maximum')
@@ -1187,7 +1188,6 @@ class Minimum(BinaryScalarOp):
        gx = eq(output, x) * gz
        gy = eq(output, y) * gz
        return (gx, gy)
 minimum = Minimum(upcast_out, name='minimum')
@@ -1222,6 +1222,8 @@ class Add(ScalarOp):
            for i in inputs:
                    retval += [gz]
        return retval
 add = Add(upcast_out, name='add')

--- a/theano/tensor/elemwise.py
+++ b/theano/tensor/elemwise.py
@@ -1082,14 +1082,16 @@ class Elemwise(Op):
 class CAReduce(Op):
    """
+    CAReduce = Commutative Associative Reduce
    Reduces a scalar operation along the specified axis(es).
+    (The scalar op should be both commutative and associative)
    The output will have the same shape as the input minus the reduced
    dimensions. It will contain the variable of accumulating all values
    over the reduced dimensions using the specified scalar op.
    Examples:
-     CAReduce(add) -> sum
+     CAReduce(add) -> sum (i.e., acts like the numpy sum operation)
     CAReduce(mul) -> product
     CAReduce(maximum) -> max
     CAReduce(minimum) -> min