Commit 91c8fcf3 authored by nouiz

Merge pull request #934 from goodfeli/gpu_max

Gpu max
......@@ -33,6 +33,11 @@ There are less methods to define for an Op than for a Type:
This must return C code that carries the computation we want to do.
sub is a dictionary of strings for you to substitute into your code.
It's not clear if it ever contains anything other than 'fail'.
sub['fail'] is a string of code that you should execute (after calling
PyErr_Format) if your C code needs to raise an exception.
.. method:: c_code_cleanup(node, name, input_names, output_names, sub)
This must return C code that cleans up whatever c_code allocated and
......
......@@ -270,7 +270,7 @@ if cuda_available:
import basic_ops
from basic_ops import (GpuFromHost, HostFromGpu, GpuElemwise,
GpuDimShuffle, GpuSum, GpuReshape, GpuContiguous,
GpuDimShuffle, GpuCAReduce, GpuReshape, GpuContiguous,
GpuSubtensor, GpuIncSubtensor,
GpuAdvancedSubtensor1, GpuAdvancedIncSubtensor1,
GpuFlatten, GpuShape, GpuAlloc,
......
......@@ -8,6 +8,8 @@ import numpy
import theano
from theano import Op, Type, Apply, Variable, Constant
from theano import tensor, scalar, config
from theano.scalar import Scalar
scal = scalar # somewhere scalar gets reassigned to be a function
from theano.gof.python25 import all, any
......@@ -484,15 +486,15 @@ class GpuDimShuffle(GpuOp):
return (1, 0)
class GpuSum(GpuOp):
"""GpuSum is a Reduction along some dimensions by summation.
class GpuCAReduce(GpuOp):
"""GpuCAReduce is a Reduction along some dimensions by a scalar op.
The dimensions along which to sum is specified by the
The dimensions along which to reduce is specified by the
`reduce_mask` that you pass to the constructor. The `reduce_mask`
is a tuple of booleans (actually integers 0 or 1) that specify for
each input dimension, whether to reduce it (1) or not (0).
For example:
For example, when scalar_op is a theano.scalar.basic.Add instance:
- reduce_mask == (1,) sums a vector to a scalar
......@@ -505,19 +507,39 @@ class GpuSum(GpuOp):
:note: any reduce_mask of all zeros is a sort of 'copy', and may
be removed during graph optimization
This Op is a work in progress.
This op was recently upgraded from just GpuSum to a general CAReduce. Not
many code cases are supported for scalar_op being anything other than
scal.Add instances yet.
Important note: if you implement new cases for this op, be sure to
benchmark them and make sure that they actually result in a speedup.
GPUs are not especially well-suited to reduction operations so it is
quite possible that the GPU might be slower for some cases.
"""
def __init__(self, reduce_mask):
def __init__(self, reduce_mask, scalar_op):
self.reduce_mask = tuple(reduce_mask)
self.scalar_op = scalar_op
# used to make sure that calls to scalar op
# have unique name arguments
self._n_scalar_op_calls = 0
def __eq__(self, other):
return (type(self) == type(other) and
self.reduce_mask == other.reduce_mask)
self.reduce_mask == other.reduce_mask and
self.scalar_op == other.scalar_op)
def __hash__(self):
return hash(type(self)) ^ hash(self.reduce_mask)
return hash(type(self)) ^ hash(self.reduce_mask) ^ hash(type(self.scalar_op))
def __str__(self):
return "GpuSum{%s}" % ','.join(str(i) for i in self.reduce_mask)
return "GpuCAReduce{%s}{%s}" % (
str(self.scalar_op),
','.join(str(i) for i in self.reduce_mask)
)
def make_node(self, x):
if (x.type.ndim != len(self.reduce_mask)):
......@@ -526,10 +548,55 @@ class GpuSum(GpuOp):
in xrange(x.type.ndim) if not self.reduce_mask[i]]
return Apply(self, [x], [CudaNdarrayType(o_broadcast)()])
"""
This method must be commented, because there's no way
to communicate that it's OK to call for + but not for
max
def perform(self, node, inp, out):
x, = inp
z, = out
self._op_guard()
# reduce_max is declared but does nothing but
# raise NotImplementedError.
# We can't call it here anyway because it hasn't
# been added to the python bindings yet
z[0] = x.reduce_sum(self.reduce_mask)
"""
def supports_c_code(self, inputs):
    """Return True if this op and reduce pattern have working C code.

    inputs: the list of variables that would be passed to make_node.
    """
    # Cheap check first: if there is no c_code_reduce_<pattern> method
    # at all, C code is certainly unsupported.  (This mirrors the test
    # that used to be implemented by local_gpu_sum.)
    mask_pattern = ''.join(str(bit) for bit in self.reduce_mask)
    if not hasattr(self, 'c_code_reduce_%s' % mask_pattern):
        return False

    # Now that this is a general reduction op, a pattern method may
    # exist yet still be unimplemented for the current scalar op.  To
    # detect that, build placeholder arguments, attempt to generate the
    # C code, and treat NotImplementedError as "unsupported".
    fake_node = self.make_node(*inputs)
    fake_name = 'fake_name'
    fake_inp = ['fake_input_name_%d' % i for i in xrange(len(inputs))]
    fake_out = ['fake_output_name_%d' % i
                for i in xrange(len(fake_node.outputs))]
    fake_sub = {'fail': 'fake failure code'}
    try:
        self.c_code(fake_node, fake_name, fake_inp, fake_out, fake_sub)
        self.c_support_code_apply(fake_node, fake_name)
    except NotImplementedError:
        return False
    return True
def c_code(self, node, name, inp, out, sub):
x, = inp
......@@ -553,6 +620,24 @@ class GpuSum(GpuOp):
}
""" % locals()
# It might be nice to use a property of the op class to do this,
# but tensor.elemwise.CAReduce has this exact same check so I guess
# this is OK to do
if self.scalar_op in [scal.minimum, scal.maximum]:
for i in xrange(nd_in):
conds = []
if self.reduce_mask[i]:
conds.append("(CudaNdarray_HOST_DIMS(%(x)s)[%(i)s] == 0)" % locals())
assert len(conds) > 0
cond = "(" + " || ".join(conds) + ")"
print >> sio, """
if %(cond)s
{
PyErr_Format(PyExc_ValueError," tried to reduce a 0-length axis.");
%(fail)s;
}
""" %locals()
#
# alloc an output if we need one
#
......@@ -608,7 +693,7 @@ class GpuSum(GpuOp):
#
if all(i == 1 for i in self.reduce_mask):
#check if the tensor is ccontiguous, if true, use the c_c0de_reduce_ccontig code.
#check if the tensor is ccontiguous, if true, use the c_code_reduce_ccontig code.
#TODO: check if we are ccontiguous when we un-dimshuffle
#TODO: if only some dims are ccontiguous, call version with less dims.
print >> sio, 'if(CudaNdarray_is_c_contiguous(%(x)s)){'%locals()
......@@ -634,9 +719,9 @@ class GpuSum(GpuOp):
.. code-block:: c
if (verbose)
printf("running kernel_reduce_sum_10_%(name)s\\n");
printf("running kernel_reduce_10_%(name)s\\n");
int n_shared = sizeof(float) * n_threads.x;
kernel_reduce_sum_10_%(name)s<<<n_blocks, n_threads,
kernel_reduce_10_%(name)s<<<n_blocks, n_threads,
n_shared>>>(
CudaNdarray_HOST_DIMS(%(x)s)[0],
CudaNdarray_HOST_DIMS(%(x)s)[1],
......@@ -660,7 +745,7 @@ class GpuSum(GpuOp):
nd_out = ndim - sum(self.reduce_mask)
print >> sio, """
if (verbose)
printf("running kernel_reduce_sum_%(pattern)s_%(name)s\\n");
printf("running kernel_reduce_%(pattern)s_%(name)s\\n");
int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z;
if (verbose>1)
printf("n_threads.x=%%d, n_threads.y=%%d, n_threads.z=%%d,"
......@@ -670,7 +755,7 @@ class GpuSum(GpuOp):
n_threads.x*n_threads.y*n_threads.z,
n_blocks.x,n_blocks.y,
n_blocks.x*n_blocks.y, n_shared);
kernel_reduce_sum_%(pattern)s_%(name)s<<<n_blocks, n_threads, n_shared>>>(
kernel_reduce_%(pattern)s_%(name)s<<<n_blocks, n_threads, n_shared>>>(
""" % locals()
for i in xrange(ndim):
print >> sio, """
......@@ -699,7 +784,7 @@ class GpuSum(GpuOp):
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s."
" (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_sum_%(pattern)s_%(name)s",
"kernel_reduce_%(pattern)s_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
......@@ -715,9 +800,11 @@ class GpuSum(GpuOp):
ndim=None, reduce_mask=None):
"""Return a string to declare a kernel function
The result will look something like this:
.. code-block:: c
static __global__ void kernel_reduce_sum_110_%(nodename)s(
static __global__ void kernel_reduce_110_%(nodename)s(
const int d0,
const int d1,
const int d2,
......@@ -728,7 +815,10 @@ class GpuSum(GpuOp):
float * Z,
const int sZ0)
""" % locals()
Since the nodename is unique, we don't need to put the name
of the scalar_op in here.
"""
if reduce_mask is None:
reduce_mask = self.reduce_mask
if ndim is None:
......@@ -738,7 +828,7 @@ class GpuSum(GpuOp):
sio = StringIO.StringIO()
print >> sio, """
static __global__ void kernel_reduce_sum_%(pattern)s_%(nodename)s(
static __global__ void kernel_reduce_%(pattern)s_%(nodename)s(
""" % locals()
for i in xrange(ndim):
print >> sio, """
......@@ -764,39 +854,77 @@ class GpuSum(GpuOp):
def _k_init(self, *args):
return """
const int threadCount = blockDim.x * blockDim.y * blockDim.z;
const int threadNum = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
const int threadNum = threadIdx.z * blockDim.x * blockDim.y
+ threadIdx.y * blockDim.x + threadIdx.x;
extern __shared__ float buf[];
float mysum = 0.0f;
float myresult = 0.0f;
if (warpSize != 32)
{
//TODO: set error code
// TODO: set error code
// 2012-09-20 IG: as of today, Fred says he will check
// this elsewhere, in a different PR
Z[0] = -666;
return;
}
"""
def _k_reduce_buf(self, z_pos):
# Work with all nvidia driver
# But only for power or multiple of 2!
def _assign_reduce(self, node, name, left, right, sub):
    """Return C code that reduces `right` into `left` using scalar_op.

    node: the node argument to this op's c_code
    name: the name argument to this op's c_code
    left: a C code string identifying an lvalue
    right: a C code string identifying an expression
    sub: the sub argument to this op's c_code

    returns C code to reduce left and right, assigning the
    result to left.
    """
    x, = node.inputs
    dtype = x.dtype
    # Build a dummy scalar Apply so that scalar_op.c_code sees inputs
    # of the correct scalar dtype rather than the reduction's own
    # CudaNdarray node.
    dummy_left = scal.Scalar(dtype=dtype)()
    dummy_right = scal.Scalar(dtype=dtype)()
    dummy_node = self.scalar_op.make_node(dummy_left, dummy_right)
    # Unique name per call so that repeated expansions inside one
    # kernel do not collide on C identifiers emitted by the scalar op.
    dummy_name = name + '_scalar_op' + str(self._n_scalar_op_calls)
    self._n_scalar_op_calls += 1
    # Bug fix: previously this passed the reduction's own `node` and
    # `name`, leaving dummy_node/dummy_name unused and handing the
    # scalar op an Apply with the wrong (non-scalar) input types.
    return self.scalar_op.c_code(dummy_node, dummy_name,
                                 (left, right), (left,), sub)
def _k_reduce_buf(self, z_pos, node, name, sub):
"""
WRITEME
node, name, sub: these should be passed through from the original
call to c_code
"""
# This code (the code in new_version) is currently ignored.
# Code produced later in this function is returned instead.
# The code here works with all nvidia driver
# But only for powers or multiples of 2!
new_version = """
__syncthreads(); // some kernel do multiple reduction.
buf[threadNum] = mysum;
buf[threadNum] = myresult;
__syncthreads();
if (threadNum >= ((threadCount >> 1) * 2))
{
int idx = threadNum - (threadCount >> 1) * 2;
buf[idx] += buf[threadNum];
// buf[0] = 998;
} else {
// buf[threadNum] = 0;-999;
int idx = threadNum - (threadCount >> 1) * 2;"""
new_version += self._assign_reduce(node, name, 'buf[idx]','buf[threadNum]', sub)
new_version += """
}
__syncthreads();
//Work for power of 2 only.
// Works for power of 2 only.
int nTotalThreads = threadCount; // Total number of active threads
while(nTotalThreads > 1)
{
......@@ -807,8 +935,11 @@ class GpuSum(GpuOp):
{
// Get the shared value stored by another thread
float temp = buf[threadNum + halfPoint];
"""
new_version += self._assign_reduce(node, name, 'buf[threadNum]', 'temp', sub)
buf[threadNum] += temp;
new_version += """
}
__syncthreads();
......@@ -820,12 +951,13 @@ class GpuSum(GpuOp):
{
%(z_pos)s = buf[0];
}
__syncthreads();
__syncthreads();"""
""" % locals()
return """
new_version = new_version % locals()
current_version = """
__syncthreads(); // some kernel do multiple reduction.
buf[threadNum] = mysum;
buf[threadNum] = myresult;
__syncthreads();
// rest of function is handled by one warp
......@@ -834,19 +966,19 @@ class GpuSum(GpuOp):
//round up all the partial sums into the first `warpSize` elements
for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
{
mysum += buf[i];
"""
current_version += self._assign_reduce(node, name, 'myresult', 'buf[i]', sub) + """
}
buf[threadNum] = mysum;
/*Comment this optimization as it don't work on Fermi GPU.
TODO: find why it don't work or put the GPU compute capability into the version
buf[threadNum] = myresult;
/*Comment this optimization as it don't work on Fermi GPU.
TODO: find why it don't work or put the GPU compute capability into the version
// no sync because only one warp is running
if(threadCount >32)
{
buf[threadNum] += buf[threadNum+16];
buf[threadNum] += buf[threadNum+8];
buf[threadNum] += buf[threadNum+4];
buf[threadNum] += buf[threadNum+2];
buf[threadNum] += buf[threadNum+1];
{"""
for num in [16,8,4,2,1]:
current_version += self._assign_reduce(node, name, 'buf[threadNum]',
'buf[threadNum+%d]' % num, sub)
current_version += """
if (threadNum == 0)
{
%(z_pos)s = buf[0];
......@@ -856,26 +988,32 @@ class GpuSum(GpuOp):
else */
if (threadNum < 16)
{
//reduce so that threadNum 0 has the sum of everything
if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16];
if(threadNum + 8 < threadCount) buf[threadNum] += buf[threadNum+8];
if(threadNum + 4 < threadCount) buf[threadNum] += buf[threadNum+4];
if(threadNum + 2 < threadCount) buf[threadNum] += buf[threadNum+2];
if(threadNum + 1 < threadCount) buf[threadNum] += buf[threadNum+1];
//reduce so that threadNum 0 has the reduction of everything
"""
for num in [16,8,4,2,1]:
this_if = "if (threadNum + %d < threadCount) " % num + \
self._assign_reduce(node, name, 'buf[threadNum]','buf[threadNum+%d]' % num, sub)
current_version += this_if
current_version += """
if (threadNum == 0)
{
%(z_pos)s = buf[0];
}
}
}
""" % locals()
"""
current_version = current_version % locals()
return current_version
#Threads must be organized as: threadNum%nb_reduce correspond to the same sum
#nb_reduce<=warpSize
def _k_reduce_buf_multiple(self, z_pos, nb_reduce):
self._op_guard()
return """
__syncthreads(); // some kernel do multiple reduction.
buf[threadNum] = mysum;
buf[threadNum] = myresult;
__syncthreads();
// rest of function is handled by one warp
......@@ -884,13 +1022,20 @@ class GpuSum(GpuOp):
//round up all the partial sums into the first `nb_reduce` elements
for (int i = threadNum + %(nb_reduce)s; i < threadCount; i += %(nb_reduce)s)
{
mysum += buf[i];
myresult += buf[i];
}
%(z_pos)s = mysum;
%(z_pos)s = myresult;
}
""" % locals()
def c_code_reduce_ccontig(self, sio, node, name, x, z, fail):
"""
WRITEME
IG: I believe, based on how this is called in c_code, that it
is for the case where we are reducing on all axes and x is
C contiguous.
"""
self._op_guard()
print >> sio, """
{
if(CudaNdarray_SIZE(%(x)s)==0){
......@@ -901,11 +1046,11 @@ class GpuSum(GpuOp):
std::min(CudaNdarray_SIZE(%(x)s),
NUM_VECTOR_OP_THREADS_PER_BLOCK));
dim3 n_blocks(1);
if (verbose) printf("running kernel_reduce_sum_ccontig_%(name)s"
if (verbose) printf("running kernel_reduce_ccontig_%(name)s"
" n_threads.x=%%d, size=%%d, ndim=%%d\\n",
n_threads.x,CudaNdarray_SIZE(%(x)s),%(x)s->nd);
int n_shared = sizeof(float) * n_threads.x;
kernel_reduce_sum_ccontig_%(name)s<<<n_blocks, n_threads, n_shared>>>(
kernel_reduce_ccontig_%(name)s<<<n_blocks, n_threads, n_shared>>>(
CudaNdarray_SIZE(%(x)s),
CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_DEV_DATA(%(z)s));
......@@ -916,7 +1061,7 @@ class GpuSum(GpuOp):
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s."
" (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_sum_ccontig_%(name)s",
"kernel_reduce_ccontig_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
......@@ -930,6 +1075,7 @@ class GpuSum(GpuOp):
""" % locals()
def c_code_reduce_1(self, sio, node, name, x, z, fail):
self._op_guard()
makecall = self._makecall(node, name, x, z, fail)
print >> sio, """
{
......@@ -943,6 +1089,7 @@ class GpuSum(GpuOp):
""" % locals()
def c_code_reduce_11(self, sio, node, name, x, z, fail):
self._op_guard()
makecall = self._makecall(node, name, x, z, fail)
print >> sio, """
{
......@@ -965,6 +1112,7 @@ class GpuSum(GpuOp):
:param N: the number of 1 in the pattern N=1 -> 01, N=2 -> 011 N=3 ->0111
Work for N=1,2,3
"""
assert N in [1, 2, 3]
makecall = self._makecall(node, name, x, z, fail)
N_pattern = ''.join(['1'] * N)
......@@ -972,6 +1120,7 @@ class GpuSum(GpuOp):
for i in xrange(N + 1)])
strides_dim = ",".join(["CudaNdarray_HOST_STRIDES(%(x)s)[%(i)s]"
% locals() for i in xrange(N + 1)])
threads_y = """
//get as many y threads as we can fit
while (n_threads.x * (n_threads.y+1) <= NUM_VECTOR_OP_THREADS_PER_BLOCK)
......@@ -980,8 +1129,8 @@ class GpuSum(GpuOp):
n_threads.y += 1;
else
break;
}
""" % locals()
}""" % locals()
threads_z = """
//get as many z threads as we can fit
while (n_threads.x * n_threads.y * (n_threads.z+1) <= NUM_VECTOR_OP_THREADS_PER_BLOCK)
......@@ -990,13 +1139,15 @@ class GpuSum(GpuOp):
n_threads.z += 1;
else
break;
}
""" % locals()
}""" % locals()
if len(self.reduce_mask) == 2:
threads_y = ''
threads_z = ''
if len(self.reduce_mask) == 3:
threads_z = ''
print >> sio, """
{
int verbose = 0;
......@@ -1021,6 +1172,7 @@ class GpuSum(GpuOp):
self.c_code_reduce_01X(sio, node, name, x, z, fail, 3)
def c_code_reduce_10(self, sio, node, name, x, z, fail):
self._op_guard()
print >> sio, """
{
int verbose = 0;
......@@ -1032,13 +1184,13 @@ class GpuSum(GpuOp):
NUM_VECTOR_OP_BLOCKS));
if (verbose) {
fprintf(stderr,
"running kernel_reduce_sum_10_%(name)s n_blocks=(%%i,%%i)\\n",
"running kernel_reduce_10_%(name)s n_blocks=(%%i,%%i)\\n",
n_blocks.x,
n_blocks.y);
}
assert( CudaNdarray_HOST_DIMS(%(x)s)[1] == CudaNdarray_HOST_DIMS(%(z)s)[0]);
int n_shared = sizeof(float) * n_threads.x;
kernel_reduce_sum_010_%(name)s<<<n_blocks, n_threads, n_shared>>>(
kernel_reduce_010_%(name)s<<<n_blocks, n_threads, n_shared>>>(
1,
CudaNdarray_HOST_DIMS(%(x)s)[0],
CudaNdarray_HOST_DIMS(%(x)s)[1],
......@@ -1057,7 +1209,7 @@ class GpuSum(GpuOp):
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s."
" (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_sum_010_%(name)s",
"kernel_reduce_010_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
......@@ -1070,6 +1222,7 @@ class GpuSum(GpuOp):
""" % locals()
def c_code_reduce_010(self, sio, node, name, x, z, fail):
self._op_guard()
makecall = self._makecall(node, name, x, z, fail)
makecall_inner = self._makecall(node, name, x, z, fail,
pattern="010_inner")
......@@ -1108,7 +1261,7 @@ class GpuSum(GpuOp):
if (n_blocks.x > NUM_VECTOR_OP_BLOCKS) n_blocks.x = NUM_VECTOR_OP_BLOCKS;
if (n_blocks.x*n_blocks.y > NUM_VECTOR_OP_BLOCKS) n_blocks.y = NUM_VECTOR_OP_BLOCKS/n_blocks.x;
int n_shared = 0;
kernel_reduce_sum_010_AD_%(name)s<<<n_blocks, n_threads, n_shared>>>(
kernel_reduce_010_AD_%(name)s<<<n_blocks, n_threads, n_shared>>>(
A,B,C,D,
CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_HOST_STRIDES(%(x)s)[0],
......@@ -1125,7 +1278,7 @@ class GpuSum(GpuOp):
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s."
" (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_sum_010_%(name)s",
"kernel_reduce_010_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
......@@ -1178,7 +1331,7 @@ class GpuSum(GpuOp):
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_sum_%(pattern)s_%(name)s",
"kernel_reduce_%(pattern)s_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
......@@ -1192,6 +1345,7 @@ class GpuSum(GpuOp):
""" % locals()
def c_code_reduce_0101(self, sio, node, name, x, z, fail):
self._op_guard()
makecall = self._makecall(node, name, x, z, fail)
print >> sio, """
{
......@@ -1211,6 +1365,7 @@ class GpuSum(GpuOp):
""" % locals()
def c_code_reduce_100(self, sio, node, name, x, z, fail):
self._op_guard()
makecall = self._makecall(node, name, x, z, fail)
# use threadIdx.x for i0
# use blockIdx.x for i1
......@@ -1231,6 +1386,7 @@ class GpuSum(GpuOp):
""" % locals()
def c_code_reduce_110(self, sio, node, name, x, z, fail):
self._op_guard()
makecall = self._makecall(node, name, x, z, fail)
print >> sio, """
{
......@@ -1252,6 +1408,7 @@ class GpuSum(GpuOp):
""" % locals()
def c_code_reduce_001(self, sio, node, name, x, z, fail):
self._op_guard()
makecall = self._makecall(node, name, x, z, fail)
print >> sio, """
{
......@@ -1274,6 +1431,7 @@ class GpuSum(GpuOp):
""" % locals()
def c_code_reduce_111(self, sio, node, name, x, z, fail):
self._op_guard()
makecall = self._makecall(node, name, x, z, fail)
print >> sio, """
{
......@@ -1306,6 +1464,7 @@ class GpuSum(GpuOp):
""" % locals()
def c_code_reduce_0011(self, sio, node, name, x, z, fail):
self._op_guard()
makecall = self._makecall(node, name, x, z, fail)
print >> sio, """
{
......@@ -1336,6 +1495,7 @@ class GpuSum(GpuOp):
""" % locals()
def c_code_reduce_1111(self, sio, node, name, x, z, fail):
self._op_guard()
makecall = self._makecall(node, name, x, z, fail)
print >> sio, """
{
......@@ -1368,6 +1528,7 @@ class GpuSum(GpuOp):
""" % locals()
def c_code_reduce_1011(self, sio, node, name, x, z, fail):
self._op_guard()
makecall = self._makecall(node, name, x, z, fail)
print >> sio, """
{
......@@ -1391,18 +1552,36 @@ class GpuSum(GpuOp):
}
""" % locals()
def c_code_cache_version(self):
return (22,)
def c_code_cache_version_apply(self, node):
    """Return a per-apply cache version tuple for the generated C code.

    The version combines this Op's own C code version with the versions
    of the scalar op and of the Scalar types of every input and output,
    so that a change in any dependency invalidates the cached module.
    Returns () (meaning "never cache") if any component reports an
    empty/falsy version.
    """
    version = [5]  # the version corresponding to the c code in this Op

    # now we insert versions for the ops on which we depend...
    scalar_node = Apply(self.scalar_op,
        [Scalar(dtype=input.type.dtype)() for input in node.inputs],
        [Scalar(dtype=output.type.dtype)() for output in node.outputs])
    # Bug fix: c_code_cache_version() takes no node argument; the
    # per-apply variant c_code_cache_version_apply(node) is the method
    # that accepts the dummy scalar Apply built above.
    version.extend(self.scalar_op.c_code_cache_version_apply(scalar_node))

    for i in node.inputs + node.outputs:
        version.extend(Scalar(dtype=i.type.dtype).c_code_cache_version())

    if all(version):
        return tuple(version)
    else:
        return ()
def _op_guard(self):
    """Raise NotImplementedError unless the scalar op is Add.

    Kernels that are still sum-specific call this guard; the resulting
    NotImplementedError is caught by supports_c_code, which then
    reports the pattern/op combination as unsupported.
    """
    op_is_add = isinstance(self.scalar_op, theano.scalar.basic.Add)
    if not op_is_add:
        raise NotImplementedError()
def c_support_code_apply(self, node, nodename):
sio = StringIO.StringIO()
nd_in = len(self.reduce_mask)
if all(i == 1 for i in self.reduce_mask):
self._op_guard()
#this kernel is ok for up to a few thousand elements, but
# it only runs on ONE multiprocessor
reducebuf = self._k_reduce_buf('Z[0]')
reducebuf = self._k_reduce_buf('Z[0]', node, nodename, sub = {})
print >> sio, """
static __global__ void kernel_reduce_sum_ccontig_%(nodename)s(
static __global__ void kernel_reduce_ccontig_%(nodename)s(
const unsigned int d0,
const float *A,
float * Z)
......@@ -1410,7 +1589,7 @@ class GpuSum(GpuOp):
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
extern __shared__ float buf[];
float mysum = 0.0f;
float myresult = 0.0f;
if (warpSize != 32)
{
......@@ -1419,17 +1598,18 @@ class GpuSum(GpuOp):
for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
{
mysum += A[i0];
myresult += A[i0];
}
%(reducebuf)s
}
""" % locals()
if self.reduce_mask == (1,):
self._op_guard()
#this kernel is ok for up to a few thousand elements, but
# it only runs on ONE multiprocessor
reducebuf = self._k_reduce_buf('Z[0]')
reducebuf = self._k_reduce_buf('Z[0]', node, nodename, sub = {})
print >> sio, """
static __global__ void kernel_reduce_sum_1_%(nodename)s(
static __global__ void kernel_reduce_1_%(nodename)s(
const unsigned int d0,
const float *A, const int sA0,
float * Z)
......@@ -1437,7 +1617,7 @@ class GpuSum(GpuOp):
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
extern __shared__ float buf[];
float mysum = 0.0f;
float myresult = 0.0f;
if (warpSize != 32)
{
......@@ -1447,17 +1627,18 @@ class GpuSum(GpuOp):
for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
{
float Ai = A[i0 * sA0];
mysum += Ai;
myresult += Ai;
}
%(reducebuf)s
}
""" % locals()
if self.reduce_mask == (1, 1):
self._op_guard()
#this kernel is ok for up to a few thousand elements, but
# it only runs on ONE multiprocessor
reducebuf = self._k_reduce_buf('Z[0]')
reducebuf = self._k_reduce_buf('Z[0]', node, nodename, sub = {})
print >> sio, """
static __global__ void kernel_reduce_sum_11_%(nodename)s(
static __global__ void kernel_reduce_11_%(nodename)s(
const int d0,
const int d1,
const float *A, const int sA0, const int sA1,
......@@ -1466,7 +1647,7 @@ class GpuSum(GpuOp):
const int threadCount = blockDim.x * blockDim.y;
const int threadNum = threadIdx.y*blockDim.x + threadIdx.x;
extern __shared__ float buf[];
float mysum = 0.0f;
float myresult = 0.0f;
if (warpSize != 32)
{
......@@ -1478,7 +1659,7 @@ class GpuSum(GpuOp):
for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
{
float Ai = A[i0 * sA0 + i1 * sA1];
mysum += Ai;
myresult += Ai;
}
}
%(reducebuf)s
......@@ -1492,53 +1673,117 @@ class GpuSum(GpuOp):
# threads per block for each element per row.
N_pattern = ''.join(['1'] * (nd_in - 1))
# TODO: is it faster to hardcode sA3, etc. in the later code, rather
# than have the for_* variables declare them and the later code use
# their names?
if nd_in == 2:
for_i1 = "for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)"
first_i1 = 'threadIdx.x'
sA1 = 'sA1'
for_i2 = "int i2=0, sA2=0;"
sA2 = '0'
first_i2 = '0'
for_i3 = "int i3=0, sA3=0;"
sA3 = '0'
first_i3 = '0'
if nd_in == 3:
for_i1 = "for (int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y)"
first_i1 = 'threadIdx.y'
sA1 = 'sA1'
for_i2 = "for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x)"
first_i2 = 'threadIdx.x'
sA2 = 'sA2'
for_i3 = "int i3=0, sA3=0;"
first_i3 = 0
sA3 = '0'
if nd_in == 4:
for_i1 = "for (int i1 = threadIdx.z; i1 < d1; i1 += blockDim.z)"
first_i1 = 'threadIdx.z'
sA1 = 'sA1'
for_i2 = "for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)"
first_i2 = 'threadIdx.y'
sA2 = 'sA2'
for_i3 = "for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)"
first_i3 = 'threadIdx.x'
sA3 = 'sA3'
reducebuf = self._k_reduce_buf('Z[i0 * sZ0]')
reducebuf = self._k_reduce_buf('Z[i0 * sZ0]', node, nodename, sub = {})
param_dim = ",".join(["const int d%(i)s" % locals()
for i in xrange(nd_in)])
param_strides = ",".join(["const int sA%(i)s" % locals()
for i in xrange(nd_in)])
decl = self._k_decl(node, nodename)
init = self._k_init(node, nodename)
print >> sio, """
%(decl)s{
%(init)s
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
mysum = 0;
%(for_i1)s{
%(for_i2)s{
%(for_i3)s{
float Ai = A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0];
mysum += Ai;
# TODO: ideally this would all be some clean function of scalar_op,
# but since sum is a special case where it's OK to reduce with an
# extra 0, I would need to change the behavior of the sum reduction
# code to do that. I don't want to benchmark and test changes to the
# sum code so I will leave that for later.
# max reduction is also a special case that is simple to implement.
# this is the special case where reduction is idempotent so it doesn't
# matter if we reduce with the first element multiple times.
if isinstance(self.scalar_op, scal.Add):
# special cased sum code (special case because starts the
# reduction with 0)
print >> sio, """
%(decl)s{
%(init)s
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
myresult = 0;
%(for_i1)s{
%(for_i2)s{
%(for_i3)s{
float Ai = A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0];
myresult += Ai;
}
}
}
%(reducebuf)s
}
}
%(reducebuf)s
}
}
""" % locals()
""" % locals()
elif isinstance(self.scalar_op, scal.Maximum):
# special cased max code (special case because visits first
# member of each row twice)
print >> sio, """
%(decl)s{
%(init)s
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
myresult = A[%(first_i3)s * %(sA3)s + %(first_i2)s * %(sA2)s + %(first_i1)s * %(sA1)s + i0 * sA0];
%(for_i1)s{
%(for_i2)s{
%(for_i3)s{
float Ai = A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0];
myresult = max(myresult, Ai);
}
}
}
%(reducebuf)s
}
}
""" % locals()
else:
# TODO: implement general case and get rid of the two special
# cases above
# it should initialize myresult to element 0,
# and the for loop should begin traversing from element 1
# raise an error if asked to reduce an empty dimension
# (maybe special-case sum to return 0 instead of returning an
# error)
# in both cases, benchmark the general case against the existing
# code to make sure it does not cause a slowdown
raise NotImplementedError()
if self.reduce_mask == (0, 1, 0) or self.reduce_mask == (1, 0):
self._op_guard()
# this kernel uses one block for each column,
# threads per block for each element per column.
#TODO: This kernel is pretty inefficient in terms of reading, because if A is
# c_contiguous (typical case) then each warp is accessing non-contigous
# memory (a segment of a column).
reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i2*sZ1]')
reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i2*sZ1]', node, nodename, sub = {})
print >> sio, """
static __global__ void kernel_reduce_sum_010_%(nodename)s(
static __global__ void kernel_reduce_010_%(nodename)s(
const int d0,
const int d1,
const int d2,
......@@ -1560,10 +1805,10 @@ class GpuSum(GpuOp):
{
for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
{
float mysum = 0.0f;
float myresult = 0.0f;
for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
{
mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2];
myresult += A[i0 * sA0 + i1 * sA1 + i2 * sA2];
}
%(reducebuf)s
}
......@@ -1572,8 +1817,9 @@ class GpuSum(GpuOp):
}
""" % locals()
if self.reduce_mask == (0, 1, 0):
self._op_guard()
print >> sio, """
static __global__ void kernel_reduce_sum_010_AD_%(nodename)s(
static __global__ void kernel_reduce_010_AD_%(nodename)s(
const int A,
const int B,
const int C,
......@@ -1585,7 +1831,7 @@ class GpuSum(GpuOp):
{
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
float mysum = 0.0f;
float myresult = 0.0f;
if (warpSize != 32)
{
......@@ -1599,12 +1845,12 @@ class GpuSum(GpuOp):
int c = i2_D * 32 + threadIdx.x;
if (c < C)
{
mysum = 0;
myresult = 0;
for (int b = 0; b < B; ++b)
{
mysum += X[a * sX0 + b * sX1 + c * sX2];
myresult += X[a * sX0 + b * sX1 + c * sX2];
}
Z[a * sZ0 + c * sZ1] = mysum;
Z[a * sZ0 + c * sZ1] = myresult;
}
}
}
......@@ -1612,6 +1858,7 @@ class GpuSum(GpuOp):
}
""" % locals()
if self.reduce_mask == (0, 1, 0):
self._op_guard()
#
# This kernel is optimized when the inner most dimensions
# have the smallest stride.
......@@ -1645,7 +1892,7 @@ class GpuSum(GpuOp):
{
for (int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y)
{
mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2];
myresult += A[i0 * sA0 + i1 * sA1 + i2 * sA2];
}
%(reducebuf)s
}
......@@ -1653,15 +1900,16 @@ class GpuSum(GpuOp):
}
""" % locals()
if self.reduce_mask == (1, 1, 0):
self._op_guard()
# this kernel uses one block for each column,
# threads per block for each element per column.
#TODO: This kernel is pretty inefficient in terms of reading, because if A is
# c_contiguous (typical case) then each warp is accessing non-contigous
# memory (a segment of a column).
reducebuf = self._k_reduce_buf('Z[blockIdx.x * sZ0]')
reducebuf = self._k_reduce_buf('Z[blockIdx.x * sZ0]', node, nodename, sub = {})
print >> sio, """
static __global__ void kernel_reduce_sum_110_%(nodename)s(
static __global__ void kernel_reduce_110_%(nodename)s(
const int d0,
const int d1,
const int d2,
......@@ -1672,7 +1920,7 @@ class GpuSum(GpuOp):
const int threadCount = blockDim.x * blockDim.y;
const int threadNum = threadIdx.y * blockDim.x + threadIdx.x;
extern __shared__ float buf[];
float mysum = 0.0f;
float myresult = 0.0f;
if (warpSize != 32)
{
......@@ -1686,7 +1934,7 @@ class GpuSum(GpuOp):
for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
{
float Ai = A[i0 * sA0 + i1 * sA1 + blockIdx.x * sA2];
mysum += Ai;
myresult += Ai;
}
}
......@@ -1694,7 +1942,9 @@ class GpuSum(GpuOp):
}
""" % locals()
if self.reduce_mask == (1, 0, 0):
reducebuf = self._k_reduce_buf('Z[i1 * sZ0 + i2 * sZ1]')
self._op_guard()
reducebuf = self._k_reduce_buf('Z[i1 * sZ0 + i2 * sZ1]',
node, nodename, sub={})
decl = self._k_decl(node, nodename)
init = self._k_init(node, nodename)
print >> sio, """
......@@ -1705,10 +1955,10 @@ class GpuSum(GpuOp):
{
for (int i1 = blockIdx.x; i1 < d1; i1 += gridDim.x)
{
mysum = 0;
myresult = 0;
for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
{
mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2];
myresult += A[i0 * sA0 + i1 * sA1 + i2 * sA2];
}
%(reducebuf)s
}
......@@ -1716,21 +1966,23 @@ class GpuSum(GpuOp):
}
""" % locals()
if self.reduce_mask == (1, 1, 1):
reducebuf = self._k_reduce_buf('Z[0]')
self._op_guard()
reducebuf = self._k_reduce_buf('Z[0]', node,
nodename, sub={})
decl = self._k_decl(node, nodename)
init = self._k_init(node, nodename)
print >> sio, """
%(decl)s
{
%(init)s
mysum = 0;
myresult = 0;
for (int i0 = threadIdx.z; i0 < d0; i0 += blockDim.z)
{
for (int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y)
{
for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x)
{
mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2];
myresult += A[i0 * sA0 + i1 * sA1 + i2 * sA2];
}
}
}
......@@ -1738,11 +1990,13 @@ class GpuSum(GpuOp):
}
""" % locals()
if self.reduce_mask == (0, 0, 1):
self._op_guard()
# this kernel uses one block for each row,
# threads per block for each element per row.
reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i1 * sZ1]')
reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i1 * sZ1]',
node, nodename, sub = {})
print >> sio, """
static __global__ void kernel_reduce_sum_001_%(nodename)s(
static __global__ void kernel_reduce_001_%(nodename)s(
const int d0,
const int d1,
const int d2,
......@@ -1763,10 +2017,10 @@ class GpuSum(GpuOp):
{
for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y)
{
float mysum = 0.0f;
float myresult = 0.0f;
for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x)
{
mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2];
myresult += A[i0 * sA0 + i1 * sA1 + i2 * sA2];
}
%(reducebuf)s
}
......@@ -1774,9 +2028,11 @@ class GpuSum(GpuOp):
}
""" % locals()
if self.reduce_mask == (0, 0, 1, 1):
self._op_guard()
# this kernel uses one block for each row,
# threads per block for each element per row.
reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i1 * sZ1]')
reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i1 * sZ1]',
node, nodename, sub = {})
decl = self._k_decl(node, nodename)
init = self._k_init(node, nodename)
print >> sio, """
......@@ -1788,12 +2044,12 @@ class GpuSum(GpuOp):
{
for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y)
{
float mysum = 0.0f;
float myresult = 0.0f;
for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)
{
for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
{
mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3];
myresult += A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3];
}
}
%(reducebuf)s
......@@ -1802,9 +2058,11 @@ class GpuSum(GpuOp):
}
""" % locals()
if self.reduce_mask == (0, 1, 0, 1):
self._op_guard()
# this kernel uses one block for each row,
# threads per block for each element per row.
reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i2 * sZ1]')
reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i2 * sZ1]',
node, nodename, sub = {})
decl = self._k_decl(node, nodename)
init = self._k_init(node, nodename)
print >> sio, """
......@@ -1816,12 +2074,12 @@ class GpuSum(GpuOp):
{
for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
{
float mysum = 0.0f;
float myresult = 0.0f;
for (int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y)
{
for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
{
mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3];
myresult += A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3];
}
}
%(reducebuf)s
......@@ -1830,14 +2088,16 @@ class GpuSum(GpuOp):
}
""" % locals()
if self.reduce_mask == (1, 1, 1, 1):
reducebuf = self._k_reduce_buf('Z[0]')
self._op_guard()
reducebuf = self._k_reduce_buf('Z[0]', node, nodename,
sub = {})
decl = self._k_decl(node, nodename)
init = self._k_init(node, nodename)
print >> sio, """
%(decl)s
{
%(init)s
mysum = 0;
myresult = 0;
for (int i0 = 0; i0 < d0; i0++)
for (int i1 = threadIdx.z; i1 < d1; i1 += blockDim.z)
{
......@@ -1845,7 +2105,7 @@ class GpuSum(GpuOp):
{
for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
{
mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3];
myresult += A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3];
}
}
}
......@@ -1853,9 +2113,11 @@ class GpuSum(GpuOp):
}
""" % locals()
if self.reduce_mask == (1, 0, 1, 1):
reducebuf = self._k_reduce_buf('Z[blockIdx.x*sZ0]')
self._op_guard()
reducebuf = self._k_reduce_buf('Z[blockIdx.x*sZ0]',
node, nodename, sub = {})
print >> sio, """
static __global__ void kernel_reduce_sum_1011_%(nodename)s(
static __global__ void kernel_reduce_1011_%(nodename)s(
const unsigned int d0,
const unsigned int d1,
const unsigned int d2,
......@@ -1867,7 +2129,7 @@ class GpuSum(GpuOp):
const int threadCount = blockDim.x * blockDim.y * blockDim.z;
const int threadNum = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
extern __shared__ float buf[];
float mysum = 0.0f;
float myresult = 0.0f;
if (warpSize != 32)
{
......@@ -1881,7 +2143,7 @@ class GpuSum(GpuOp):
for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
{
float Ai = A[i0 * sA0 + blockIdx.x * sA1 + i2 * sA2 + i3 * sA3];
mysum += Ai;
myresult += Ai;
}
}
}
......
......@@ -582,9 +582,12 @@ def local_gpu_gemm(node):
@register_opt()
@local_optimizer([])
def local_gpu_sum(node):
def local_gpu_careduce(node):
if isinstance(node.op, tensor.elemwise.CAReduce):
if node.op.scalar_op == scal.add:
scalar_op = node.op.scalar_op
# currently, only these two ops are supported at all,
# and max does not support all combinations of axes
if node.op.scalar_op in [scal.add, scal.maximum]:
x, = node.inputs
if x.owner and x.owner.op == host_from_gpu:
if node.op.axis is None:
......@@ -594,22 +597,21 @@ def local_gpu_sum(node):
for a in node.op.axis:
assert reduce_mask[a] == 0
reduce_mask[a] = 1
gsum = GpuSum(reduce_mask)
pattern = (''.join(str(i) for i in reduce_mask))
if hasattr(gsum, 'c_code_reduce_%s' % pattern):
rval = host_from_gpu(gsum(gpu_from_host(x)))
greduce = GpuCAReduce(reduce_mask, scalar_op)
if greduce.supports_c_code([gpu_from_host(x)]):
rval = host_from_gpu(greduce(gpu_from_host(x)))
if rval.type == node.outputs[0].type:
return [rval]
else:
print >> sys.stderr, \
"WARNING: local_gpu_sum got type wrong"
"WARNING: local_gpu_careduce got type wrong"
return None
else:
# Try to make a simpler pattern based on reshaping
# The principle is that if two adjacent dimensions have
# the same value in the reduce_mask, then we can reshape
# to make them a single dimension, do the sum, and then
# to make them a single dimension, do the reduction, and then
# reshape to get them back.
shape_of = node.fgraph.shape_feature.shape_of
......@@ -625,27 +627,28 @@ def local_gpu_sum(node):
new_mask.append(reduce_mask[i])
new_in_shp.append(x_shape[i])
pattern = (''.join(str(i) for i in new_mask))
new_gsum = GpuSum(new_mask)
if hasattr(new_gsum, 'c_code_reduce_%s' % pattern):
reshaped_x = x.reshape(tensor.stack(*new_in_shp))
sum_reshaped_x = host_from_gpu(
new_gsum(gpu_from_host(reshaped_x)))
new_greduce = GpuCAReduce(new_mask, scalar_op)
reshaped_x = x.reshape(tensor.stack(*new_in_shp))
gpu_reshaped_x = gpu_from_host(reshaped_x)
reshaped_gpu_inputs = [ gpu_reshaped_x ]
if new_greduce.supports_c_code(reshaped_gpu_inputs):
reduce_reshaped_x = host_from_gpu(
new_greduce(gpu_reshaped_x))
if sum_reshaped_x.ndim != node.outputs[0].ndim:
unreshaped_sum = sum_reshaped_x.reshape(
if reduce_reshaped_x.ndim != node.outputs[0].ndim:
unreshaped_reduce = reduce_reshaped_x.reshape(
tensor.stack(*shape_of[node.outputs[0]]))
else:
unreshaped_sum = sum_reshaped_x
if unreshaped_sum.type == node.outputs[0].type:
return [unreshaped_sum]
unreshaped_reduce = reduce_reshaped_x
if unreshaped_reduce.type == node.outputs[0].type:
return [unreshaped_reduce]
else:
print >> sys.stderr, \
"WARNING: local_gpu_sum got type wrong"
"WARNING: local_gpu_careduce got type wrong"
return None
raise Exception(
"GpuSum don't have implemented the pattern",
"GpuCAReduce does not yet implement this pattern:",
pattern)
return False
......
......@@ -9,6 +9,7 @@ from theano import tensor
import numpy
import theano
import theano.tensor as T
from numpy.testing.noseclasses import KnownFailureTest
# Skip test if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest
......@@ -108,7 +109,7 @@ def test_sum():
val = theano._asarray(val, dtype='float32')
f = theano.function([a], b, mode=mode_with_gpu)
f2 = theano.function([a], b, mode=mode_without_gpu)
assert tcn.GpuSum in [x.op.__class__ for x in f.maker.fgraph.toposort()]
assert tcn.GpuCAReduce in [x.op.__class__ for x in f.maker.fgraph.toposort()]
assert T.Sum in [x.op.__class__ for x in f2.maker.fgraph.toposort()]
if val.size == 0:
assert f2(val) == f(val), ('shape', shape, 'pattern', pattern)
......@@ -145,7 +146,7 @@ def test_sum():
val = theano._asarray(val, dtype='float32')
f = theano.function([a], b, mode=mode_with_gpu)
f2 = theano.function([a], b, mode=mode_without_gpu)
assert tcn.GpuSum in [x.op.__class__ for x in f.maker.fgraph.toposort()]
assert tcn.GpuCAReduce in [x.op.__class__ for x in f.maker.fgraph.toposort()]
assert T.Sum in [x.op.__class__ for x in f2.maker.fgraph.toposort()]
assert _allclose(f2(val), f(val)), ('shape', shape,
'pattern', pattern,
......@@ -181,12 +182,244 @@ def test_sum():
val2 = val2[::2, ::2, ::2, ::2]
f = theano.function([a], b, mode=mode_without_gpu)
f2 = theano.function([a2], b2, mode=mode_with_gpu)
assert tcn.GpuSum in [x.op.__class__ for x in f2.maker.fgraph.toposort()]
assert tcn.GpuCAReduce in [x.op.__class__ for x in f2.maker.fgraph.toposort()]
assert T.Sum in [x.op.__class__ for x in f.maker.fgraph.toposort()]
assert _allclose(f2(val2), f(val)), ('shape', shape,
'pattern', pattern,
sum([shape[i] for i in pattern]))
def test_max():
"""
test GpuMax pattern 01, 011, 0111 (tensor.max pattern (1,), (1,2), (1,2,3) )
TODO: are others currently implemented by reshape?
"""
def tensor_pattern_to_gpu_pattern(shape, pattern):
gpu_pattern = [ 0 for elem in shape ]
for idx in pattern:
gpu_pattern[idx] = 1
gpu_pattern = tuple(gpu_pattern)
return gpu_pattern
known_fail = False
for shape, pattern in [((1,1),(1,)),
((1,0),(1,)),
((0,1),(1,)),
((0,0),(1,)),
((0,0,0),(1,2)),
((0,0,0,0),(1,2,3)),
((2,1),(1,)),
((1,2),(1,)),
((100,3,1300),[1]),
((0,),[0]),((5,),[0]),
((0,0),[0,1]),((1,0),[0,1]),((5,4),[0,1]),((33,31),[0,1]),((5,4),[1]),((5,4),[0]),#need something bigger then 32 for some opt test.
((5,4,3),[0]),((5,4,3),[1]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[1,2]),((5,4,3),[0,1,2]),
((0,0,0,0),[0,1,2,3]),
((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]),((5,4,3,2),[1,2,3]),
((5,4,3,10,11),[1,2]),
((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]),((5,4,3,2),[1,2,3]),
#test shape bigger then 4096 on each dimension to make sure that we work correctly when we don't have enough thread/block in each dimensions
((4100,3),[0]),((3,4101),[0]),#10
((1024,33),[0]),((33,1024),[0]),#10
((1025,33),[0]),((33,1025),[0]),#10
((4100,3),[1]),((3,4101),[1]),#01
((1024,33),[1]),((33,1024),[1]),#01
((1025,33),[1]),((33,1025),[1]),#01
((4100,3),[0,1]),((3,4101),[0,1]),#11
((1024,33),[0,1]),((33,1024),[0,1]),#01
((1025,33),[0,1]),((33,1025),[0,1]),#01
((4100,4,3),[0]),((5,4100,3),[0]),((5,4,4100),[0]),#100
((4100,4,3),[1]),((5,4100,3),[1]),((5,4,4100),[1]),#010
((4100,4,3),[2]),((5,4100,3),[2]),((5,4,4100),[2]),#001
((4100,4,3),[0,1]),((5,4100,3),[0,1]),((5,4,4100),[0,1]),#110
((4100,4,3),[1,2]),((5,4100,3),[1,2]),((5,4,4100),[1,2]),#011
#((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented
((4100,4,3),[0,1,2]),((5,4100,3),[0,1,2]),((5,4,4100),[0,1,2]),#111
((4100,4,3,2),[2,3]),((4,4100,3,2),[2,3]),((4,3,4100,2),[2,3]),((4,3,2,4100),[2,3]),#0011
((4100,4,3,2),[1,3]),((4,4100,3,2),[1,3]),((4,3,4100,2),[1,3]),((4,3,2,4100),[1,3]),#0101
((4100,4,3,2),[0,2,3]),((4,4100,3,2),[0,2,3]),((4,3,4100,2),[0,2,3]),#((4,3,2,4100),[0,2,3]),#1011
((4100,4,3,2),[1,2,3]),((4,4100,3,2),[1,2,3]),((4,3,4100,2),[1,2,3]),((4,3,2,4100),[1,2,3]),#0111
((4100,2,3,4),[0,1,2,3]),((2,4100,3,4),[0,1,2,3]),((2,3,4100,4),[0,1,2,3]),((2,3,4,4100),[0,1,2,3]),#1111
#test pattern implemented by reshape
((4100,4,3,2),[0]),((4,4100,3,2),[0]),((4,3,4100,2),[0]),((4,3,2,4100),[0]),#1000
((4100,4,3,2),[1]),((4,4100,3,2),[1]),((4,3,4100,2),[1]),((4,3,2,4100),[1]),#0100
((4100,4,3,2),[2]),((4,4100,3,2),[2]),((4,3,4100,2),[2]),((4,3,2,4100),[2]),#0010
((4100,4,3,2),[3]),((4,4100,3,2),[3]),((4,3,4100,2),[3]),((4,3,2,4100),[3]),#0001
((1100,2,3,4,5),[0,1,2,3,4]),((2,1100,3,4,5),[0,1,2,3,4]),((2,3,1100,4,5),[0,1,2,3,4]),((2,3,4,1100,5),[0,1,2,3,4]),((2,3,4,5,1100),[0,1,2,3,4]),#11111
]:
# Don't test patterns that aren't implemented for max yet
if tensor_pattern_to_gpu_pattern(shape, pattern) not in \
[ (0,1), (0,1,1), (0,1,1) ]:
continue
a = tensor.TensorType('float32', (False,) * len(shape))()
b = T.max(a, pattern)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
val = theano._asarray(val, dtype='float32')
f = theano.function([a], b, mode=mode_with_gpu)
f_caused_value_error = False
try:
f_out = f(val)
except ValueError, e:
exc = e
f_caused_value_error = True
except RuntimeError:
if (shape, pattern) in [((1,0),(1,)),
((0,1),(1,)),
((0,0),(1,)),
((0,0,0),(1,2)),
((0,0,0,0),(1,2,3))]:
known_fail = True
continue
else:
raise
f2 = theano.function([a], b, mode=mode_without_gpu)
try:
f2_out = f2(val)
f2_caused_value_error = False
except ValueError, e:
exc2 = e
f2_caused_value_error = True
assert tcn.GpuCAReduce in [x.op.__class__ for x in f.maker.fgraph.toposort()]
assert T.CAReduce in [x.op.__class__ for x in f2.maker.fgraph.toposort()]
# Check that 0 shape matrices are invalid in the same cases
if f_caused_value_error != f2_caused_value_error:
if f_caused_value_error:
print 'f caused this value error:'
print exc
else:
print 'f did not raise a value error, but should have'
if f2_caused_value_error:
print 'f2 caused this value error:'
print exc2
else:
print 'f should not have raised a value error'
print 'shape was: ',shape
print 'pattern was: ',pattern
assert False
if f_caused_value_error:
continue
if val.size == 0:
assert f2(val).size == f(val).size
assert f2(val).shape == f(val).shape
else:
try:
#We raise the error threashold as we sum big matrix
#and this cause small rounding difference with some seed
#example in debug mode with unittests.rseed=9275
orig_rtol = theano.tensor.basic.float32_rtol
theano.tensor.basic.float32_rtol = 2e-5
f2_val = f2(val)
f_val = f(val)
if not _allclose(f2_val, f_val):
print 'failed for the following arguments: '
print 'shape:',shape
print 'pattern: ',pattern
print 'input:'
print val
print 'correct output: '
print f2_val
print 'actual output: '
print f_val
assert False
finally:
theano.tensor.basic.float32_rtol = orig_rtol
#test with dimshuffle
#we shuffle the 2 outer dims.
for shape, pattern in [#((5,),[0]),
((5,4),(0,1)),((5,4),[0]),
((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[0,1,2]),
((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]:
# Don't test patterns that aren't implemented for max yet
if tensor_pattern_to_gpu_pattern(shape, pattern) not in \
[ (0,1), (0,1,1), (0,1,1) ]:
continue
a = tensor.TensorType('float32', (False,) * len(shape))()
dim_pattern = range(len(shape))
dim_pattern[0] = 1
dim_pattern[1] = 0
a = a.dimshuffle(dim_pattern)
b = T.max(a, pattern)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
val = theano._asarray(val, dtype='float32')
f = theano.function([a], b, mode=mode_with_gpu)
f2 = theano.function([a], b, mode=mode_without_gpu)
assert tcn.GpuCAReduce in [x.op.__class__ for x in f.maker.fgraph.toposort()]
assert T.CAReduce in [x.op.__class__ for x in f2.maker.fgraph.toposort()]
assert _allclose(f2(val), f(val)), ('shape', shape,
'pattern', pattern,
sum([shape[i] for i in pattern]))
#test with broadcast
for shape, pattern in [((5,),(0,)),
((5,4),(0,1)),
((5,4),(0,)),
((5,4,3),(0,)),
((5,4,3),(0,1)),
((5,4,3),(2,)),
((5,4,3),(0,1,2)),
((5,4,3,2),(0,1,2,3)),
((5,4,3,2),(0,2,3))]:
# Don't test patterns that aren't implemented for max yet
if tensor_pattern_to_gpu_pattern(shape, pattern) not in \
[ (0,1), (0,1,1), (0,1,1) ]:
continue
shape = numpy.asarray(shape) * 2
a = tensor.TensorType('float32', (False,) * len(shape))()
a2 = tcn.CudaNdarrayType((False,) * len(shape))()
b = T.max(a, pattern)
b2 = T.max(a2, pattern)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
val = theano._asarray(val, dtype='float32')
val2 = cuda.CudaNdarray(val)
if len(shape) == 1:
val = val[::2]
val2 = val2[::2]
elif len(shape) == 2:
val = val[::2, ::2]
val2 = val2[::2, ::2]
elif len(shape) == 3:
val = val[::2, ::2, ::2]
val2 = val2[::2, ::2, ::2]
elif len(shape) == 4:
val = val[::2, ::2, ::2, ::2]
val2 = val2[::2, ::2, ::2, ::2]
f = theano.function([a], b, mode=mode_without_gpu)
f2 = theano.function([a2], b2, mode=mode_with_gpu)
assert tcn.GpuCAReduce in [x.op.__class__ for x in f2.maker.fgraph.toposort()]
assert T.CAReduce in [x.op.__class__ for x in f.maker.fgraph.toposort()]
assert _allclose(f2(val2), f(val)), ('shape', shape,
'pattern', pattern,
sum([shape[i] for i in pattern]))
if known_fail:
raise KnownFailureTest("GpuCAReduce does not handle some shapes"
" with 0s in them correctly.")
def test_flatten():
x = cuda.fmatrix('x')
......
......@@ -28,7 +28,10 @@ def test_nvidia_driver1():
profile=False)
topo = f.maker.fgraph.toposort()
assert len(topo) == 2
assert sum(isinstance(node.op, B.GpuSum) for node in topo) == 1
if sum(isinstance(node.op, B.GpuCAReduce) for node in topo) != 1:
msg = '\n\t'.join(['Expected exactly one occurrence of GpuCAReduce ' +
'but got:']+[str(app) for app in topo])
raise AssertionError(msg)
if not numpy.allclose(f(), a.sum()):
raise Exception("The nvidia driver version installed with this OS "
"does not give good results for reduction."
......
......@@ -44,11 +44,11 @@ def test_int_pow():
f = theano.function([a], (a*4).sum(), mode=mode_with_gpu)
op_names = [n.op.__class__.__name__ for n in f.maker.fgraph.toposort()]
assert op_names == ['GpuSum', 'GpuElemwise', 'HostFromGpu']
assert op_names == ['GpuCAReduce', 'GpuElemwise', 'HostFromGpu']
f = theano.function([a], tensor.pow(a,4).sum(), mode=mode_with_gpu)
op_names = [n.op.__class__.__name__ for n in f.maker.fgraph.toposort()]
assert op_names == ['GpuElemwise', 'GpuSum', 'HostFromGpu']
assert op_names == ['GpuElemwise', 'GpuCAReduce', 'HostFromGpu']
#theano.printing.debugprint(f)
......
......@@ -1159,6 +1159,7 @@ class Maximum(BinaryScalarOp):
gx = eq(output, x) * gz
gy = eq(output, y) * gz
return (gx, gy)
maximum = Maximum(upcast_out, name='maximum')
......@@ -1187,7 +1188,6 @@ class Minimum(BinaryScalarOp):
gx = eq(output, x) * gz
gy = eq(output, y) * gz
return (gx, gy)
minimum = Minimum(upcast_out, name='minimum')
......@@ -1222,6 +1222,8 @@ class Add(ScalarOp):
for i in inputs:
retval += [gz]
return retval
add = Add(upcast_out, name='add')
......
......@@ -1082,14 +1082,16 @@ class Elemwise(Op):
class CAReduce(Op):
"""
CAReduce = Commutative Associative Reduce
Reduces a scalar operation along the specified axis(es).
(The scalar op should be both commutative and assocative)
The output will have the same shape as the input minus the reduced
dimensions. It will contain the variable of accumulating all values
over the reduced dimensions using the specified scalar op.
Examples:
CAReduce(add) -> sum
CAReduce(add) -> sum (ie, acts like the numpy sum operation)
CAReduce(mul) -> product
CAReduce(maximum) -> max
CAReduce(minimum) -> min
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论