Merge pull request #3743 from nouiz/reduce_101_newbackend

Add the 101 reduction pattern (reduce axes 0 and 2 of a 3d input) to GpuCAReduceCuda in the gpuarray backend

Merge pull request #3743 from nouiz/reduce_101_newbackend
6ec28cf2 · abergeron · 9f696804 · 165b6c23 · 6ec28cf2 · 6ec28cf2
--- a/theano/gof/op.py
+++ b/theano/gof/op.py
@@ -714,7 +714,7 @@ class PureOp(object):
            "own op, implement the R_op method." %
            (self, self.__class__.__name__))

-    def perform(self, node, inputs, output_storage):
+    def perform(self, node, inputs, output_storage, params=None):
        """
        Required: Calculate the function on the inputs and put the variables in
        the output storage. Return None.
@@ -746,7 +746,10 @@ class PureOp(object):
            The subclass does not override this method.

        """
-        raise utils.MethodNotDefined("perform", type(self), self.__class__.__name__)
+        raise utils.MethodNotDefined(
+            "perform", type(self), self.__class__.__name__,
+            "Did you used Theano flags mode=FAST_COMPILE?"
+            " You can use optimizer=fast_compile instead.")

    def do_constant_folding(self, node):
        """

--- a/theano/sandbox/cuda/tests/test_basic_ops.py
+++ b/theano/sandbox/cuda/tests/test_basic_ops.py
@@ -117,14 +117,6 @@ def test_careduce():
                               ((4100, 4, 3), [0, 1, 2]), ((5, 4100, 3), [0, 1, 2]), ((5, 4, 4100), [0, 1, 2]),  # 111
                               ((65, 4, 3), [0, 1, 2]), ((5, 65, 3), [0, 1, 2]), ((5, 4, 65), [0, 1, 2]),  # 111

-                               ((4100, 4, 3, 2), [2, 3]), ((4, 4100, 3, 2), [2, 3]), ((4, 3, 4100, 2), [2, 3]), ((4, 3, 2, 4100), [2, 3]),  # 0011
-                               ((4100, 4, 3, 2), [1, 3]), ((4, 4100, 3, 2), [1, 3]), ((4, 3, 4100, 2), [1, 3]), ((4, 3, 2, 4100), [1, 3]),  # 0101
-                               ((4100, 4, 3, 2), [0, 2, 3]), ((4, 4100, 3, 2), [0, 2, 3]), ((4, 3, 4100, 2), [0, 2, 3]),  # ((4,3,2,4100),[0,2,3]),#1011
-                               ((4100, 4, 3, 2), [1, 2, 3]), ((4, 4100, 3, 2), [1, 2, 3]), ((4, 3, 4100, 2), [1, 2, 3]), ((4, 3, 2, 4100), [1, 2, 3]),  # 0111
-                               ((65, 4, 3, 2), [1, 2, 3]), ((4, 65, 3, 2), [1, 2, 3]), ((4, 3, 65, 2), [1, 2, 3]), ((4, 3, 2, 65), [1, 2, 3]),  # 0111
-                               ((4100, 2, 3, 4), [0, 1, 2, 3]), ((2, 4100, 3, 4), [0, 1, 2, 3]), ((2, 3, 4100, 4), [0, 1, 2, 3]), ((2, 3, 4, 4100), [0, 1, 2, 3]), ((128, 1, 3, 3), [0, 1, 2, 3]),  # 1111
-
-
                               # test pattern implemented by reshape
                               ((4100, 4, 3, 2), [0]), ((4, 4100, 3, 2), [0]), ((4, 3, 4100, 2), [0]), ((4, 3, 2, 4100), [0]),  # 1000
                               ((4100, 4, 3, 2), [1]), ((4, 4100, 3, 2), [1]), ((4, 3, 4100, 2), [1]), ((4, 3, 2, 4100), [1]),  # 0100
@@ -132,6 +124,8 @@ def test_careduce():
                               ((4100, 4, 3, 2), [3]), ((4, 4100, 3, 2), [3]), ((4, 3, 4100, 2), [3]), ((4, 3, 2, 4100), [3]),  # 0001

                               # reduce over 2d
+                               ((4100, 4, 3, 2), [2, 3]), ((4, 4100, 3, 2), [2, 3]), ((4, 3, 4100, 2), [2, 3]), ((4, 3, 2, 4100), [2, 3]),  # 0011
+                               ((4100, 4, 3, 2), [1, 3]), ((4, 4100, 3, 2), [1, 3]), ((4, 3, 4100, 2), [1, 3]), ((4, 3, 2, 4100), [1, 3]),  # 0101
                               ((4100, 4, 3, 2), [1, 2]), ((4, 4100, 3, 2), [1, 2]), ((4, 3, 4100, 2), [1, 2]), ((4, 3, 2, 4100), [1, 2]),  # 0110
                               ((4100,4,3,2),[0,3]),((4,4100,3,2),[0,3]),((4,3,4100,2),[0,3]),((4,3,2,4100),[0,3]),#1001
 #                               ((4100,4,3,2),[0,2]),((4,4100,3,2),[0,2]),((4,3,4100,2),[0,2]),((4,3,2,4100),[0,2]),#1010 not implemented
@@ -141,9 +135,12 @@ def test_careduce():
                               # 3d not tested: 1101, 1110, 1111
                               ((4100,4,3,2),[0,1,3]),((4,4100,3,2),[0,1,3]),((4,3,4100,2),[0,1,3]),((4,3,2,4100),[0,1,3]),#1101
                               ((4100, 4, 3, 2), [0, 1, 2]), ((4, 4100, 3, 2), [0, 1, 2]), ((4, 3, 4100, 2), [0, 1, 2]), ((4, 3, 2, 4100), [0, 1, 2]),  # 1110
+                               ((4100, 4, 3, 2), [0, 2, 3]), ((4, 4100, 3, 2), [0, 2, 3]), ((4, 3, 4100, 2), [0, 2, 3]),  # ((4,3,2,4100),[0,2,3]),#1011
+                               ((4100, 4, 3, 2), [1, 2, 3]), ((4, 4100, 3, 2), [1, 2, 3]), ((4, 3, 4100, 2), [1, 2, 3]), ((4, 3, 2, 4100), [1, 2, 3]),  # 0111
+                               ((65, 4, 3, 2), [1, 2, 3]), ((4, 65, 3, 2), [1, 2, 3]), ((4, 3, 65, 2), [1, 2, 3]), ((4, 3, 2, 65), [1, 2, 3]),  # 0111

                               # reduce over 4d
-                               ((4100, 4, 3, 2), [0]), ((4, 4100, 3, 2), [0]), ((4, 3, 4100, 2), [0]), ((4, 3, 2, 4100), [0]),  # 1111
+                               ((4100, 2, 3, 4), [0, 1, 2, 3]), ((2, 4100, 3, 4), [0, 1, 2, 3]), ((2, 3, 4100, 4), [0, 1, 2, 3]), ((2, 3, 4, 4100), [0, 1, 2, 3]), ((128, 1, 3, 3), [0, 1, 2, 3]),  # 1111

                               # reduce over 5d
                               ((1100, 2, 3, 4, 5), [0, 1, 2, 3, 4]), ((2, 1100, 3, 4, 5), [0, 1, 2, 3, 4]), ((2, 3, 1100, 4, 5), [0, 1, 2, 3, 4]), ((2, 3, 4, 1100, 5), [0, 1, 2, 3, 4]), ((2, 3, 4, 5, 1100), [0, 1, 2, 3, 4]),  # 11111

--- a/theano/sandbox/gpuarray/elemwise.py
+++ b/theano/sandbox/gpuarray/elemwise.py
@@ -3,6 +3,7 @@ import copy
 from theano.compat import izip
 import numpy

+import theano
 from theano import Apply, scalar, config
 from theano import scalar as scal
 from six.moves import StringIO, xrange
@@ -654,7 +655,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
        return node.inputs[0].type.context

    def perform(self, node, inp, out, ctx):
-        raise MethodNotDefined("")
+        theano.Op.perform(self, node, inp, out, ctx)

    def supports_c_code(self, inputs):
        """
@@ -833,7 +834,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):

        return sio.getvalue()

-    def _makecall(self, node, name, x, z, fail, pattern=None):
+    def _makecall(self, node, name, x, z, fail, pattern=None, extra_dims=(), extra_strides=()):
        """
        Return a string for making a kernel call.

@@ -876,6 +877,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):

        for i in xrange(ndim):
            params.append("(void *)&PyGpuArray_DIMS(%(x)s)[%(i)s]" % locals())
+        for declaration, value in extra_dims:
+            print(declaration % locals(), file=sio)
+            params.append(value)
        params.append("(void *)%(x)s->ga.data" % locals())
        params.append("(void *)&%(x)s->ga.offset" % locals())
        for i in xrange(ndim):
@@ -883,6 +887,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            ssize_t stride_A%(i)d = PyGpuArray_STRIDES(%(x)s)[%(i)s]/sizeof(%(in_dtype)s);
            """ % locals(), file=sio)
            params.append("(void *)&stride_A%(i)d" % locals())
+        for declaration, value in extra_strides:
+            print(declaration % locals(), file=sio)
+            params.append(value)

        params.append("(void *)%(z)s->ga.data" % locals())
        params.append("(void *)&%(z)s->ga.offset" % locals())
@@ -1779,6 +1786,34 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
        }
        """ % locals(), file=sio)

+    def c_code_reduce_101(self, sio, node, name, x, z, fail):
+        makecall = self._makecall(node, name, x, z, fail,
+                                  extra_dims=[("size_t one = 1;", "(void *) &one")],
+                                  extra_strides=[("ssize_t sone = 1;", "(void *) &sone")],
+                                  pattern="1011")
+        print("""
+        {
+            int verbose = 0;
+//            size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[3],
+//                                            (size_t) 256), 1, 1};
+            size_t n_threads[3] = {1, 1, 1};
+
+            while (n_threads[0] * (n_threads[1]+1) <= 256) ++n_threads[1];
+            if (n_threads[1] > PyGpuArray_DIMS(%(x)s)[2])
+                n_threads[1] = PyGpuArray_DIMS(%(x)s)[2];
+
+            while (n_threads[0] * n_threads[1] * (n_threads[2]+1) <= 256)
+                ++n_threads[2];
+            if (n_threads[2] > 64)
+                n_threads[2] = 64;
+            if (n_threads[2] > PyGpuArray_DIMS(%(x)s)[0])
+                n_threads[2] = PyGpuArray_DIMS(%(x)s)[0];
+
+            size_t n_blocks[3] = {PyGpuArray_DIMS(%(x)s)[1], 1, 1};
+            %(makecall)s
+        }
+        """ % locals(), file=sio)
+
    def c_code_reduce_111(self, sio, node, name, x, z, fail):
        makecall = self._makecall(node, name, x, z, fail)
        print("""
@@ -2572,7 +2607,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            """ % locals(), file=sio)
            kernels.append(Kernel(code=sio.getvalue(), name=kname,
                                  params=params, flags=flags, objvar=k_var))
-        if self.reduce_mask == (1, 0, 1, 1):
+        if self.reduce_mask == (1, 0, 1, 1) or self.reduce_mask == (1, 0, 1):
            reducebuf = self._k_reduce_buf('Z[blockIdx.x*sZ0]',
                                           node, nodename, sub={})
            reduce_fct = self._assign_reduce(node, nodename, "myresult",

--- a/theano/sandbox/gpuarray/tests/test_elemwise.py
+++ b/theano/sandbox/gpuarray/tests/test_elemwise.py
@@ -83,6 +83,7 @@ class test_GpuCAReduceCPY(test_elemwise.test_CAReduce):
    op = GpuCAReduceCPY
    reds = [scalar.add, scalar.mul]
    pre_scalar_op = None
+    mode = mode_with_gpu

    def test_perform(self):
        for dtype in self.dtypes + self.bin_dtypes:
@@ -172,16 +173,28 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
             ((4100, 4, 3), [2]), ((5, 4100, 3), [2]), ((5, 4, 4100), [2]),  # 001
             ((4100, 4, 3), [0, 1]), ((5, 4100, 3), [0, 1]), ((5, 4, 4100), [0, 1]),  # 110
             ((4100, 4, 3), [1, 2]), ((5, 4100, 3), [1, 2]), ((5, 4, 4100), [1, 2]),  # 011
-             # ((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented
+             ((4100, 4, 3), [0, 2]), ((5, 4100, 3), [0, 2]), ((5, 4, 4100), [0, 2]),  # 101
             ((4100, 4, 3), [0, 1, 2]), ((5, 4100, 3), [0, 1, 2]), ((5, 4, 4100), [0, 1, 2]),  # 111
             ((65, 4, 3), [0, 1, 2]), ((5, 65, 3), [0, 1, 2]), ((5, 4, 65), [0, 1, 2]),  # 111

+             # reduce over 2d
             ((4100, 4, 3, 2), [2, 3]), ((4, 4100, 3, 2), [2, 3]), ((4, 3, 4100, 2), [2, 3]), ((4, 3, 2, 4100), [2, 3]),  # 0011
             ((4100, 4, 3, 2), [1, 3]), ((4, 4100, 3, 2), [1, 3]), ((4, 3, 4100, 2), [1, 3]), ((4, 3, 2, 4100), [1, 3]),  # 0101
-             ((4100, 4, 3, 2), [0, 2, 3]), ((4, 4100, 3, 2), [0, 2, 3]), ((4, 3, 4100, 2), [0, 2, 3]),  # ((4,3,2,4100),[0,2,3]),#1011
+             # ((4100, 4, 3, 2), [1, 2]), ((4, 4100, 3, 2), [1, 2]), ((4, 3, 4100, 2), [1, 2]), ((4, 3, 2, 4100), [1, 2]),  # 0110 by reshape
+             # ((4100,4,3,2),[0,3]),((4,4100,3,2),[0,3]),((4,3,4100,2),[0,3]),((4,3,2,4100),[0,3]),  # 1001 by reshape
+             # ((4100,4,3,2),[0,2]),((4,4100,3,2),[0,2]),((4,3,4100,2),[0,2]),((4,3,2,4100),[0,2]),  # 1010 not implemented
+             # ((4100, 4, 3, 2), [0, 1]), ((4, 4100, 3, 2), [0, 1]), ((4, 3, 4100, 2), [0, 1]), ((4, 3, 2, 4100), [0, 1]),  # 1100 by reshape
+
+             # reduce over 3d
+             # Patterns 1101 and 1110 are not tested here: they are implemented by reshape
+             # ((4100,4,3,2),[0,1,3]),((4,4100,3,2),[0,1,3]),((4,3,4100,2),[0,1,3]),((4,3,2,4100),[0,1,3]),  # 1101 by reshape
+             # ((4100, 4, 3, 2), [0, 1, 2]), ((4, 4100, 3, 2), [0, 1, 2]), ((4, 3, 4100, 2), [0, 1, 2]), ((4, 3, 2, 4100), [0, 1, 2]),  # 1110 by reshape
+             ((4100, 4, 3, 2), [0, 2, 3]), ((4, 4100, 3, 2), [0, 2, 3]), ((4, 3, 4100, 2), [0, 2, 3]),  # ((4,3,2,4100),[0,2,3]),  # 1011
             ((4100, 4, 3, 2), [1, 2, 3]), ((4, 4100, 3, 2), [1, 2, 3]), ((4, 3, 4100, 2), [1, 2, 3]), ((4, 3, 2, 4100), [1, 2, 3]),  # 0111
             ((65, 4, 3, 2), [1, 2, 3]), ((4, 65, 3, 2), [1, 2, 3]), ((4, 3, 65, 2), [1, 2, 3]), ((4, 3, 2, 65), [1, 2, 3]),  # 0111
-             ((4100, 2, 3, 4), [0, 1, 2, 3]), ((2, 4100, 3, 4), [0, 1, 2, 3]), ((2, 3, 4100, 4), [0, 1, 2, 3]), ((2, 3, 4, 4100), [0, 1, 2, 3]), ((128, 1, 2, 3), [0, 1, 2, 3]),  # 1111
+
+             # reduce over 4d
+             ((4100, 2, 3, 4), [0, 1, 2, 3]), ((2, 4100, 3, 4), [0, 1, 2, 3]), ((2, 3, 4100, 4), [0, 1, 2, 3]), ((2, 3, 4, 4100), [0, 1, 2, 3]), ((128, 1, 3, 3), [0, 1, 2, 3]),  # 1111

             # test pattern implemented by reshape
             # Skip them as this test the op directly, not the optimization with reshape