Merge pull request #1952 from nouiz/gpu_sum

Move sum to the GPU more frequently, and fix crashes

Merge pull request #1952 from nouiz/gpu_sum
ab206dc1 · abergeron · 548729f1 · 18dc8c0e · ab206dc1 · ab206dc1
--- a/doc/extending/ctype.txt
+++ b/doc/extending/ctype.txt
@@ -571,7 +571,10 @@ the elements of the shape).
 .. code-block:: python
   theano.compile.ops.register_shape_c_code(YOUR_TYPE_CLASS, THE_C_CODE, version=())
-   theano.compile.ops.register_shape_i_c_code(YOUR_TYPE_CLASS, THE_C_CODE, version=())
+   theano.compile.ops.register_shape_i_c_code(YOUR_TYPE_CLASS, THE_C_CODE, CHECK_INPUT, version=())
 The C code works as the ViewOp. Shape_i has the additional ``i`` parameter
 that you can use with ``%(i)s``.
+In your CHECK_INPUT, you must check that the input has enough dimensions
+(``ndim``) for the ``i``-th shape entry to exist.
--- a/theano/compile/ops.py
+++ b/theano/compile/ops.py
@@ -349,13 +349,13 @@ class Shape_i(gof.Op):
        version = []
        # If any of the c code is unversionned, we have to return ()
        # Else, we will return a list of (type name, version) pairs.
-        for t, (c, v) in sorted(self.c_code_and_version.items(),
+        for t, (c, ci, v) in sorted(self.c_code_and_version.items(),
-                                key=lambda pair: str(pair[0])):
+                                    key=lambda pair: str(pair[0])):
            if not v:
                warnings.warn("Type %s has C code for Shape_i, but it has "
-                        "no version. You should add a 'version' keyword arg "
+                              "no version. You should add a 'version' keyword "
-                        "when calling register_shape_i_c_code." % t,
+                              "arg when calling register_shape_i_c_code." % t,
-                        stacklevel=2)
+                              stacklevel=2)
                return ()
            version.append((str(t), v))
@@ -372,14 +372,8 @@ class Shape_i(gof.Op):
        itype = node.inputs[0].type.__class__
        if itype in self.c_code_and_version:
-            sc = """
+            code, check_input, version = self.c_code_and_version[itype]
-            if (%(i)s>=PyArray_NDIM(%(iname)s)){
+            return (check_input + code) % locals()
-                PyErr_SetString(PyExc_TypeError, "Number of dimensions lower than expected");
-                %(fail)s
-            }
-            """ % locals()
-            code, version = self.c_code_and_version[itype]
-            return sc + code % locals()
        # Else, no C code
        return super(Shape_i, self).c_code(node, name, inames, onames, sub)
@@ -391,7 +385,7 @@ class Shape_i(gof.Op):
        return [None]
-def register_shape_i_c_code(typ, code, version=()):
+def register_shape_i_c_code(typ, code, check_input, version=()):
    """ Tell Shape_i how to generate C code for a Theano Type
    :param typ: A Theano type. It must be the Theano class itself and not an
@@ -401,13 +395,14 @@ def register_shape_i_c_code(typ, code, version=()):
                 variable names respectively.
    :param version: A number indicating the version of the code, for cache.
    """
-    Shape_i.c_code_and_version[typ] = (code, version)
+    Shape_i.c_code_and_version[typ] = (code, check_input, version)
 # List of Theano Types that one can add an extra dimension and for which
 # Scan can deal with.
 expandable_types = ()
 class FromFunctionOp(gof.Op):
    """
    Build a basic Theano Op around a function.

--- a/theano/compile/profiling.py
+++ b/theano/compile/profiling.py
@@ -342,10 +342,10 @@ class ProfileStats(object):
        es += ['   %2s ']
        hs += ['<#call>']
-        es += ['  %5d  ']
+        es += ['%6d  ']
        hs += ['<#apply>']
-        es += ['  %4d  ']
+        es += [' %4d  ']
        upto_length = numpy.sum([len(x) for x in hs]) + len(hs)
        maxlen = self.line_width - upto_length
@@ -587,6 +587,7 @@ class ProfileStats(object):
                print >> file, '  Time in thunks: %es (%.3f%%)' % (
                        local_time, 100*local_time / self.fct_call_time)
        print >> file, '  Total compile time: %es' % self.compile_time
+        print >> file, '    Number of Apply nodes: %s' % len(self.apply_time)
        print >> file, '    Theano Optimizer time: %es' % self.optimizer_time
        print >> file, '       Theano validate time: %es' % self.validate_time
        print >> file, ('    Theano Linker time (includes C,'

--- a/theano/printing.py
+++ b/theano/printing.py
@@ -783,7 +783,7 @@ def pydotprint(fct, outfile=None,
            elif var.name or not compact:
                g.add_edge(pd.Edge(astr, varstr, label=label))
 #            else:
-            #don't add egde here as it is already added from the inputs.
+            # don't add edge here as it is already added from the inputs.
    if cond_highlight:
        g.add_subgraph(c1)
@@ -863,8 +863,8 @@ def pydotprint_variables(vars,
                dstr = dstr[:dstr.index('\n')]
            varstr = '%s %s' % (dstr, str(var.type))
        else:
-            #a var id is needed as otherwise var with the same type will be
+            # a var id is needed as otherwise var with the same type will be
-            #merged in the graph.
+            # merged in the graph.
            varstr = str(var.type)
        varstr += ' ' + str(len(var_str))
@@ -1090,8 +1090,6 @@ def min_informative_str(obj, indent_level=0,
    return rval
 def var_descriptor(obj, _prev_obs=None, _tag_generator=None):
    """
    Returns a string, with no endlines, fully specifying
@@ -1154,6 +1152,7 @@ def var_descriptor(obj, _prev_obs=None, _tag_generator=None):
    return rval
 def position_independent_str(obj):
    if isinstance(obj, theano.gof.graph.Variable):
        rval = 'theano_var'

--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -86,6 +86,29 @@ register_opt()(theano.tensor.opt.local_track_shape_i)
 register_opt(name='gpu_constant_folding')(
    tensor.opt.constant_folding)
+# This is a partial list of CPU ops that can, in some circumstances, be
+# moved to the GPU. This list is used by an optimization.
+# Hopefully, we can keep this list up to date.
+import theano.tensor.signal.downsample
+import theano.sandbox.neighbours
+cpu_ops_moved_to_gpu = [
+    tensor.blas.Dot22, tensor.blas.Dot22Scalar, tensor.blas.Gemm,
+    tensor.blas.Gemv, tensor.blas.Ger, tensor.nnet.conv.ConvOp,
+    tensor.signal.downsample.DownsampleFactorMax,
+    tensor.signal.downsample.DownsampleFactorMaxGrad,
+    theano.sandbox.neighbours.Images2Neibs,
+    tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias,
+    tensor.nnet.CrossentropySoftmax1HotWithBiasDx,
+    tensor.nnet.Softmax, tensor.nnet.SoftmaxWithBias,
+    tensor.Elemwise, tensor.DimShuffle, tensor.CAReduce,
+    tensor.elemwise.All, tensor.elemwise.Any,
+    tensor.elemwise.CAReduceDtype, tensor.elemwise.Sum,
+    tensor.elemwise.Prod, tensor.elemwise.ProdWithoutZeros,
+    tensor.Reshape, tensor.Flatten, tensor.Subtensor,
+    tensor.AdvancedSubtensor1, tensor.AdvancedIncSubtensor1,
+    tensor.IncSubtensor, tensor.Shape, tensor.Join,
+    tensor.Alloc, tensor.Eye]
 class InputToGpuOptimizer(Optimizer):
    """
@@ -617,7 +640,33 @@ def local_gpu_careduce(node):
        if isinstance(node.op.scalar_op, (scal.Add, scal.Mul,
                                          scal.Maximum, scal.Minimum)):
            x, = node.inputs
+            replace = False
            if x.owner and isinstance(x.owner.op, HostFromGpu):
+                replace = True
+            elif (all([c != "output" and isinstance(c.op, GpuFromHost)
+                      for c, i in node.outputs[0].clients])
+                  and x.owner and x.owner.op.__class__ in
+                  cpu_ops_moved_to_gpu):
+                # It is not always good to transfer the reduction to
+                # the GPU when the clients are on the GPU but the
+                # reduction input is not. It means we will transfer the
+                # (bigger) input to the GPU instead of the (smaller)
+                # output if we stop optimizing there. Most of the
+                # time, we will also move to the GPU whatever created
+                # the input of the reduction. In that case, we don't
+                # introduce a bigger transfer. It is hard to know
+                # whether, after all optimizations, we will do the
+                # bigger transfer or not, so we use a heuristic: we
+                # suppose that if the input of the reduction is
+                # generated by an op that can in some cases be moved
+                # to the GPU, then it will be moved. If some CPU ops
+                # are supported on the GPU only in some cases, this
+                # will move the reduction to the GPU when it wasn't a
+                # good idea.
+                replace = True
+            if replace:
                if node.op.axis is None:
                    reduce_mask = [1] * x.type.ndim
                else:

--- a/theano/sandbox/cuda/type.py
+++ b/theano/sandbox/cuda/type.py
@@ -454,12 +454,22 @@ theano.compile.register_view_op_c_code(
        """,
        version=1)
-theano.compile.register_shape_i_c_code(CudaNdarrayType, """
+theano.compile.register_shape_i_c_code(
+    CudaNdarrayType,
+    """
    if(!%(oname)s)
        %(oname)s=(PyArrayObject*)PyArray_ZEROS(0, NULL, NPY_INT64, 0);
    ((npy_int64*)PyArray_DATA(%(oname)s))[0] =
                              CudaNdarray_HOST_DIMS(%(iname)s)[%(i)s];
-""", version=(0,))
+    """,
+    """
+    if (%(i)s>=CudaNdarray_NDIM(%(iname)s)){
+        PyErr_SetString(PyExc_TypeError,
+            "Number of dimensions lower than expected");
+        %(fail)s
+    }
+    """,
+    version=(1,))
 # Register CudaNdarrayType to the DeepCopyOp list of types with c code.
 theano.compile.register_deep_copy_op_c_code(

--- a/theano/sandbox/gpuarray/opt.py
+++ b/theano/sandbox/gpuarray/opt.py
@@ -20,7 +20,7 @@ from theano.gof.python25 import all, any
 from theano.tensor.nnet.conv import ConvOp
 from theano.sandbox.gpuarray.type import GpuArrayType
 from theano.sandbox.gpuarray.basic_ops import (
-    host_from_gpu, gpu_from_host, HostFromGpu,
+    host_from_gpu, gpu_from_host, HostFromGpu, GpuSplit,
    gpu_alloc, GpuAlloc, GpuReshape, GpuEye, gpu_join, GpuJoin,
 )
 from theano.sandbox.gpuarray.blas import gpu_dot22, GpuGemv, GpuGemm, GpuGer
@@ -316,6 +316,7 @@ def local_gpuajoin_1(node):
        len(node.inputs) == 2):
        return [node.inputs[1]]
 @register_opt()
 @op_lifter([tensor.Split])
 def local_gpua_split(node):
@@ -334,7 +335,7 @@ def local_gpua_incsubtensor(node):
    return GpuIncSubtensor(node.op.idx_list, node.op.inplace,
                           node.op.set_instead_of_inc,
                           node.op.destroyhandler_tolerate_aliased)
 @register_opt()
 @op_lifter([tensor.AdvancedIncSubtensor1])
@@ -371,8 +372,8 @@ def local_gpua_careduce(node):
            dtype=getattr(node.op, 'dtype', None),
            acc_dtype=getattr(node.op, 'acc_dtype', None))
        gvar = greduce(x)
-        #We need to have the make node called, otherwise the mask can
+        # We need to have the make node called, otherwise the mask can
-        #be None
+        # be None
        if gvar.owner.op.supports_c_code([gpu_from_host(x)]):
            return greduce
        else:
@@ -406,7 +407,7 @@ def local_gpua_careduce(node):
            for idx, m in enumerate(new_mask):
                if m == 1:
                    new_axis.append(idx)
-            new_greduce = GpuCAReduceCuda(
+            greduce = GpuCAReduceCuda(
                node.op.scalar_op,
                axis=new_axis, reduce_mask=new_mask,
                dtype=getattr(node.op, 'dtype', None),
@@ -415,12 +416,12 @@ def local_gpua_careduce(node):
            reshaped_x = x.reshape(tensor.stack(*new_in_shp))
            gpu_reshaped_x = gpu_from_host(reshaped_x)
            gvar = greduce(gpu_reshaped_x)
-            #We need to have the make node called, otherwise the mask can
+            # We need to have the make node called, otherwise the mask can
-            #be None
+            # be None
            reshaped_gpu_inputs = [gpu_reshaped_x]
-            if new_greduce.supports_c_code(reshaped_gpu_inputs):
+            if greduce.supports_c_code(reshaped_gpu_inputs):
                reduce_reshaped_x = host_from_gpu(
-                    new_greduce(gpu_reshaped_x))
+                    greduce(gpu_reshaped_x))
                if reduce_reshaped_x.ndim != node.outputs[0].ndim:
                    unreshaped_reduce = reduce_reshaped_x.reshape(
@@ -497,8 +498,8 @@ def local_gpu_conv(node):
        if op.kshp_logical is not None and op.kshp_logical != op.kshp:
            return None
-        #print op.kshp, op.imshp[1:3]
+        # print op.kshp, op.imshp[1:3]
-        #print op.kshp_logical, logical_img_hw
+        # print op.kshp_logical, logical_img_hw
        ret = GpuConv(border_mode=op.out_mode,
                      subsample=(op.dx, op.dy),
                      logical_img_hw=logical_img_hw,
@@ -508,12 +509,12 @@ def local_gpu_conv(node):
                      version=op.version,
                      verbose=op.verbose,
                      imshp=op.imshp,
-                  )
+        )
        if op.imshp_logical is not None:
            logical_img_hw = op.imshp_logical[1:3]
            if logical_img_hw != op.imshp[1:3]:
                # this case is not implemented
-                #return None
+                # return None
                rstride = int(numpy.ceil(op.imshp_logical[1] /
                                         float(op.imshp[1])))
                cstride = int(numpy.ceil(op.imshp_logical[2] /
@@ -542,7 +543,7 @@ def local_gpu_conv(node):
        assert a.ndim == 4
        atol = None
        if a.shape[-1] * a.shape[-2] > 100:
-            #For float32 the default atol is 1e-5
+            # For float32 the default atol is 1e-5
            atol = 3e-5
        return GpuArrayType.values_eq_approx(a, b, atol=atol)
@@ -557,7 +558,7 @@ def local_gpu_conv(node):
    out = tensor.patternbroadcast(
        host_from_gpu(out),
        node.outputs[0].broadcastable)
-    #op_lifter want the output on the GPU.
+    # op_lifter want the output on the GPU.
    out = gpu_from_host(out)
    out.values_eq_approx = values_eq_approx
    return [out]

--- a/theano/sandbox/gpuarray/tests/test_basic_ops.py
+++ b/theano/sandbox/gpuarray/tests/test_basic_ops.py
@@ -356,6 +356,17 @@ class G_Join_and_Split(T_Join_and_Split):
        self.hide_error = theano.config.mode not in ['DebugMode', 'DEBUG_MODE']
        self.shared = gpuarray_shared_constructor
+    def test_gpusplit_opt(self):
+        rng = numpy.random.RandomState(seed=utt.fetch_seed())
+        m = self.shared(rng.rand(4, 6).astype(self.floatX))
+        o = T.Split(2)(m, 0, [2, 2])
+        f = theano.function([], o, mode=self.mode)
+        assert any([isinstance(node.op, self.split_op)
+                    for node in f.maker.fgraph.toposort()])
+        o1, o2 = f()
+        assert numpy.allclose(o1, m.get_value(borrow=True)[:2])
+        assert numpy.allclose(o2, m.get_value(borrow=True)[2:])
 def test_gpujoin_gpualloc():
    a = T.fmatrix('a')

--- a/theano/sandbox/gpuarray/type.py
+++ b/theano/sandbox/gpuarray/type.py
@@ -315,12 +315,22 @@ theano.compile.register_shape_c_code(
    """,
    version=1)
-theano.compile.register_shape_i_c_code(GpuArrayType, """
+theano.compile.register_shape_i_c_code(
+    GpuArrayType,
+    """
    if(!%(oname)s)
        %(oname)s=(PyArrayObject*)PyArray_ZEROS(0, NULL, NPY_INT64, 0);
    ((npy_int64*)PyArray_DATA(%(oname)s))[0] =
                              %(iname)s->ga.dimensions[%(i)s];
-""", version=(0,))
+    """,
+    """
+    if (%(i)s>=%(iname)s->ga.nd){
+        PyErr_SetString(PyExc_TypeError,
+            "Number of dimensions lower than expected");
+        %(fail)s
+    }
+    """,
+    version=(1,))
 theano.compile.register_deep_copy_op_c_code(GpuArrayType, """
    Py_XDECREF(%(oname)s);
@@ -331,11 +341,11 @@ theano.compile.register_deep_copy_op_c_code(GpuArrayType, """
 theano.compile.register_rebroadcast_c_code(
    GpuArrayType,
    """
-    if(PyGpuArray_DIMS(%(iname)s)[%(axis)s] != 1){
+    if(%(iname)s->ga.dimensions[%(axis)s] != 1){
        PyErr_Format(PyExc_ValueError,
            "Dimension %(axis)s in Rebroadcast's input was"
            " supposed to be 1 (got %%d instead)",
-            PyGpuArray_DIMS(%(iname)s)[%(axis)s]);
+            %(iname)s->ga.dimensions[%(axis)s]);
        %(fail)s
    }
    """,

--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
@@ -754,6 +754,9 @@ class ShapeFeature(object):
    def shape_tuple(self, r):
        """Return a tuple of symbolic shape vars for tensor variable r"""
+        if not hasattr(r, 'ndim'):
+            # This happens for NoneConst.
+            return None
        return tuple([self.shape_ir(i, r) for i in xrange(r.ndim)])
    def default_infer_shape(self, node, i_shapes):
@@ -782,7 +785,9 @@ class ShapeFeature(object):
            # don't make the optimizer merge a zillion ones together
            # by always returning the same object to represent 1
            return self.lscalar_one
-        if type(s_i) in (int, long) or isinstance(s_i, numpy.integer):
+        if (type(s_i) in (int, long) or
+            isinstance(s_i, numpy.integer) or
+            (isinstance(s_i, numpy.ndarray) and s_i.ndim == 0)):
            # this shape is a constant
            assert s_i >= 0
            return T.constant(s_i, dtype='int64')

--- a/theano/tensor/tests/test_basic.py
+++ b/theano/tensor/tests/test_basic.py
@@ -3246,7 +3246,7 @@ class T_Join_and_Split(unittest.TestCase):
 #        assert tensor.grad(join(1,a,b), a
        utt.verify_grad(lambda a, b: join(1, a, b), [av, bv],
-                        eps=1.0e-4, rel_tol=1.0e-3)
+                        eps=1.0e-4, rel_tol=1.0e-3, mode=self.mode)
    def test_join_matrix1_using_vertical_stack(self):
        a = self.shared(numpy.array([[1, 2, 3], [4, 5, 6]], dtype=self.floatX))
@@ -3272,7 +3272,7 @@ class T_Join_and_Split(unittest.TestCase):
        self.assertTrue((out == want).all())
        utt.verify_grad(lambda a, b: join(1, a, b), [av, bv],
-                        eps=1.0e-4, rel_tol=1.0e-3)
+                        eps=1.0e-4, rel_tol=1.0e-3, mode=self.mode)
    def test_join_matrixV(self):
        """variable join axis"""
@@ -3294,8 +3294,8 @@ class T_Join_and_Split(unittest.TestCase):
        got = f(1)
        self.assertTrue((got == want).all(), (got, want))
-        utt.verify_grad(lambda a, b: join(0, a, b), [v, 2 * v])
+        utt.verify_grad(lambda a, b: join(0, a, b), [v, 2 * v], mode=self.mode)
-        utt.verify_grad(lambda a, b: join(1, a, b), [v, 2 * v])
+        utt.verify_grad(lambda a, b: join(1, a, b), [v, 2 * v], mode=self.mode)
    def test_vector_len(self):
        x = lscalar('x')
@@ -3344,7 +3344,8 @@ class T_Join_and_Split(unittest.TestCase):
        assert [True for node in topo if isinstance(node.op, self.join_op)]
        f()
-        utt.verify_grad((lambda a, b: join(1, a, b)), [a_val, b_val], rng=rng)
+        utt.verify_grad((lambda a, b: join(1, a, b)), [a_val, b_val], rng=rng,
+                        mode=self.mode)
        # Should raise an error if dimension 0 does not match
        a.set_value(rng.rand(2, 4, 1).astype(self.floatX))
@@ -3370,7 +3371,8 @@ class T_Join_and_Split(unittest.TestCase):
        assert [True for node in topo if isinstance(node.op, self.join_op)]
        f()
-        utt.verify_grad((lambda a, b: join(0, a, b)), [a_val, b_val], rng=rng)
+        utt.verify_grad((lambda a, b: join(0, a, b)), [a_val, b_val], rng=rng,
+                        mode=self.mode)
        # Should raise an error if b_val.shape[0] is not 1
        # We can't set the value|
        self.assertRaises(TypeError, b.set_value,
@@ -3402,7 +3404,8 @@ class T_Join_and_Split(unittest.TestCase):
        assert [True for node in topo if isinstance(node.op, self.join_op)]
        f()
-        utt.verify_grad((lambda a, b: join(0, a, b)), [a_val, b_val], rng=rng)
+        utt.verify_grad((lambda a, b: join(0, a, b)), [a_val, b_val], rng=rng,
+                        mode=self.mode)
    def test_broadcastable_single_input_broadcastable_dimension(self):
        # Test that all broadcastable flags are preserved by a
@@ -3422,7 +3425,8 @@ class T_Join_and_Split(unittest.TestCase):
                node.op, self.join_op)]
        f()
-        utt.verify_grad((lambda a: join(0, a)), [a_val], rng=rng)
+        utt.verify_grad((lambda a: join(0, a)), [a_val], rng=rng,
+                        mode=self.mode)
        # Should raise an error if length of dimension 0 is not 1
        self.assertRaises(TypeError, a.set_value,
                          rng.rand(2, 4, 1).astype(self.floatX))
@@ -3458,7 +3462,8 @@ class T_Join_and_Split(unittest.TestCase):
        e_val = rng.rand(1, 1, 1, 1, 2, 1).astype(self.floatX)
        f(a_val, b_val, c_val, d_val, e_val)
        utt.verify_grad((lambda a, b, c, d, e: join(0, a, b, c, d, e)),
-                        [a_val, b_val, c_val, d_val, e_val], rng=rng)
+                        [a_val, b_val, c_val, d_val, e_val], rng=rng,
+                        mode=self.mode)
        # Should raise an error if length of dimension 0 is not 1
        bad_val = rng.rand(2, 1, 1, 1, 2, 1).astype(self.floatX)
        self.assertRaises(TypeError, f, bad_val, b_val, c_val, d_val, e_val)

--- a/theano/tensor/type.py
+++ b/theano/tensor/type.py
@@ -646,7 +646,14 @@ theano.compile.register_shape_i_c_code(
            %(oname)s=(PyArrayObject*)PyArray_EMPTY(0, NULL, NPY_INT64, 0);
        ((npy_int64*)PyArray_DATA(%(oname)s))[0]=PyArray_DIMS(%(iname)s)[%(i)s];
        """,
-        version=2)
+        """
+        if (%(i)s>=PyArray_NDIM(%(iname)s)){
+            PyErr_SetString(PyExc_TypeError,
+                "Number of dimensions lower than expected");
+            %(fail)s
+        }
+        """,
+        version=3)
 # Register TensorType C code for DeepCopyOp
 theano.compile.register_deep_copy_op_c_code(