Merge pull request #4463 from nouiz/scan_reintroduced_benchmark

Scan reintroduced benchmark

Merge pull request #4463 from nouiz/scan_reintroduced_benchmark
8c58dfb8 · Frédéric Bastien · f512a560 · a312daf1 · 8c58dfb8 · 8c58dfb8
--- a/doc/library/config.txt
+++ b/doc/library/config.txt
@@ -239,6 +239,14 @@ import theano and print the config variable, as in:
    ``False``, then we will gc the inner of scan after all
    iterations. This is the default.
+.. attribute:: config.scan.debug
+    Bool value, either ``True`` or ``False``
+    Default: ``False``
+    If True, we will print extra scan debug information.
 .. attribute:: openmp
    Bool value: either True or False
@@ -995,3 +1003,17 @@ import theano and print the config variable, as in:
    Bool value, default: False
    If set to True, will preload the C module cache at import time
+.. attribute:: config.traceback.limit
+    Int value, default: 8
+    The number of user stack levels to keep for variables.
+.. attribute:: config.traceback.compile_limit
+    Int value, default: 0
+    The number of user stack levels to keep for variables during Theano
+    compilation. If higher than 0, Theano's internal stack trace is
+    kept as well.
--- a/theano/compile/function_module.py
+++ b/theano/compile/function_module.py
@@ -1492,7 +1492,7 @@ class FunctionMaker(object):
                # optimize the fgraph
                theano.config.compute_test_value = \
                    theano.config.compute_test_value_opt
-                theano.config.traceback.limit = 0
+                theano.config.traceback.limit = theano.config.traceback.compile_limit
                start_optimizer = time.time()
                # now optimize the graph
@@ -1683,7 +1683,7 @@ class FunctionMaker(object):
        start_import_time = theano.gof.cmodule.import_time
        limit_orig = theano.config.traceback.limit
        try:
-            theano.config.traceback.limit = 0
+            theano.config.traceback.limit = theano.config.traceback.compile_limit
            _fn, _i, _o = self.linker.make_thunk(
                input_storage=input_storage_lists, storage_map=storage_map)
        finally:

--- a/theano/configdefaults.py
+++ b/theano/configdefaults.py
@@ -573,6 +573,17 @@ AddConfigVar(
    IntParam(8),
    in_c_key=False)
+AddConfigVar(
+    'traceback.compile_limit',
+    "The number of stack to trace to keep during compilation. -1 mean all."
+    " If greater then 0, will also make us save Theano internal stack trace.",
+    IntParam(0),
+    in_c_key=False)
+AddConfigVar('experimental.mrg',
+             "Another random number generator that work on the gpu",
+             BoolParam(False))
 AddConfigVar('experimental.unpickle_gpu_on_cpu',
             "Allow unpickling of pickled CudaNdarrays as numpy.ndarrays."
             "This is useful, if you want to open a CudaNdarray without "
@@ -1417,6 +1428,11 @@ AddConfigVar('scan.allow_output_prealloc',
             BoolParam(True),
             in_c_key=False)
+AddConfigVar('scan.debug',
+             "If True, enable extra verbose output related to scan",
+             BoolParam(False),
+             in_c_key=False)
 AddConfigVar('pycuda.init',
             """If True, always initialize PyCUDA when Theano want to
                initilize the GPU.  Currently, we must always initialize

--- a/theano/gof/fg.py
+++ b/theano/gof/fg.py
@@ -472,7 +472,7 @@ class FunctionGraph(utils.object2):
        self.execute_callbacks('on_change_input', node, i,
                               r, new_r, reason=reason)
        if prune:
-            self.__remove_clients__(r, [], True)
+            self.__remove_clients__(r, [], True, reason=reason)
    # replace #
    def replace(self, r, new_r, reason=None, verbose=None):

--- a/theano/gof/opt.py
+++ b/theano/gof/opt.py
@@ -2553,7 +2553,7 @@ def pre_greedy_local_optimizer(list_optimizations, out):
        for opt in list_opt:
            ret = opt.transform(node)
            if ret is not False and ret is not None:
-                assert len(ret) == len(node.outputs)
+                assert len(ret) == len(node.outputs), opt
                for k, v in zip(node.outputs, ret):
                    optimized_vars[k] = v
                results = ret

--- a/theano/gof/toolbox.py
+++ b/theano/gof/toolbox.py
@@ -304,6 +304,9 @@ class ReplaceValidate(History, Validator):
        chk = fgraph.checkpoint()
        if verbose is None:
            verbose = config.optimizer_verbose
+        if config.scan.debug:
+            scans = [n for n in fgraph.apply_nodes if isinstance(n.op, theano.scan_module.scan_op.Scan)]
        for r, new_r in replacements:
            try:
                fgraph.replace(r, new_r, reason=reason, verbose=False)
@@ -337,6 +340,14 @@ class ReplaceValidate(History, Validator):
            if verbose:
                print("validate failed on node %s.\n Reason: %s, %s" % (r, reason, e))
            raise
+        if config.scan.debug:
+            scans2 = [n for n in fgraph.apply_nodes if isinstance(n.op, theano.scan_module.scan_op.Scan)]
+            nb = len(scans)
+            nb2 = len(scans2)
+            if nb2 > nb:
+                print("Extra scan introduced", nb, nb2, getattr(reason, 'name', reason), r, new_r)
+            elif nb2 < nb:
+                print("Scan removed", nb, nb2, getattr(reason, 'name', reason), r, new_r)
        if verbose:
            print(reason, r, new_r)
        # The return is needed by replace_all_validate_remove

--- a/theano/gof/utils.py
+++ b/theano/gof/utils.py
@@ -102,6 +102,9 @@ def add_tag_trace(thing, user_line=None):
             "theano/sparse/", "theano\\sparse\\",
             "theano/typed_list/", "theano\\typed_list\\"]
+    if config.traceback.compile_limit > 0:
+        skips = []
    tr = simple_extract_stack(limit=user_line, skips=skips)
    # Different python version use different sementic for
    # limit. python 2.7 include the call to extrack_stack. The -1 get

--- a/theano/gpuarray/__init__.py
+++ b/theano/gpuarray/__init__.py
@@ -42,10 +42,11 @@ register_transfer(transfer)
 def init_dev(dev, name=None):
    v = pygpu.gpuarray.api_version()
-    if v[0] != -9998:
+    expected = -9998
+    if v[0] != expected:
        raise RuntimeError("Wrong major API version for gpuarray:", v[0],
                           "Make sure Theano and libgpuarray/pygpu "
-                           "are in sync.")
+                           "are in sync. Expected", expected)
    if v[1] < 0:
        raise RuntimeError("Wrong minor API version for gpuarray:", v[1],
                           "Please update libgpuarray/pygpu.")

--- a/theano/sandbox/cuda/dnn_fwd.c
+++ b/theano/sandbox/cuda/dnn_fwd.c
@@ -159,6 +159,36 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
      chosen_algo = CONV_ALGO;
    }
+    if (0){
+      char * a;
+      switch(chosen_algo){
+      case CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM:
+	a = "implicit gemm (0)";
+	break;
+      case CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM:
+	a = "precomp gemm (1)";
+	break;
+      case CUDNN_CONVOLUTION_FWD_ALGO_GEMM:
+	a = "gemm (2)";
+	break;
+      case CUDNN_CONVOLUTION_FWD_ALGO_DIRECT:
+	a = "direct (3)";
+	break;
+      case CUDNN_CONVOLUTION_FWD_ALGO_FFT:
+	a = "fft (4)";
+	break;
+      case CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING:
+	a = "fft tiling (5)";
+	break;
+#if CUDNN_VERSION > 5000
+      case CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD:
+	a = "winograd (6)";
+	break;
+#endif
+      }
+      printf("GpuDNNConv: algo %s\n", a);
+    }
    // The FFT implementation (only in V3 and onward) does not support strides,
    // 1x1 filters or inputs with a spatial dimension larger than 1024.
    // The tiled-FFT implementation (only in V4 onward) does not support

--- a/theano/sandbox/cuda/dnn_gi.c
+++ b/theano/sandbox/cuda/dnn_gi.c
@@ -158,6 +158,30 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
        chosen_algo = CONV_ALGO;
    }
+    if (0){
+      char * a;
+      switch(chosen_algo){
+      case CUDNN_CONVOLUTION_BWD_DATA_ALGO_0:
+	a = "implicit gemm (0)";
+	break;
+      case CUDNN_CONVOLUTION_BWD_DATA_ALGO_1:
+	a = "precomp gemm (1)";
+	break;
+      case CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT:
+	a = "fft (2)";
+	break;
+      case CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING:
+	a = "fft tiling (3)";
+	break;
+#if CUDNN_VERSION > 5000
+      case CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD:
+	a = "winograd (4)";
+	break;
+#endif
+      }
+      printf("GpuDNNConvGI: algo %s\n", a);
+    }
    // The FFT implementation (only in V3 and onward) does not support strides,
    // 1x1 filters or inputs with a spatial dimension larger than 1024.
    // The tiled-FFT implementation (only in V4 onward) does not support

--- a/theano/sandbox/cuda/dnn_gw.c
+++ b/theano/sandbox/cuda/dnn_gw.c
@@ -158,6 +158,25 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
        chosen_algo = CONV_ALGO;
    }
+    if (0){
+      char * a;
+      switch(chosen_algo){
+      case CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0:
+	a = "algo 0 (0)";
+	break;
+      case CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1:
+	a = "algo 1 (1)";
+	break;
+      case CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT:
+	a = "fft (2)";
+	break;
+      case CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3:
+	a = "algo 3 (3)";
+	break;
+      }
+      printf("GpuDNNConvGW: algo %s\n", a);
+    }
    // The FFT implementation (only in v3 and onward) does not support strides,
    // 1x1 filters or inputs with a spatial dimension larger than 1024.
    // If the chosen implementation is FFT, validate that it can be used

--- a/theano/tensor/basic.py
+++ b/theano/tensor/basic.py
@@ -4464,14 +4464,15 @@ class Reshape(Op):
            return [requ]
        else:
            new_dims = [node.inputs[1][i] for i in xrange(self.ndim)]
-            # since new_dims has one negative value (-1), the
+            # since new_dims can have a negative value (-1), the
            # multiplication of all values should be negated
            # to give a positive value.
            # To avoid optimization complexity, we avoid checking
            # for the case when there are two or more '-1' values.
+            if self.ndim:
+                rest_size = (mul(*ishapes[0]) // -mul(*new_dims))
            return [tuple([switch(eq(new_dims[i], -1),
-                                  theano.tensor.mul(*ishapes[0]) //
+                                  rest_size,
-                                  (-theano.tensor.mul(*new_dims)),
                                  new_dims[i])
                           for i in xrange(self.ndim)])]

--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
@@ -1512,8 +1512,8 @@ def local_elemwise_alloc_op(ElemwiseOP, AllocOP, DimShuffleOP):
                # when i.owner.inputs[0].type == i.owner.outputs[0].type we
                # will remove that alloc later
                assert i.type.ndim == cmp_op.ndim
-                get_shape = node.fgraph.shape_feature.get_shape
                if theano.config.experimental.local_alloc_elemwise_assert:
+                    get_shape = node.fgraph.shape_feature.get_shape
                    cond = []
                    for idx in xrange(i.type.ndim):
                        if (not i.type.broadcastable[idx] and
@@ -1731,7 +1731,7 @@ compile.optdb.register('local_alloc_empty_to_zeros',
 @register_specialize
 @register_canonicalize
-@gof.local_optimizer([T.shape])
+@gof.local_optimizer([T.Shape])
 def local_shape_to_shape_i(node):
    if node.op == T.shape:
        # This optimization needs ShapeOpt and fgraph.shape_feature
@@ -4759,6 +4759,10 @@ def local_useless_elemwise_comparison(node):
    Elemwise[LT](add([anything that is shapes]), 0) -> Elemwise[zeros](X)
    Elemwise[GE](add([anything that is shapes]), 0) -> Elemwise[ones](X)
+    # Shapes are never negative
+    # Needed by Reshape.infer_shape
+    Elemwise[EQ](Subtensor(Shape(x)), -N) -> Elemwise[zeros](X)
    """
    if not isinstance(node.op, T.Elemwise):
        return
@@ -4834,6 +4838,41 @@ def local_useless_elemwise_comparison(node):
       T.extract_constant(node.inputs[1], only_process_constants=True) == 0:
        return [T.ones_like(node.inputs[0], dtype=node.outputs[0].dtype)]
+    # Elemwise[EQ](Subtensor(Shape(x)), -N)
+    # Elemwise[EQ](some graph that only depends on shapes, -N)
+    # TODO: handle the case where the -N is on either side
+        """
+ |Elemwise{eq,no_inplace} [id B] ''
+ | |Subtensor{int64} [id C] ''
+ | | |Join [id D] ''
+ | | | |TensorConstant{0} [id E]
+ | | | |Subtensor{int64:int64:} [id F] ''
+ | | | | |Shape [id G] ''
+        """
+    def investigate(node):
+        " Return True if values will be shapes, so >= 0"
+        if isinstance(node.op, (T.Shape, Shape_i)):
+            return True
+        elif isinstance(node.op, Subtensor) and node.inputs[0].owner:
+            return investigate(node.inputs[0].owner)
+        elif isinstance(node.op, T.Join):
+            return all(v.owner and
+                       investigate(v.owner) for v in node.inputs[1:])
+        elif isinstance(node.op, MakeVector):
+            return all(v.owner and
+                       investigate(v.owner) for v in node.inputs)
+    if (isinstance(node.op.scalar_op, scalar.EQ) and
+            node.inputs[0].owner and
+            investigate(node.inputs[0].owner)):
+        try:
+            cst = get_scalar_constant_value(node.inputs[1],
+                                            only_process_constants=True)
+            if cst < 0:
+                return [T.zeros_like(node.inputs[0],
+                                     dtype=node.outputs[0].dtype)]
+        except NotScalarConstantError:
+            pass
    return

--- a/theano/tensor/tests/test_opt.py
+++ b/theano/tensor/tests/test_opt.py
@@ -3409,7 +3409,7 @@ class Test_local_useless_elemwise_comparison(unittest.TestCase):
                                     sequences=[X],
                                     non_sequences=None)
        Z = X_sum + Y
-        theano.printing.debugprint(Z)
+        # theano.printing.debugprint(Z)
        # here is the output for the debug print:
        """
        Elemwise{add,no_inplace} [id A] ''
@@ -3436,7 +3436,7 @@ class Test_local_useless_elemwise_comparison(unittest.TestCase):
        mode = theano.compile.get_default_mode().excluding('fusion')
        f = theano.function([X, Y], Z, mode=mode)
-        theano.printing.debugprint(f, print_type=True)
+        # theano.printing.debugprint(f, print_type=True)
        # here is the output for the debug print:
        """
        Elemwise{Add}[(0, 0)] [id A] <TensorType(float64, vector)> ''   7
@@ -3465,14 +3465,19 @@ class Test_local_useless_elemwise_comparison(unittest.TestCase):
         > |X[t] [id O] <TensorType(float64, vector)> -> [id E]
        """
-    def assert_eqs_const(self, f, val):
+    def assert_eqs_const(self, f, val, op=deep_copy_op):
        topo = f.maker.fgraph.toposort()
        elem = topo[0]
        assert len(topo) == 1, topo
-        assert elem.op == deep_copy_op, elem.op
+        assert elem.op == op, elem.op
-        assert len(elem.inputs) == 1, elem.inputs
+        if op == deep_copy_op:
-        assert isinstance(elem.inputs[0], T.TensorConstant), elem
+            assert len(elem.inputs) == 1, elem.inputs
-        assert T.extract_constant(elem.inputs[0]) == val, val
+            assert isinstance(elem.inputs[0], T.TensorConstant), elem
+            assert T.extract_constant(elem.inputs[0]) == val, val
+        else:
+            assert len(elem.inputs) == 2, elem.inputs
+            assert isinstance(elem.inputs[0], T.TensorConstant), elem
+            assert T.extract_constant(elem.inputs[0]) == val, val
    def assert_identity(self, f):
        topo = f.maker.fgraph.toposort()
@@ -3552,6 +3557,33 @@ class Test_local_useless_elemwise_comparison(unittest.TestCase):
        f = theano.function([x, y], T.ge(x.shape[0]+y.shape[0], 0), mode=mode)
        self.assert_eqs_const(f, 1)
+    def test_equality_shapes(self):
+        # Test equality where one side contains only shape-related
+        # stuff.
+        if theano.config.mode == "FAST_COMPILE":
+            raise SkipTest("Skip opt test as the opt is disabled")
+        x = T.vector('x', dtype=config.floatX)
+        for g in [x.shape[0],
+                  Shape_i(0)(x)]:
+            f = theano.function([x], T.eq(g, 0))
+            assert f([3, 3]) == 0
+            assert f([]) == 1
+            f = theano.function([x], T.eq(g, -1))
+            self.assert_eqs_const(f, 0)
+            assert f([3, 3]) == 0
+        g = join(0,
+                 x.shape[0:],  # todo test reshape, dimshuffle
+                 x.shape[0:1])
+        f = theano.function([x], T.eq(g, 0))
+        assert (f([3, 3]) == 0).all()
+        assert (f([]) == 1).all()
+        f = theano.function([x], T.eq(g, -1))
+        self.assert_eqs_const(f, 0, op=T.alloc)
+        assert (f([3, 3]) == 0).all()
    def test_and(self):
        mode = theano.compile.get_default_mode().including('canonicalize')

--- a/theano/tensor/var.py
+++ b/theano/tensor/var.py
@@ -908,8 +908,9 @@ class TensorConstant(_tensor_py_operators, Constant):
        return TensorConstantSignature((self.type, self.data))
    def equals(self, other):
-        # Override Contant.equals to allow to compare with numpy.ndarray
+        # Override Constant.equals to allow comparing with
-        if isinstance(other, numpy.ndarray):
+        # numpy.ndarray and Python scalars (int, float).
+        if isinstance(other, (numpy.ndarray, int, float)):
            # Make a TensorConstant to be able to compare
            other = theano.tensor.basic.constant(other)
        return (isinstance(other, TensorConstant) and