Merge pull request #1931 from nouiz/eq_computation

Crash fix equal_computations() again and scan opt speed up.

Merge pull request #1931 from nouiz/eq_computation
7af51a52 · abergeron · e493985e · b6672d24 · 7af51a52 · 7af51a52
--- a/doc/tutorial/multi_cores.txt
+++ b/doc/tutorial/multi_cores.txt
@@ -20,6 +20,12 @@ The most frequent way to control the number of threads used is via the
 threads you want to use before starting the Python process. Some BLAS
 implementations support other environment variables.
+To test whether your BLAS supports OpenMP/multiple cores, you can run the theano/misc/check_blas.py script from the command line like this::
+    OMP_NUM_THREADS=1 python theano/misc/check_blas.py -q
+    OMP_NUM_THREADS=2 python theano/misc/check_blas.py -q
 Parallel element wise ops with OpenMP
 =====================================
@@ -46,5 +52,13 @@ a slow one) for a vector of size ``openmp_elemwise_minsize`` with and
 without OpenMP and shows the time difference between the cases.
 The only way to control the number of threads used is via the
-``OMP_NUM_THREADS`` environment variable. Set it to the number of threads
+``OMP_NUM_THREADS`` environment variable. Set it to the number of
-you want to use before starting the Python process.
+threads you want to use before starting the Python process. You can
+test this with this command::
+    $ OMP_NUM_THREADS=2 python theano/misc/elemwise_openmp_speedup.py
+    #The output
+    Fast op time without openmp 0.000533s with openmp 0.000474s speedup 1.12
+    Slow op time without openmp 0.002987s with openmp 0.001553s speedup 1.92
--- a/theano/gof/opt.py
+++ b/theano/gof/opt.py
@@ -658,6 +658,7 @@ class MergeOptimizer(Optimizer):
        print >> stream, blanc, "  replace_time", replace_time
        print >> stream, blanc, "  validate_time", validate_time
        print >> stream, blanc, "  callback_time", callback_time
+        if callback_time > 1:
            print >> stream, blanc, "  callbacks_time"
            for i in sorted(callbacks_time.iteritems(), key=lambda a: a[1]):
                if i[1] > 0:

--- a/theano/scan_module/scan_opt.py
+++ b/theano/scan_module/scan_opt.py
@@ -69,7 +69,9 @@ def remove_constants_and_unused_inputs_scan(node):
                         op.tap_array[:(op.n_mit_mot + op.n_mit_sot)]]))
    st += op.n_sit_sot
    st += op.n_shared_outs
-    op_ins, op_outs = scan_utils.reconstruct_graph(op.inputs, op.outputs)
+    op_ins = op.inputs
+    op_outs = op.outputs
    # Corresponds to the initial states, which should stay untouched.
    # We put those variables aside, and put them back at the end.
@@ -94,25 +96,26 @@ def remove_constants_and_unused_inputs_scan(node):
    all_ins = gof.graph.inputs(op_outs)
    for idx in xrange(op.n_seqs):
-        if (isinstance(node.inputs[idx + 1], tensor.TensorConstant) and
+        node_inp = node.inputs[idx + 1]
-            node.inputs[idx + 1].tag.unique_value is not None):
+        if (isinstance(node_inp, tensor.TensorConstant) and
+            node_inp.tag.unique_value is not None):
            try:
                # This works if input is a constant that has all entries
                # equal
-                givens[op_ins[idx]] = node.inputs[idx + 1].clone()[0]
+                givens[op_ins[idx]] = node_inp.clone()[0]
            except TypeError:
                pass
        elif op_ins[idx] in all_ins:
            # Check for identical other sequence
            identical_seqs = [x for x in nw_outer
                              if scan_utils.equal_computations(
-                                  [x], [node.inputs[idx + 1]])]
+                                  [x], [node_inp])]
            if identical_seqs:
                index = node.inputs.index(identical_seqs[0]) - 1
                givens[op_ins[idx]] = op_ins[index]
            else:
                nw_inner += [op_ins[idx]]
-                nw_outer += [node.inputs[idx + 1]]
+                nw_outer += [node_inp]
    nw_n_seqs = len(nw_inner)
    # Add outputs stuff

--- a/theano/scan_module/scan_utils.py
+++ b/theano/scan_module/scan_utils.py
@@ -391,6 +391,7 @@ def equal_computations(xs, ys, in_xs=None, in_ys=None):
    or `ys`.
    '''
+    assert len(xs) == len(ys)
    if in_xs is None:
        in_xs = []
    if in_ys is None:
@@ -401,47 +402,46 @@ def equal_computations(xs, ys, in_xs=None, in_ys=None):
            return False
        if y.owner and not x.owner:
            return False
-        if x.owner and y.owner:
+        if x.owner:  # The checks above guarantee that y.owner is set too.
            if x.owner.outputs.index(x) != y.owner.outputs.index(y):
                return False
+        if x not in in_xs and x.type != y.type:
+            return False
    if len(in_xs) != len(in_ys):
        return False
    for _x, _y in izip(in_xs, in_ys):
        if _x.type != _y.type:
            return False
-    nds_x = gof.graph.io_toposort(in_xs, xs)
-    nds_y = gof.graph.io_toposort(in_ys, ys)
-    if len(nds_x) != len(nds_y):
-        return False
    common = set(zip(in_xs, in_ys))
-    n_nodes = len(nds_x)
-    cont = True
-    idx = 0
    for dx, dy in izip(xs, ys):
-        if not dx.owner or not dy.owner:
+        # The checks above ensure dx and dy both have an owner, or neither does.
-            if dy.owner or dx.owner:
+        if not dx.owner:
-                return False
+            if (isinstance(dx, tensor.Constant) and
-            elif (isinstance(dx, tensor.Constant) and
                  isinstance(dy, tensor.Constant)):
-                if not (numpy.all(dx.data == dy.data) and
+                if not dx.equals(dy):
-                        dx.type.dtype == dy.type.dtype and
-                        dx.data.shape == dy.data.shape):
                    return False
                else:
                    pass
            elif (dx, dy) not in common and dx != dy:
                return False
-    while cont and idx < n_nodes:
+    nds_x = gof.graph.io_toposort(in_xs, xs)
+    nds_y = gof.graph.io_toposort(in_ys, ys)
+    if len(nds_x) != len(nds_y):
+        return False
+    n_nodes = len(nds_x)
+    idx = 0
+    while idx < n_nodes:
        nd_x = nds_x[idx]
        nd_y = nds_y[idx]
        if nd_x.op != nd_y.op:
-            cont = False
+            return False
        elif len(nd_x.inputs) != len(nd_y.inputs):
-            cont = False
+            return False
        elif len(nd_x.outputs) != len(nd_y.outputs):
-            cont = False
+            return False
        else:
            for dx, dy in izip(nd_x.inputs, nd_y.inputs):
                if (dx, dy) not in common:
@@ -453,14 +453,13 @@ def equal_computations(xs, ys, in_xs=None, in_ys=None):
                            else:
                                pass
                        else:
-                            cont = False
+                            return False
-        if cont:
            for dx, dy in izip(nd_x.outputs, nd_y.outputs):
                common.add((dx, dy))
        idx += 1
-    return cont
+    return True
 def infer_shape(outs, inputs, input_shapes):

--- a/theano/scan_module/tests/test_scan_utils.py
+++ b/theano/scan_module/tests/test_scan_utils.py
+import theano
+from theano.scan_module.scan_utils import equal_computations
+from theano.tensor.type_other import NoneConst
+def test_equal_compuations():
+    # This was a bug report by a Theano user.
+    c = NoneConst
+    assert equal_computations([c], [c])
+    m = theano.tensor.matrix()
+    max_argmax1 = theano.tensor.max_and_argmax(m)
+    max_argmax2 = theano.tensor.max_and_argmax(m)
+    assert equal_computations(max_argmax1, max_argmax2)
--- a/theano/tensor/blas.py
+++ b/theano/tensor/blas.py
@@ -1581,6 +1581,7 @@ class GemmOptimizer(Optimizer):
        print >> stream, blanc, " time_toposort", prof[9]
        print >> stream, blanc, " validate_time", prof[10]
        print >> stream, blanc, " callback_time", prof[11]
+        if prof[11] > 1:
            print >> stream, blanc, " callbacks_time"
            for i in sorted(prof[12].iteritems(), key=lambda a: a[1]):
                if i[1] > 0:

--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
@@ -2526,6 +2526,10 @@ def local_reshape_lift(node):
        len(node.inputs[0].owner.inputs) == 1):
        r = node.op(node.inputs[0].owner.inputs[0], node.inputs[1])
        e = node.inputs[0].owner.op(r)
+        # In rare cases the original broadcast pattern was (False, True) but
+        # the new one is (False, False); restore it so we don't crash.
+        if e.type != node.outputs[0].type:
+            e = T.patternbroadcast(e, node.outputs[0].broadcastable)
        return [e]
@@ -4937,6 +4941,7 @@ class FusionOptimizer(Optimizer):
        print >> stream, blanc, " nb_inconsistency_replace", prof[3]
        print >> stream, blanc, " validate_time", prof[4]
        print >> stream, blanc, " callback_time", prof[5]
+        if prof[5] > 1:
            print >> stream, blanc, " callbacks_time"
            for i in sorted(prof[6].iteritems(), key=lambda a: a[1]):
                if i[1] > 0: