提交 efb4786e authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #5032 from nouiz/simplify

Speed up the canonizer for big list of num/denum
......@@ -6,7 +6,6 @@ types that it can raise.
"""
from __future__ import absolute_import, print_function, division
from collections import OrderedDict
import sys
import time
import traceback
......@@ -260,7 +259,7 @@ class FunctionGraph(utils.object2):
"""
return r.clients
def __add_clients__(self, r, new_clients):
def __add_client__(self, r, new_client):
"""
Updates the list of clients of r with new_clients.
......@@ -268,20 +267,18 @@ class FunctionGraph(utils.object2):
----------
r
Variable.
new_clients
List of (node, i) pairs such that node.inputs[i] is r.
new_client
(node, i) pair such that node.inputs[i] is r.
"""
if set(r.clients).intersection(set(new_clients)):
print('ERROR: clients intersect!', file=sys.stderr)
print(' RCLIENTS of', r, [(n, i, type(n), id(n))
for n, i in r.clients], file=sys.stderr)
print(' NCLIENTS of', r, [(n, i, type(n), id(n))
for n, i in new_clients], file=sys.stderr)
assert not set(r.clients).intersection(set(new_clients))
r.clients += new_clients
# Ne need to do the assert as it is always True. The logic
# that call __add_client__ is valid. When the client list is
# long, the check it time consuming, so we don't enable it by
# default.
# assert not new_client in r.clients
r.clients.append(new_client)
def __remove_clients__(self, r, clients_to_remove,
def __remove_client__(self, r, client_to_remove,
prune=True, reason=None):
"""
Removes all from the clients list of r.
......@@ -296,8 +293,8 @@ class FunctionGraph(utils.object2):
----------
r : Variable
The clients of r will be removed.
clients_to_remove : List of (op, i) pairs
List of (op, i) pairs such that node.inputs[i] is not r anymore.
client_to_remove : (op, i) pair
(op, i) pair such that node.inputs[i] is not r anymore.
prune : bool
If prune is True, it remove r from this fgraph if it don't
have clients left.
......@@ -311,9 +308,11 @@ class FunctionGraph(utils.object2):
clients_to_remove and prune=True will remove r.
"""
for entry in clients_to_remove:
r.clients.remove(entry)
assert entry not in r.clients # an op,i pair should be unique
if client_to_remove:
r.clients.remove(client_to_remove)
# entry should be uniq in r. No need to assert it as it is
# already asserted in __add_client__.
# assert entry not in r.clients
if r.clients:
return False
if not prune:
......@@ -333,7 +332,7 @@ class FunctionGraph(utils.object2):
self.execute_callbacks('on_prune', apply_node, reason)
for i, input in enumerate(apply_node.inputs):
self.__remove_clients__(input, [(apply_node, i)],
self.__remove_client__(input, (apply_node, i),
reason=reason)
# variable should not have any clients.
# assert not variable.clients
......@@ -431,7 +430,7 @@ class FunctionGraph(utils.object2):
if input not in self.variables:
self.__setup_r__(input)
self.variables.add(input)
self.__add_clients__(input, [(node, i)])
self.__add_client__(input, (node, i))
assert node.fgraph is self
self.execute_callbacks('on_import', node, reason)
......@@ -470,15 +469,15 @@ class FunctionGraph(utils.object2):
return
self.__import_r__(new_r, reason=reason)
self.__add_clients__(new_r, [(node, i)])
prune = self.__remove_clients__(r, [(node, i)], False)
self.__add_client__(new_r, (node, i))
prune = self.__remove_client__(r, (node, i), False)
# Precondition: the substitution is semantically valid
# However it may introduce cycles to the graph, in which case the
# transaction will be reverted later.
self.execute_callbacks('on_change_input', node, i,
r, new_r, reason=reason)
if prune:
self.__remove_clients__(r, [], True, reason=reason)
self.__remove_client__(r, None, True, reason=reason)
# replace #
def replace(self, r, new_r, reason=None, verbose=None):
......
......@@ -29,18 +29,21 @@ from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
from theano.tests.breakpoint import PdbBreakpoint
from .type import (GpuArrayType, GpuArrayConstant, get_context,
ContextNotDefined)
ContextNotDefined, move_to_gpu)
from .basic_ops import (as_gpuarray_variable, infer_context_name,
host_from_gpu, GpuToGpu,
HostFromGpu, GpuFromHost,
GpuSplit, GpuContiguous, gpu_contiguous,
GpuAlloc, GpuAllocEmpty, GpuReshape,
GpuEye, gpu_join, GpuJoin, gpu_alloc_empty, gpu_alloc, gpu_from_host)
GpuEye, gpu_join, GpuJoin, gpu_alloc_empty,
gpu_alloc, gpu_from_host)
from .blas import (gpu_dot22, GpuGemm, GpuGer, GpuGemmBatch,
gpugemm_no_inplace, gpugemm_inplace, gpugemmbatch_no_inplace,
gpugemm_no_inplace, gpugemm_inplace,
gpugemmbatch_no_inplace,
gpugemv_no_inplace, gpugemv_inplace)
from .blocksparse import (GpuSparseBlockGemv, GpuSparseBlockOuter,
gpu_sparse_block_outer, gpu_sparse_block_outer_inplace,
gpu_sparse_block_outer,
gpu_sparse_block_outer_inplace,
gpu_sparse_block_gemv, gpu_sparse_block_gemv_inplace)
from .nnet import (gpu_crossentropy_softmax_1hot_with_bias_dx,
gpu_crossentropy_softmax_argmax_1hot_with_bias,
......@@ -239,9 +242,8 @@ class InputToGpuOptimizer(Optimizer):
target = getattr(input.tag, 'target', None)
if target == 'cpu':
continue
# Do not move *int* scalar to the GPU.
if (isinstance(input.type, tensor.TensorType) and
input.ndim == 0 and 'int' in input.dtype):
not move_to_gpu(input)):
continue
try:
......@@ -297,10 +299,7 @@ class GraphToGPU(Optimizer):
# Iterating through inputs of graph
target = infer_context_name(*fgraph.inputs)
for i in fgraph.inputs:
# Do not move *int* scalar to the GPU.
if (isinstance(i.type, tensor.TensorType) and
(i.ndim > 0 or 'int' not in i.dtype) and
"complex" not in i.dtype):
if isinstance(i.type, tensor.TensorType) and move_to_gpu(i):
mapping[i] = i.transfer(getattr(i.tag, 'target', target))
else:
mapping[i] = i
......
......@@ -22,6 +22,26 @@ except ImportError:
_context_reg = {}
def move_to_gpu(data):
    """
    Decide whether a computation on ``data`` should be moved to the GPU.

    Complex-typed data and 0-d (scalar) integer data are kept on the CPU;
    everything else is eligible for transfer.

    Parameters
    ----------
    data : numpy.ndarray or TensorVariable
        Must expose ``dtype`` and ``ndim`` attributes.

    Returns
    -------
    bool
        True if the data may be moved to the GPU.
    """
    dtype = str(data.dtype)
    # Complex is not supported on the GPU.
    if dtype in tensor.basic.complex_dtypes:
        return False
    # Scalar (0-d) integers stay on the CPU as well.
    return not (data.ndim == 0 and dtype in tensor.basic.discrete_dtypes)
class ContextNotDefined(ValueError):
pass
......@@ -561,16 +581,22 @@ class GpuArraySharedVariable(_operators, SharedVariable):
GpuArrayType.SharedVariable = GpuArraySharedVariable
notset = object()
def gpuarray_shared_constructor(value, name=None, strict=False,
allow_downcast=None, borrow=False,
broadcastable=None, target=None):
broadcastable=None, target=notset):
"""
SharedVariable constructor for GpuArrayType.
See :func:`theano.shared`.
:target: default None
The device target. As None is a valid value and we need to
differentiate from the parameter notset and None, we use a
notset object.
"""
if target == 'gpu' or target == 'cpu':
raise TypeError('not for me')
......@@ -578,6 +604,10 @@ def gpuarray_shared_constructor(value, name=None, strict=False,
if not isinstance(value, (numpy.ndarray, pygpu.gpuarray.GpuArray)):
raise TypeError('ndarray or GpuArray required')
if target is notset:
target = None
if not move_to_gpu(value):
raise TypeError('We do not move that data by default to the GPU')
try:
get_context(target)
except ContextNotDefined:
......
......@@ -4751,12 +4751,16 @@ class Canonizer(gof.LocalOptimizer):
numeric constant. If v is a plain Variable, returns None.
"""
if isinstance(v, Variable):
try:
# As the constant folding is in the canonicalize phase,
# We don't need to check all the graph each time.
return get_scalar_constant_value(v, only_process_constants=True)
except NotScalarConstantError:
if isinstance(v, Constant):
if getattr(v.tag, 'unique_value', None) is not None:
data = v.tag.unique_value
else:
data = v.data
if data.ndim == 0:
return data
else:
return None
elif isinstance(v, Variable):
return None
else:
return v
......@@ -4790,6 +4794,21 @@ class Canonizer(gof.LocalOptimizer):
| [a, b], [c, d] -> [a, b], [c, d]
"""
ln = len(num)
ld = len(denum)
if (ld > 2 and ln > 2):
# Faster version for "big" inputs.
while True:
s = set(num)
# Inputs can appear multiple times
redo = len(s) != len(num)
inter = s.intersection(denum)
for v in inter:
num.remove(v)
denum.remove(v)
if not redo or not inter:
break
else:
for v in list(num):
if v in denum:
num.remove(v)
......@@ -4815,9 +4834,8 @@ class Canonizer(gof.LocalOptimizer):
| [x, 2, y], [z, 2] -> [x, y], [z]
"""
# Lists representing the numerator and denumerator
num, denum = list(orig_num), list(orig_denum)
num, denum = [], []
# Lists representing the *constant* elements of num and denum
numct, denumct = [], []
......@@ -4826,15 +4844,16 @@ class Canonizer(gof.LocalOptimizer):
ct = self.get_constant(v)
if ct is not None:
# We found a constant in the numerator!
# We remove it from num
num.remove(v)
# We add it to numct
numct.append(ct)
else:
num.append(v)
for v in orig_denum:
ct = self.get_constant(v)
if ct is not None:
denum.remove(v)
denumct.append(ct)
else:
denum.append(v)
if self.use_reciprocal or num:
# This will calculate either:
......
......@@ -89,16 +89,6 @@ def test_gc_never_pickles_temporaries():
# assert that f() didn't cause the function to grow
# allow_gc should leave the function un-changed by calling
if len_pre_f != len_post_f:
for i in range(len_pre_f//100):
p1 = pre_f[i*100:(i+1)*100]
p2 = post_f[i*100:(i+1)*100]
if p1 != p2:
print(i)
print("p1")
print(p1)
print("p2")
print(p2)
assert len_pre_f == len_post_f, (len_pre_f, len_post_f)
# assert that g() didn't cause g to grow because temporaries
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论