提交 c8f8a276 authored 作者: abergeron's avatar abergeron

Merge pull request #2485 from nouiz/profile

Profile fix.
...@@ -1559,6 +1559,7 @@ def orig_function(inputs, outputs, mode=None, accept_inplace=False, ...@@ -1559,6 +1559,7 @@ def orig_function(inputs, outputs, mode=None, accept_inplace=False,
t2 = time.time() t2 = time.time()
if profile: if profile:
profile.compile_time += t2 - t1 profile.compile_time += t2 - t1
profile.nb_nodes = len(fn.maker.fgraph.apply_nodes)
fn.name = name fn.name = name
fn.maker.fgraph.name = name fn.maker.fgraph.name = name
......
...@@ -199,6 +199,11 @@ class ProfileStats(object): ...@@ -199,6 +199,11 @@ class ProfileStats(object):
line_width = config.profiling.output_line_width line_width = config.profiling.output_line_width
nb_nodes = -1
    # The number of nodes in the graph. We need the information
    # separately in case we print the profile when the function wasn't
# executed or if there is lazy operation in the graph.
optimizer_profile = None optimizer_profile = None
# None or tuple (the optimizer, the profile it returned) # None or tuple (the optimizer, the profile it returned)
...@@ -637,7 +642,7 @@ class ProfileStats(object): ...@@ -637,7 +642,7 @@ class ProfileStats(object):
print >> file, ' Time in thunks: %es (%.3f%%)' % ( print >> file, ' Time in thunks: %es (%.3f%%)' % (
local_time, 100 * local_time / self.fct_call_time) local_time, 100 * local_time / self.fct_call_time)
print >> file, ' Total compile time: %es' % self.compile_time print >> file, ' Total compile time: %es' % self.compile_time
print >> file, ' Number of Apply nodes: %s' % len(self.apply_time) print >> file, ' Number of Apply nodes: %d' % self.nb_nodes
print >> file, ' Theano Optimizer time: %es' % self.optimizer_time print >> file, ' Theano Optimizer time: %es' % self.optimizer_time
print >> file, ' Theano validate time: %es' % self.validate_time print >> file, ' Theano validate time: %es' % self.validate_time
print >> file, (' Theano Linker time (includes C,' print >> file, (' Theano Linker time (includes C,'
...@@ -649,6 +654,9 @@ class ProfileStats(object): ...@@ -649,6 +654,9 @@ class ProfileStats(object):
# The validation time is a subset of optimizer_time # The validation time is a subset of optimizer_time
assert self.validate_time < self.optimizer_time assert self.validate_time < self.optimizer_time
    def summary_globals(self, file):
        """Print profiling stats that are global to Theano, not tied to one
        compiled function.

        Currently this reports only the cumulative wall-clock time spent in
        all calls to theano.grad(), accumulated in the module-level counter
        theano.gradient.grad_time.

        :param file: open file-like object the summary line is written to.
        """
        print >> file, 'Time in all call to theano.grad() %es' % theano.gradient.grad_time
def summary_memory(self, file, N=None): def summary_memory(self, file, N=None):
fct_memory = {} # fgraph->dict(node->[outputs size]) fct_memory = {} # fgraph->dict(node->[outputs size])
fct_shapes = {} # fgraph->dict(node->[outputs shapes])) fct_shapes = {} # fgraph->dict(node->[outputs shapes]))
...@@ -1204,6 +1212,7 @@ class ProfileStats(object): ...@@ -1204,6 +1212,7 @@ class ProfileStats(object):
def summary(self, file=sys.stderr, n_ops_to_print=20, def summary(self, file=sys.stderr, n_ops_to_print=20,
n_apply_to_print=20): n_apply_to_print=20):
self.summary_function(file) self.summary_function(file)
self.summary_globals(file)
local_time = sum(self.apply_time.values()) local_time = sum(self.apply_time.values())
if local_time > 0: if local_time > 0:
self.summary_class(file, n_ops_to_print) self.summary_class(file, n_ops_to_print)
......
...@@ -804,7 +804,7 @@ def io_toposort(inputs, outputs, orderings=None): ...@@ -804,7 +804,7 @@ def io_toposort(inputs, outputs, orderings=None):
"""WRITEME """WRITEME
inputs: a list or tuple of Variable instances inputs: a list or tuple of Variable instances
outputs: a list or tuple of Variable instances outputs: a list or tuple of Apply instances
orderings: a dictionary orderings: a dictionary
key: Apply instance key: Apply instance
......
...@@ -10,6 +10,7 @@ __docformat__ = "restructuredtext en" ...@@ -10,6 +10,7 @@ __docformat__ = "restructuredtext en"
import __builtin__ import __builtin__
from itertools import izip from itertools import izip
import logging import logging
import time
import warnings import warnings
_logger = logging.getLogger('theano.gradient') _logger = logging.getLogger('theano.gradient')
...@@ -36,6 +37,8 @@ tensor = None ...@@ -36,6 +37,8 @@ tensor = None
_msg_retType = 'op.grad(...) returned a non-list' _msg_retType = 'op.grad(...) returned a non-list'
grad_time = 0
def format_as(use_list, use_tuple, outputs): def format_as(use_list, use_tuple, outputs):
""" """
...@@ -412,6 +415,7 @@ def grad(cost, wrt, consider_constant=None, ...@@ -412,6 +415,7 @@ def grad(cost, wrt, consider_constant=None,
or Variable in all cases. or Variable in all cases.
""" """
t0 = time.time()
global tensor global tensor
if tensor is None: if tensor is None:
from theano import tensor from theano import tensor
...@@ -483,14 +487,14 @@ def grad(cost, wrt, consider_constant=None, ...@@ -483,14 +487,14 @@ def grad(cost, wrt, consider_constant=None,
if not hasattr(g_var, 'type'): if not hasattr(g_var, 'type'):
raise TypeError('output grads must be theano variables.' raise TypeError('output grads must be theano variables.'
'Ambiguous whether %s should be made into tensor' 'Ambiguous whether %s should be made into tensor'
' or sparse theano variable' % str(type(g_var))) ' or sparse theano variable' % str(type(g_var)))
if (not isinstance(g_var.type, (NullType, DisconnectedType)) and if (not isinstance(g_var.type, (NullType, DisconnectedType)) and
'float' not in str(g_var.type.dtype)): 'float' not in str(g_var.type.dtype)):
raise TypeError("Gradients must always be NullType, " raise TypeError("Gradients must always be NullType, "
"DisconnectedType, or continuous, but grad was " "DisconnectedType, or continuous, but grad was "
"given a known_grad of type "+str(g_var.type)) "given a known_grad of type "+str(g_var.type))
# DO NOT check that these gradients are equal to 0 if var is int # DO NOT check that these gradients are equal to 0 if var is int
# The gradient is allowed to be non-zero on var in that case # The gradient is allowed to be non-zero on var in that case
...@@ -499,12 +503,11 @@ def grad(cost, wrt, consider_constant=None, ...@@ -499,12 +503,11 @@ def grad(cost, wrt, consider_constant=None,
grad_dict[var] = g_var grad_dict[var] = g_var
def handle_disconnected(var): def handle_disconnected(var):
message = ("grad method was asked to compute the gradient " message = ("grad method was asked to compute the gradient "
"with respect to a variable that is not part of " "with respect to a variable that is not part of "
"the computational graph of the cost, or is used " "the computational graph of the cost, or is used "
"only by a non-differentiable operator: %s" % var) "only by a non-differentiable operator: %s" % var)
if disconnected_inputs == 'ignore': if disconnected_inputs == 'ignore':
pass pass
elif disconnected_inputs == 'warn': elif disconnected_inputs == 'warn':
...@@ -513,9 +516,8 @@ def grad(cost, wrt, consider_constant=None, ...@@ -513,9 +516,8 @@ def grad(cost, wrt, consider_constant=None,
raise DisconnectedInputError(message) raise DisconnectedInputError(message)
else: else:
raise ValueError("Invalid value for keyword " raise ValueError("Invalid value for keyword "
"'disconnected_inputs', valid values are " "'disconnected_inputs', valid values are "
"'ignore', 'warn' and 'raise'.") "'ignore', 'warn' and 'raise'.")
# variables that do not influence the cost have zero gradient. # variables that do not influence the cost have zero gradient.
# if wrt is such a variable, populate the grad_dict with this info # if wrt is such a variable, populate the grad_dict with this info
...@@ -540,7 +542,7 @@ def grad(cost, wrt, consider_constant=None, ...@@ -540,7 +542,7 @@ def grad(cost, wrt, consider_constant=None,
assert g.type.dtype in tensor.float_dtypes assert g.type.dtype in tensor.float_dtypes
rval = _populate_grad_dict(var_to_app_to_idx, rval = _populate_grad_dict(var_to_app_to_idx,
grad_dict, wrt, cost_name) grad_dict, wrt, cost_name)
for i in xrange(len(rval)): for i in xrange(len(rval)):
if isinstance(rval[i].type, DisconnectedType): if isinstance(rval[i].type, DisconnectedType):
...@@ -556,8 +558,12 @@ def grad(cost, wrt, consider_constant=None, ...@@ -556,8 +558,12 @@ def grad(cost, wrt, consider_constant=None,
rval = tuple(rval) rval = tuple(rval)
elif not using_list: elif not using_list:
rval, = rval rval, = rval
t1 = time.time()
global grad_time
grad_time += t1 - t0
return rval return rval
def subgraph_grad(wrt, end, start=None, cost=None, details=False): def subgraph_grad(wrt, end, start=None, cost=None, details=False):
''' '''
With respect to `wrt`, computes gradients of cost and/or from With respect to `wrt`, computes gradients of cost and/or from
...@@ -565,7 +571,7 @@ def subgraph_grad(wrt, end, start=None, cost=None, details=False): ...@@ -565,7 +571,7 @@ def subgraph_grad(wrt, end, start=None, cost=None, details=False):
symbolic digraph. In other words, computes gradients for a symbolic digraph. In other words, computes gradients for a
subgraph of the symbolic theano function. Ignores all disconnected subgraph of the symbolic theano function. Ignores all disconnected
inputs. inputs.
This can be useful when one needs to perform the gradient descent This can be useful when one needs to perform the gradient descent
iteratively (e.g. one layer at a time in an MLP), or when a iteratively (e.g. one layer at a time in an MLP), or when a
particular operation is not differentiable in theano particular operation is not differentiable in theano
...@@ -580,7 +586,7 @@ def subgraph_grad(wrt, end, start=None, cost=None, details=False): ...@@ -580,7 +586,7 @@ def subgraph_grad(wrt, end, start=None, cost=None, details=False):
process, to variables, could then be fed into another process, to variables, could then be fed into another
subgraph_grad as `start` with any other `cost` (e.g. weight subgraph_grad as `start` with any other `cost` (e.g. weight
decay). decay).
In an MLP, we could use subgraph_grad to iteratively backpropagate: In an MLP, we could use subgraph_grad to iteratively backpropagate:
.. code-block:: python .. code-block:: python
...@@ -611,13 +617,13 @@ def subgraph_grad(wrt, end, start=None, cost=None, details=False): ...@@ -611,13 +617,13 @@ def subgraph_grad(wrt, end, start=None, cost=None, details=False):
:type wrt: list of variables :type wrt: list of variables
:param wrt: :param wrt:
Gradients are computed with respect to `wrt`. Gradients are computed with respect to `wrt`.
:type end: list of variables :type end: list of variables
:param end: :param end:
Theano variables at which to end gradient descent (they are Theano variables at which to end gradient descent (they are
considered constant in theano.grad). For convenience, the considered constant in theano.grad). For convenience, the
gradients with respect to these variables are also returned. gradients with respect to these variables are also returned.
:type start: dictionary of variables :type start: dictionary of variables
:param start: :param start:
If not None, a dictionary mapping variables to their If not None, a dictionary mapping variables to their
...@@ -625,9 +631,9 @@ def subgraph_grad(wrt, end, start=None, cost=None, details=False): ...@@ -625,9 +631,9 @@ def subgraph_grad(wrt, end, start=None, cost=None, details=False):
are known. These are used to compute the gradients backwards up are known. These are used to compute the gradients backwards up
to the variables in `end` (they are used as known_grad in to the variables in `end` (they are used as known_grad in
theano.grad). theano.grad).
:type cost: scalar (0-dimensional) variable :type cost: scalar (0-dimensional) variable
:param cost: :param cost:
Additional costs for which to compute the gradients. For Additional costs for which to compute the gradients. For
example, these could be weight decay, an l1 constraint, MSE, example, these could be weight decay, an l1 constraint, MSE,
NLL, etc. May optionally be None if start is provided. Warning NLL, etc. May optionally be None if start is provided. Warning
...@@ -647,10 +653,10 @@ def subgraph_grad(wrt, end, start=None, cost=None, details=False): ...@@ -647,10 +653,10 @@ def subgraph_grad(wrt, end, start=None, cost=None, details=False):
When True, additionally returns the list of gradients from When True, additionally returns the list of gradients from
`start` and of `cost`, respectively, with respect to `wrt` (not `start` and of `cost`, respectively, with respect to `wrt` (not
`end`). `end`).
:rtype: Tuple of 2 or 4 Lists of Variables :rtype: Tuple of 2 or 4 Lists of Variables
:return: Returns lists of gradients with respect to `wrt` and `end`, :return: Returns lists of gradients with respect to `wrt` and `end`,
respectively. respectively.
.. versionadded:: 0.6.1 .. versionadded:: 0.6.1
...@@ -660,20 +666,20 @@ def subgraph_grad(wrt, end, start=None, cost=None, details=False): ...@@ -660,20 +666,20 @@ def subgraph_grad(wrt, end, start=None, cost=None, details=False):
assert isinstance(wrt, list) assert isinstance(wrt, list)
if start is not None: if start is not None:
assert isinstance(start, dict) assert isinstance(start, dict)
params = list(set(wrt + end)) params = list(set(wrt + end))
start_grads = None start_grads = None
cost_grads = None cost_grads = None
if start is not None: if start is not None:
start_grads = list( start_grads = list(
theano.grad( theano.grad(
cost=None, wrt=params, known_grads=start, cost=None, wrt=params, known_grads=start,
consider_constant=end, consider_constant=end,
disconnected_inputs='ignore' disconnected_inputs='ignore'
) )
) )
if cost is not None: if cost is not None:
cost_grads = list( cost_grads = list(
theano.grad( theano.grad(
...@@ -682,7 +688,7 @@ def subgraph_grad(wrt, end, start=None, cost=None, details=False): ...@@ -682,7 +688,7 @@ def subgraph_grad(wrt, end, start=None, cost=None, details=False):
disconnected_inputs='ignore' disconnected_inputs='ignore'
) )
) )
grads = None grads = None
if start is None: if start is None:
grads = cost_grads grads = cost_grads
...@@ -691,18 +697,18 @@ def subgraph_grad(wrt, end, start=None, cost=None, details=False): ...@@ -691,18 +697,18 @@ def subgraph_grad(wrt, end, start=None, cost=None, details=False):
if cost_grads is not None: if cost_grads is not None:
for i in range(len(grads)): for i in range(len(grads)):
grads[i] += cost_grads[i] grads[i] += cost_grads[i]
pgrads = OrderedDict(zip(params, grads)) pgrads = OrderedDict(zip(params, grads))
# separate wrt from end grads: # separate wrt from end grads:
wrt_grads = list(pgrads[k] for k in wrt) wrt_grads = list(pgrads[k] for k in wrt)
end_grads = list(pgrads[k] for k in end) end_grads = list(pgrads[k] for k in end)
if details: if details:
return wrt_grads, end_grads, start_grads, cost_grads return wrt_grads, end_grads, start_grads, cost_grads
return wrt_grads, end_grads return wrt_grads, end_grads
def _node_to_pattern(node): def _node_to_pattern(node):
""" given an apply node, obtain its connection pattern """ given an apply node, obtain its connection pattern
this is just a wrapper around Op.connection_pattern this is just a wrapper around Op.connection_pattern
...@@ -714,30 +720,31 @@ def _node_to_pattern(node): ...@@ -714,30 +720,31 @@ def _node_to_pattern(node):
connection_pattern = node.op.connection_pattern(node) connection_pattern = node.op.connection_pattern(node)
if not isinstance(connection_pattern, list): if not isinstance(connection_pattern, list):
raise TypeError("Op.connection_pattern should return " + \ raise TypeError(
("list of list of bool, but for Op=%s" % node.op) +\ "Op.connection_pattern should return " +
"got %s with type %s." % (connection_pattern, ("list of list of bool, but for Op=%s" % node.op) +
type(connection_pattern))) "got %s with type %s." % (connection_pattern,
type(connection_pattern)))
if len(connection_pattern) != len(node.inputs): if len(connection_pattern) != len(node.inputs):
raise ValueError('%s.connection_pattern should have %d' % raise ValueError(
(node.op, len(node.inputs)) + ' rows but has %d.' % '%s.connection_pattern should have %d' %
len(connection_pattern)) (node.op, len(node.inputs)) + ' rows but has %d.' %
len(connection_pattern))
for ii, output_pattern in enumerate(connection_pattern): for ii, output_pattern in enumerate(connection_pattern):
if not isinstance(output_pattern, list): if not isinstance(output_pattern, list):
raise TypeError('%s.connection_pattern should return' % raise TypeError(
node.op + ' a list of lists, but element %d' % ii\ '%s.connection_pattern should return' %
+ 'is %s of type %s.' % (output_pattern, node.op + ' a list of lists, but element %d' % ii
type(output_pattern))) + 'is %s of type %s.' % (output_pattern,
type(output_pattern)))
else: else:
connection_pattern = \ connection_pattern = [[True for output in node.outputs]
[[True for output in node.outputs] for ipt in node.inputs]
for ipt in node.inputs]
assert isinstance(connection_pattern, list) assert isinstance(connection_pattern, list)
assert len(connection_pattern) == len(node.inputs) assert len(connection_pattern) == len(node.inputs)
for ii in xrange(len(node.inputs)): for ii in xrange(len(node.inputs)):
assert isinstance(connection_pattern[ii], list) assert isinstance(connection_pattern[ii], list)
assert len(connection_pattern[ii]) == \ assert len(connection_pattern[ii]) == len(node.outputs)
len(node.outputs)
return connection_pattern return connection_pattern
...@@ -792,7 +799,7 @@ def _populate_var_to_app_to_idx(outputs, wrt, consider_constant): ...@@ -792,7 +799,7 @@ def _populate_var_to_app_to_idx(outputs, wrt, consider_constant):
iter(consider_constant) iter(consider_constant)
except TypeError: except TypeError:
raise TypeError('consider_constant must be an iterable collection,' raise TypeError('consider_constant must be an iterable collection,'
' got ' + str(type(consider_constant))) ' got ' + str(type(consider_constant)))
for elem in consider_constant: for elem in consider_constant:
if not isinstance(elem, gof.Variable): if not isinstance(elem, gof.Variable):
raise TypeError('Elements of consider_constant must be ' raise TypeError('Elements of consider_constant must be '
...@@ -951,26 +958,27 @@ def _populate_grad_dict(var_to_app_to_idx, ...@@ -951,26 +958,27 @@ def _populate_grad_dict(var_to_app_to_idx,
# list of bools indicating if each input is connected to the cost # list of bools indicating if each input is connected to the cost
inputs_connected = [ inputs_connected = [
(True in [input_to_output and output_to_cost for (True in [input_to_output and output_to_cost for
input_to_output, output_to_cost in input_to_output, output_to_cost in
zip(input_to_outputs, outputs_connected)]) for zip(input_to_outputs, outputs_connected)]) for
input_to_outputs in connection_pattern input_to_outputs in connection_pattern
] ]
#List of bools indicating if each output is an integer dtype #List of bools indicating if each output is an integer dtype
output_is_int = [hasattr(output.type, 'dtype') and output_is_int = [hasattr(output.type, 'dtype') and
output.type.dtype in theano.tensor.discrete_dtypes output.type.dtype in theano.tensor.discrete_dtypes
for output in node.outputs] for output in node.outputs]
#List of bools indicating if each output is NullType #List of bools indicating if each output is NullType
ograd_is_nan = [isinstance(output.type, NullType) ograd_is_nan = [isinstance(output.type, NullType)
for output in output_grads] for output in output_grads]
# List of bools indicating if each input only has NullType outputs # List of bools indicating if each input only has NullType outputs
only_connected_to_nan = [(True not in only_connected_to_nan = [
[in_to_out and out_to_cost and not out_nan (True not in
for in_to_out, out_to_cost, out_nan in [in_to_out and out_to_cost and not out_nan
zip(in_to_outs, outputs_connected, ograd_is_nan)]) for in_to_out, out_to_cost, out_nan in
zip(in_to_outs, outputs_connected, ograd_is_nan)])
for in_to_outs in connection_pattern] for in_to_outs in connection_pattern]
if True not in inputs_connected: if True not in inputs_connected:
...@@ -1013,8 +1021,6 @@ def _populate_grad_dict(var_to_app_to_idx, ...@@ -1013,8 +1021,6 @@ def _populate_grad_dict(var_to_app_to_idx,
inputs = [try_to_copy_if_needed(ipt) for ipt in inputs] inputs = [try_to_copy_if_needed(ipt) for ipt in inputs]
# Build a list of output gradients with the same dtype as # Build a list of output gradients with the same dtype as
# the corresponding output variable. # the corresponding output variable.
# If an output is of a float dtype, we want to cast the # If an output is of a float dtype, we want to cast the
...@@ -1108,10 +1114,11 @@ def _populate_grad_dict(var_to_app_to_idx, ...@@ -1108,10 +1114,11 @@ def _populate_grad_dict(var_to_app_to_idx,
# Do type checking on the result # Do type checking on the result
# List of bools indicating if each input only has integer outputs # List of bools indicating if each input only has integer outputs
only_connected_to_int = [(True not in only_connected_to_int = [
[in_to_out and out_to_cost and not out_int (True not in
for in_to_out, out_to_cost, out_int in [in_to_out and out_to_cost and not out_int
zip(in_to_outs, outputs_connected, output_is_int)]) for in_to_out, out_to_cost, out_int in
zip(in_to_outs, outputs_connected, output_is_int)])
for in_to_outs in connection_pattern] for in_to_outs in connection_pattern]
for i, term in enumerate(input_grads): for i, term in enumerate(input_grads):
...@@ -1122,13 +1129,14 @@ def _populate_grad_dict(var_to_app_to_idx, ...@@ -1122,13 +1129,14 @@ def _populate_grad_dict(var_to_app_to_idx,
# used to mean undefined, zero, or disconnected. # used to mean undefined, zero, or disconnected.
# We therefore don't allow it because its usage has become # We therefore don't allow it because its usage has become
# so muddied. # so muddied.
raise TypeError(('%s.grad returned None for' + raise TypeError(
' a gradient term, ' ('%s.grad returned None for' +
'this is prohibited. Instead of None,' ' a gradient term, '
'return zeros_like(input), disconnected_type(),' 'this is prohibited. Instead of None,'
' or a NullType variable such as those made with ' 'return zeros_like(input), disconnected_type(),'
'the grad_undefined or grad_unimplemented helper ' ' or a NullType variable such as those made with '
'functions.') % node.op) 'the grad_undefined or grad_unimplemented helper '
'functions.') % node.op)
# Check that the gradient term for this input has the right shape # Check that the gradient term for this input has the right shape
if hasattr(term, 'shape'): if hasattr(term, 'shape'):
...@@ -1137,18 +1145,18 @@ def _populate_grad_dict(var_to_app_to_idx, ...@@ -1137,18 +1145,18 @@ def _populate_grad_dict(var_to_app_to_idx,
i_shape = orig_ipt_v.shape i_shape = orig_ipt_v.shape
t_shape = term_v.shape t_shape = term_v.shape
if i_shape != t_shape: if i_shape != t_shape:
raise ValueError("%s.grad returned object of " raise ValueError(
"shape %s as gradient term on input %d " "%s.grad returned object of "
"of shape %s" % (node.op, t_shape, i, "shape %s as gradient term on input %d "
i_shape)) "of shape %s" % (node.op, t_shape, i, i_shape))
if not isinstance(term.type, if not isinstance(term.type,
(NullType, DisconnectedType)): (NullType, DisconnectedType)):
if term.type.dtype not in theano.tensor.float_dtypes: if term.type.dtype not in theano.tensor.float_dtypes:
raise TypeError(str(node.op) + '.grad illegally ' raise TypeError(str(node.op) + '.grad illegally '
' returned an integer-valued variable.' ' returned an integer-valued variable.'
' (Input index %d, dtype %s)' % (i, ' (Input index %d, dtype %s)' % (
term.type.dtype)) i, term.type.dtype))
if only_connected_to_nan[i]: if only_connected_to_nan[i]:
assert isinstance(term.type, NullType) assert isinstance(term.type, NullType)
...@@ -1233,23 +1241,25 @@ def _populate_grad_dict(var_to_app_to_idx, ...@@ -1233,23 +1241,25 @@ def _populate_grad_dict(var_to_app_to_idx,
term = access_term_cache(node)[idx] term = access_term_cache(node)[idx]
if not isinstance(term, gof.Variable): if not isinstance(term, gof.Variable):
raise TypeError("%s.grad returned %s, expected" raise TypeError(
" Variable instance." % (str(node.op), "%s.grad returned %s, expected"
type(term))) " Variable instance." % (str(node.op),
type(term)))
if isinstance(term.type, NullType): if isinstance(term.type, NullType):
raise NullTypeGradError("tensor.grad " raise NullTypeGradError("tensor.grad "
"encountered a NaN. " + "encountered a NaN. " +
term.type.why_null) term.type.why_null)
#Don't try to sum up DisconnectedType placeholders #Don't try to sum up DisconnectedType placeholders
if isinstance(term.type, DisconnectedType): if isinstance(term.type, DisconnectedType):
continue continue
if hasattr(var, 'ndim') and term.ndim != var.ndim: if hasattr(var, 'ndim') and term.ndim != var.ndim:
raise ValueError(("%s.grad returned a term with" raise ValueError(
" %d dimensions, but %d are required.") % ( ("%s.grad returned a term with"
str(node.op), term.ndim, var.ndim)) " %d dimensions, but %d are required.") % (
str(node.op), term.ndim, var.ndim))
terms.append(term) terms.append(term)
...@@ -1561,12 +1571,13 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None, ...@@ -1561,12 +1571,13 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None,
for i, p in enumerate(pt): for i, p in enumerate(pt):
if p.dtype not in ('float32', 'float64'): if p.dtype not in ('float32', 'float64'):
raise TypeError(('verify_grad can work only with floating point ' raise TypeError(
'inputs, but input %i has dtype "%s".') % (i, p.dtype)) ('verify_grad can work only with floating point '
'inputs, but input %i has dtype "%s".') % (i, p.dtype))
_type_tol = dict( # relative error tolerances for different types _type_tol = dict( # relative error tolerances for different types
float32=1e-2, float32=1e-2,
float64=1e-4) float64=1e-4)
if abs_tol is None: if abs_tol is None:
abs_tol = __builtin__.max(_type_tol[str(p.dtype)] for p in pt) abs_tol = __builtin__.max(_type_tol[str(p.dtype)] for p in pt)
...@@ -1593,7 +1604,8 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None, ...@@ -1593,7 +1604,8 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None,
on_unused_input='ignore') on_unused_input='ignore')
return f return f
tensor_pt = [TensorType( tensor_pt = [
TensorType(
as_tensor_variable(p).dtype, as_tensor_variable(p).dtype,
as_tensor_variable(p).broadcastable)(name='input %i' % i) as_tensor_variable(p).broadcastable)(name='input %i' % i)
for i, p in enumerate(pt)] for i, p in enumerate(pt)]
...@@ -1612,9 +1624,10 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None, ...@@ -1612,9 +1624,10 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None,
o_fn_out = o_fn(*[p.copy() for p in pt]) o_fn_out = o_fn(*[p.copy() for p in pt])
if isinstance(o_fn_out, tuple) or isinstance(o_fn_out, list): if isinstance(o_fn_out, tuple) or isinstance(o_fn_out, list):
raise TypeError('It seems like you are trying to use verify_grad ' raise TypeError(
'on an op or a function which outputs a list: there should' 'It seems like you are trying to use verify_grad '
' be a single (array-like) output instead') 'on an op or a function which outputs a list: there should'
' be a single (array-like) output instead')
# random_projection should not have elements too small, # random_projection should not have elements too small,
# otherwise too much precision is lost in numerical gradient # otherwise too much precision is lost in numerical gradient
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论