rewrite of ProfileMode to use ProfileStats

2677e15f · James Bergstra · 5ccdcace · 2677e15f · 2677e15f · 2677e15f
--- a/theano/compile/function_module.py
+++ b/theano/compile/function_module.py
@@ -15,7 +15,7 @@ from theano import gof
 from theano.gof.python25 import partial
 import mode as mode_module
 from io import In, SymbolicInput, SymbolicInputKit, SymbolicOutput
+from theano.configdefaults import config
 import logging
 _logger = logging.getLogger('theano.compile.function_module')
@@ -495,6 +495,7 @@ class Function(object):
        return cpy
    def __call__(self, *args, **kwargs):
+        profile = self.profile
        t0 = time.time()
        # Reinitialize each container's 'provided' counter
@@ -536,8 +537,7 @@ class Function(object):
        for k, arg in kwargs.iteritems():
            self[k] = arg
+        if (not hasattr(self, '_check_for_aliased_inputs') or
-        if ( not hasattr(self, '_check_for_aliased_inputs') or
            self._check_for_aliased_inputs):
            ## Collect aliased inputs among the storage space
            args_share_memory = []
@@ -592,9 +592,21 @@ class Function(object):
                            self.inv_finder[c]))
        # Do the actual work
+        if profile:
            t0_fn = time.time()
+        try:
            self.fn()
-        dt_fn = time.time() - t0_fn
+        except:
+            if hasattr(self.fn, 'position_of_error'):
+                # this is a new vm-provided function
+                # the C VM needs this because the exception manipulation
+                # done by raise_with_op is not implemented in C.
+                gof.vm.raise_with_op(self.fn.nodes[self.fn.position_of_error])
+            else:
+                # old-style linkers raise their own exceptions
+                raise
+        if profile:
+            profile.vm_call_time += time.time() - t0_fn
        # Retrieve the values that were computed
        outputs = [x.data for x in self.output_storage]
@@ -626,20 +638,18 @@ class Function(object):
                if isinstance(value, gof.Container):
                    value = value.storage[0]
                self[i] = value
        #
        # NOTE: This logic needs to be replicated in
        #       scan.
        #       grep for 'PROFILE_CODE'
        #
+        if profile:
            dt_call=time.time()-t0
-        if hasattr(self.maker.mode,'fct_call_time'):
+            profile.fct_callcount += 1
-            self.maker.mode.fct_call_time[self] += dt_call
+            profile.fct_call_time += dt_call
-            self.maker.mode.fct_call[self] += 1
+            if hasattr(self.fn, 'update_profile'):
+                self.fn.update_profile(profile)
-        self.maker.mode.call_time += dt_call
-        self.maker.mode.fn_time += dt_fn
        if self.return_none:
            return None
@@ -687,9 +697,10 @@ def _pickle_Function(f):
                if (i < j) and isinstance(d_i, numpy.ndarray) and isinstance(d_j, numpy.ndarray):
                    if numpy.may_share_memory(d_i, d_j):
                        if f.pickle_aliased_memory_strategy == 'warn':
-                            _logger.warning('aliased relationship between Function arguments '
+                            _logger.warning(('aliased relationship between'
-                                    'will not be preserved by un-pickling operation')
+                                    ' Function arguments %s, %s'
-                            #_logger.debug(str([d_i, d_j, id(d_i), id(d_j)]))
+                                    ' will not be preserved by un-pickling'
+                                    ' operation') %(str(d_i), str(d_j)))
                        else:
                            raise AliasedMemoryError(d_i, d_j)
@@ -893,7 +904,8 @@ class FunctionMaker(object):
            raise TypeError("Unknown output type: %s (%s)", type(output), output)
    def __init__(self, inputs, outputs,
-            mode = None, accept_inplace = False, function_builder = Function):
+            mode = None, accept_inplace = False, function_builder = Function,
+            profile=None):
        """
        :type inputs: a list of SymbolicInput instances
@@ -908,9 +920,18 @@ class FunctionMaker(object):
        :param accept_inplace: True iff it is acceptable to have inplace operations
                    in the graph from the inputs to the outputs
        """
        mode = mode_module.get_mode(mode)
+        # figure out which profile object to use (if any)
+        # to help with forward-porting ProfileMode,
+        # we allow ProfileMode to provide a ProfileStats object
+        # using this somewhat awkward mechanism.
+        mode_profile = getattr(mode, 'profile', None)
+        if (profile is not None) and (mode_profile is not None):
+            raise TypeError(
+                    'profile passed via both "mode" and "profile" arguments')
+        self.profile = profile = profile or mode_profile
        # Handle the case where inputs and/or outputs is a single Variable (not in a list)
        unpack_single = False
        return_none = False
@@ -951,7 +972,8 @@ class FunctionMaker(object):
            end_optimizer = time.time()
        finally:
            theano.config.compute_test_value = compute_test_value_orig
-        mode.optimizer_time += end_optimizer - start_optimizer
+        if profile:
+            profile.optimizer_time += end_optimizer - start_optimizer
        _logger.debug('Optimizing took %f seconds' % (end_optimizer - start_optimizer))
        #Add deep copy to respect the memory interface
@@ -1031,7 +1053,9 @@ class FunctionMaker(object):
        _fn, _i, _o = self.linker.make_thunk(input_storage = input_storage_lists)
        end_linker = time.time()
        _logger.debug('Linker took %f seconds' % (end_linker - start_linker))
-        self.mode.linker_time += end_linker - start_linker
+        if self.profile:
+            self.profile.linker_time += end_linker - start_linker
+            _fn.time_thunks = profile.flag_time_thunks
        fn = self.function_builder(_fn, _i, _o, self.indices, self.outputs, defaults, self.unpack_single, self.return_none, self)
        return fn
@@ -1077,7 +1101,7 @@ def check_equal(x, y):
 def register_checker(checker):
    __checkers.insert(0, checker)
-def orig_function(inputs, outputs, mode=None, accept_inplace = False, name=None):
+def orig_function(inputs, outputs, mode=None, accept_inplace = False, name=None, profile=None):
    """
    Return a Function that will calculate the outputs from the inputs.
@@ -1105,6 +1129,8 @@ def orig_function(inputs, outputs, mode=None, accept_inplace = False, name=None)
    :param accept_inplace:  True iff the graph can contain inplace operations prior to the
    optimization phase (default is False)
+    :param profile: None or ProfileStats instance
    """
    #Every element of the input list will be upgraded to an `In` instance if necessary,
@@ -1130,8 +1156,16 @@ def orig_function(inputs, outputs, mode=None, accept_inplace = False, name=None)
        if not mode:
            raise ValueError("Please provide at least one mode.")
        elif len(mode) == 1:
-            fn = FunctionMaker(inputs, outputs, mode[0], accept_inplace = accept_inplace).create(defaults)
+            fn = FunctionMaker(
+                    inputs,
+                    outputs,
+                    mode[0],
+                    accept_inplace = accept_inplace,
+                    profile=profile).create(
+                            defaults)
        else:
+            if profile:
+                raise NotImplementedError('profiling not implemented in this kind of mode')
            #return a different kind of function
            def dup_defaults():
                # TODO This may need to be changed to use containers as defaults.
@@ -1153,19 +1187,18 @@ def orig_function(inputs, outputs, mode=None, accept_inplace = False, name=None)
            fn = maker1.create(defaults)
    else:
        Maker = getattr(mode, 'function_maker', FunctionMaker)
-        fn = Maker(inputs, outputs, mode, accept_inplace = accept_inplace).create(defaults)
+        fn = Maker(inputs,
+                outputs,
+                mode,
+                accept_inplace = accept_inplace,
+                profile=profile).create(
+                        defaults)
    t2 = time.time()
-    if hasattr(mode, 'compile_time'):
+    if profile:
-        mode.compile_time+=t2-t1
+        profile.compile_time+=t2-t1
    fn.name = name
-    if hasattr(mode,'fct_call_time'):
-        mode.fct_call_time.setdefault(fn,0)
-    if hasattr(mode,'fct_call'):
-        mode.fct_call.setdefault(fn,0)
    return fn

--- a/theano/compile/profilemode.py
+++ b/theano/compile/profilemode.py
@@ -8,6 +8,8 @@ from theano.configparser import config, AddConfigVar, IntParam, BoolParam
 from theano.compile.function_module import FunctionMaker
 run_cthunk = None # Will be imported only when needed.
+from profiling import ProfileStats
 import_time = time.time()
 AddConfigVar('ProfileMode.n_apply_to_print',
@@ -34,24 +36,53 @@ AddConfigVar('ProfileMode.profile_memory',
 class Profile_Maker(FunctionMaker):
    def create(self, input_storage=None, trustme=False):
        ret = super(Profile_Maker,self).create(input_storage, trustme)
+        # create a function-specific storage container for profiling info
+        profile = ProfileStats(atexit_print=False)
+        self.mode.profile_stats[ret] = profile
+        ret.profile = profile
+        #initialize the timers
        for i, node in enumerate(ret.maker.env.toposort()):
-            self.mode.apply_time[(i,node)]=0.0
+            profile.apply_time[node]=0.0
-            assert len(ret.fn.thunk_groups[i])==1
+            profile.outputs_size[node]=[0.0] * len(node.outputs)
-            self.mode.op_cimpl[node.op] = hasattr(ret.fn.thunk_groups[i][0],'cthunk')
+            # a thunk_group is a list of the thunks from each linker
+            # corresponding to the i'th position in the toposort.
+            assert len(ret.fn.thunk_groups[i])==1
+            profile.apply_cimpl[node] = hasattr(
+                    ret.fn.thunk_groups[i][0],
+                    'cthunk')
+        # Here we replace the linker function.
+        # This ugliness makes WrapLinker (an object that *generates*
+        # functions and is not function-specific)  work with ProfileStats
+        # objects which are function-specific.
+        #capture old fn in closure. This is important since new_fn is about to
+        #take its place as ret.fn.
+        ret_fn = ret.fn
+        def new_fn():
+            self.mode.apply_time = self.mode.profile_stats[ret].apply_time
+            self.mode.outputs_size = self.mode.profile_stats[ret].outputs_size
+            ret_fn()
+            # delete the old apply_time variable
+            # because it doesn't mean the same thing anymore.
+            # This prevents old code from looking like it still works.
+            del self.mode.apply_time
+            del self.mode.outputs_size
+        ret.fn = new_fn
        return ret
 class ProfileMode(Mode):
    def __init__(self, linker=config.linker, optimizer=config.optimizer):
-        apply_time = {}
-        op_cimpl = {}
-        compile_time = 0 #time passed in theano.function()
-        fct_call_time = {}#time passed inside theano fct call including op time.
-        fct_call = {}
        message=""
-        outputs_size={}
+        profile_stats={}
-        self.__setstate__((linker, optimizer, apply_time, op_cimpl,
+        self.__setstate__((linker,
-                           compile_time, fct_call_time, fct_call, message, outputs_size))
+            optimizer,
+            message,
+            profile_stats))
    def function_maker(self, i,o,m, *args, **kwargs):
        """Return an instance of `Profiler_Maker` which init the count"""
@@ -59,28 +90,24 @@ class ProfileMode(Mode):
        assert m is self
        return Profile_Maker(i, o, self, *args, **kwargs)
-    local_time = property(lambda self: [sum(self.apply_time.values())])
+    def __get_local_time(self):
+        rval = 0
+        for ps in self.profile_stats.values():
+            rval += sum(ps.apply_time.values())
+        return rval
+    local_time = property(__get_local_time)
    def __getstate__(self):
        #print "__getstate__",self.provided_linker,self.provided_optimizer
-        return (self.provided_linker, self.provided_optimizer, self.apply_time,
+        return (self.provided_linker,
-                self.op_cimpl, self.compile_time, self.fct_call_time,
+                self.provided_optimizer,
-                self.fct_call, self.message, self.outputs_size)
+                self.message,
+                self.profile_stats)
    def __setstate__(self, state):
-        linker, optimizer, apply_time, op_cimpl, compile_time, \
+        linker, optimizer, message, profile_stats = state
-                fct_call_time, fct_call, message, outputs_size = state
+        self.message = message
-        self.apply_time = apply_time
+        self.profile_stats = profile_stats
-        self.op_cimpl = op_cimpl
-        self.compile_time = compile_time
-        self.fct_call_time = fct_call_time
-        self.fct_call = fct_call
-        self.call_time = 0
-        self.fn_time = 0
-        self.optimizer_time = 0
-        self.linker_time = 0
-        self.message = ""
-        self.outputs_size = outputs_size
        def profile_thunk(i, node, th):
            """ Profile only the execution time
@@ -102,7 +129,7 @@ class ProfileMode(Mode):
                th()
                dt = time.time() - t0
-            apply_time[(i,node)] += dt
+            self.apply_time[node] += max(dt, 1e-14)
        def profile_thunk2(i, node, th):
@@ -149,8 +176,8 @@ class ProfileMode(Mode):
                else:
                    raise Exception("Can't determine the memory size of dtype",o[0].dtype)
                size.append(s)
-            outputs_size[node]=size
+            self.outputs_size[node]=size
-            apply_time[(i,node)] += dt
+            self.apply_time[node] += max(dt, 1e-14)
        self.provided_linker = linker
@@ -182,22 +209,44 @@ class ProfileMode(Mode):
                       Currently there is n_apply_to_print, n_ops_to_print and min_memory_size
                       that are accepted.
        """
+        compile_time = sum([ps.compile_time for ps in self.profile_stats.values()])
+        fct_call = dict([(fn, ps.fct_callcount)
+            for (fn, ps) in self.profile_stats.items()])
+        fct_call_time = dict([(fn, ps.fct_call_time)
+            for (fn, ps) in self.profile_stats.items()])
+        apply_time = {}
+        for fn, ps in self.profile_stats.items():
+            for (i, node) in enumerate(fn.maker.env.toposort()):
+                apply_time[(i, node)] = ps.apply_time[node]
+        for (i,n),t in apply_time.items():
+            if t == 0:
+                print i, n
+        op_cimpl = {}
+        outputs_size = {}
+        for fn, ps in self.profile_stats.items():
+            op_cimpl.update(ps.apply_cimpl)
-        compile_time = self.compile_time
-        fct_call_time = self.fct_call_time
-        fct_call = self.fct_call
-        apply_time = self.apply_time
-        op_cimpl = self.op_cimpl
        message = self.message
-        outputs_size = self.outputs_size
-        other_time = {'linker_time':self.linker_time,
+        outputs_size = {}
-                      'optimizer_time':self.optimizer_time}
+        for fn, ps in self.profile_stats.items():
+            outputs_size.update(ps.outputs_size)
+        other_time = dict(
+                linker_time = sum(
+                    [ps.linker_time for ps in self.profile_stats.values()]),
+                optimizer_time = sum(
+                    [ps.optimizer_time for ps in self.profile_stats.values()]))
        self.print_summary_("print_summary", compile_time, fct_call_time, fct_call,
-                            apply_time, op_cimpl, message, outputs_size, other_time,
+                        apply_time, op_cimpl, message, outputs_size,
+                        self.local_time, other_time,
                        **kwargs)
    def print_diff_summary(self, other, **kwargs):
        """ As print_summary, but print the difference on two different profile mode.
        TODO: Also we don't print the Apply-wise summary as it don't work for now.
@@ -240,7 +289,7 @@ class ProfileMode(Mode):
    @staticmethod
    def print_summary_(fct_name, compile_time, fct_call_time, fct_call,
                       apply_time, op_cimpl, message, outputs_size,
-                       other_time,
+                       local_time, other_time,
                       n_apply_to_print=config.ProfileMode.n_apply_to_print,
                       n_ops_to_print=config.ProfileMode.n_ops_to_print,
                       print_apply=True,
@@ -256,7 +305,6 @@ class ProfileMode(Mode):
                                whose outputs memory size is lower then that.
        """
-        local_time = sum(apply_time.values())
        total_time = time.time() - import_time
        total_fct_time = sum(fct_call_time.values())
        total_fct_call = sum(fct_call.values())
@@ -312,7 +360,7 @@ class ProfileMode(Mode):
            op_time[op]+=t
            nb_call = [v for k,v in fct_call.items() if k.maker.env is a.env][0]
            if t==0:
-                assert nb_call == 0
+                assert nb_call == 0, nb_call
            else:
                op_call[op] += nb_call
                op_apply[op] += 1
@@ -429,8 +477,8 @@ class ProfileMode(Mode):
        else:
            fct_memory={}#env->dict(node->(outputs size))
            var_mem = {}
-            for node,val in outputs_size.items():
+            for node, val in outputs_size.items():
-                fct_memory.setdefault(node.env,{})
+                fct_memory.setdefault(node.env, {})
                fct_memory[node.env][node]=val
                for out,v in zip(node.outputs,val):
                    var_mem[out]=v
@@ -600,7 +648,7 @@ def atexit_print_default_profile_mode():
    config.mode=PROFILE_MODE
    """
    for prof_mode in prof_mode_instance_to_print:
-        if sum(prof_mode.apply_time.values())>0:
+        if prof_mode.local_time>0:
            prof_mode.print_summary()
 #Register atexit_print_default_profile_mode to have the summary of the

--- a/theano/compile/profiling.py
+++ b/theano/compile/profiling.py
@@ -17,11 +17,17 @@ __docformat__ = "restructuredtext en"
 import atexit
 import sys
 import theano
+from theano.configparser import AddConfigVar, StrParam, BoolParam
 config = theano.config
 _atexit_print_list = []
 _atexit_print_file = sys.stderr
+AddConfigVar('profiling.time_thunks',
+             """Time individual thunks when profiling""",
+        BoolParam(True))
 def _atexit_print_fn():
    """Print ProfileStat objects in _atexit_print_list to _atexit_print_file
    """
@@ -88,7 +94,9 @@ class ProfileStats(object):
    linker_time = 0.0
    # time spent linking graph (FunctionMaker.create)
-    def __init__(self, atexit_print=True, **kwargs):
+    # param is called flag_time_thunks because most other attributes with time
+    # in the name are times *of* something, rather than configuration flags.
+    def __init__(self, atexit_print=True, flag_time_thunks=None, **kwargs):
        """
        atexit_print - bool. True means that this object will be printed to
                       stderr (using .summary()) at the end of the program.
@@ -99,6 +107,11 @@ class ProfileStats(object):
        self.output_size = {}
        self.apply_time = {}
        self.apply_cimpl = {}
+        self.outputs_size = {}
+        if flag_time_thunks is None:
+            self.flag_time_thunks = config.profiling.time_thunks
+        else:
+            self.flag_time_thunks = flag_time_thunks
        self.__dict__.update(kwargs)
        #print >> sys.stderr, "self.message", self.message
        if atexit_print: