提交 ca79f02e authored 作者: Olivier Delalleau's avatar Olivier Delalleau

Merged -- no conflict

...@@ -35,3 +35,5 @@ theano/version.py ...@@ -35,3 +35,5 @@ theano/version.py
theano/version.py.out theano/version.py.out
distribute-*.egg distribute-*.egg
distribute-*.tar.gz distribute-*.tar.gz
out1
out2
...@@ -11,8 +11,6 @@ How should you write your algorithm to make the most of what Theano can do? ...@@ -11,8 +11,6 @@ How should you write your algorithm to make the most of what Theano can do?
Limitations Limitations
----------- -----------
- Conditional control flow is possible but currently not efficient. The current implementation will evaluate both sides of an ``if`` construct (see :func:`tensor.switch`).
- While- or for-Loops within an expression graph are supported, but only via - While- or for-Loops within an expression graph are supported, but only via
the :func:`theano.scan` op (which puts restrictions on how the loop body can the :func:`theano.scan` op (which puts restrictions on how the loop body can
interact with the rest of the graph). interact with the rest of the graph).
......
"""Provides `DebugMode`, an evaluation mode for debugging theano internals.""" """Provides `DebugMode`, an evaluation mode for debugging theano internals.
:TODO: add support for IfElse Op, LazyLinker, PureOp, etc.
"""
__docformat__ = "restructuredtext en" __docformat__ = "restructuredtext en"
import time, copy, sys, copy_reg, gc, os import time, copy, sys, copy_reg, gc, os
...@@ -1552,7 +1556,8 @@ class _Maker(FunctionMaker): #inheritance buys a few helper functions ...@@ -1552,7 +1556,8 @@ class _Maker(FunctionMaker): #inheritance buys a few helper functions
def __init__(self, inputs, outputs, optimizer, mode, def __init__(self, inputs, outputs, optimizer, mode,
accept_inplace = False, accept_inplace = False,
function_builder = Function): function_builder = Function,
profile=None):
""" """
:type inputs: a list of SymbolicInput instances :type inputs: a list of SymbolicInput instances
...@@ -1567,7 +1572,7 @@ class _Maker(FunctionMaker): #inheritance buys a few helper functions ...@@ -1567,7 +1572,7 @@ class _Maker(FunctionMaker): #inheritance buys a few helper functions
:note: this function sets TensorType.filter_checks_isfinite when `mode.check_isfinite` is True :note: this function sets TensorType.filter_checks_isfinite when `mode.check_isfinite` is True
""" """
self.profile = profile
# Handle the case where inputs and/or outputs is a single Variable (not in a list) # Handle the case where inputs and/or outputs is a single Variable (not in a list)
unpack_single = False unpack_single = False
return_none = False return_none = False
......
...@@ -7,12 +7,13 @@ _logger = logging.getLogger('theano.compile.function') ...@@ -7,12 +7,13 @@ _logger = logging.getLogger('theano.compile.function')
from io import In from io import In
from function_module import orig_function from function_module import orig_function
from profiling import ProfileStats
from pfunc import pfunc from pfunc import pfunc
from numpy import any #for to work in python 2.4 from numpy import any #for to work in python 2.4
def function(inputs, outputs=None, mode=None, updates=[], givens=[], def function(inputs, outputs=None, mode=None, updates=[], givens=[],
no_default_updates=False, accept_inplace=False, name=None, no_default_updates=False, accept_inplace=False, name=None,
rebuild_strict=True, allow_input_downcast=None): rebuild_strict=True, allow_input_downcast=None, profile=None):
""" """
Return a callable object that will calculate `outputs` from `inputs`. Return a callable object that will calculate `outputs` from `inputs`.
...@@ -62,6 +63,11 @@ def function(inputs, outputs=None, mode=None, updates=[], givens=[], ...@@ -62,6 +63,11 @@ def function(inputs, outputs=None, mode=None, updates=[], givens=[],
precise, type. None (default) is almost like False, but allows precise, type. None (default) is almost like False, but allows
downcasting of Python float scalars to floatX. downcasting of Python float scalars to floatX.
:type profile: None, True, or ProfileStats instance
:param profile: accumulate profiling information into a given ProfileStats
instance. If argument is `True` then a new ProfileStats instance will be
used. This profiling object will be available via self.profile.
:note: Regarding givens: Be careful to make sure that these substitutions are :note: Regarding givens: Be careful to make sure that these substitutions are
independent--behaviour when Var1 of one pair appears in the graph leading to Var2 in independent--behaviour when Var1 of one pair appears in the graph leading to Var2 in
another expression is undefined. Replacements specified with givens are different from another expression is undefined. Replacements specified with givens are different from
...@@ -88,6 +94,8 @@ def function(inputs, outputs=None, mode=None, updates=[], givens=[], ...@@ -88,6 +94,8 @@ def function(inputs, outputs=None, mode=None, updates=[], givens=[],
if uses_In or uses_tuple: if uses_In or uses_tuple:
# we must use old semantics in this case. # we must use old semantics in this case.
if profile:
raise NotImplementedError('profiling not supported in old-style function')
if uses_updates or uses_givens: if uses_updates or uses_givens:
raise NotImplementedError("In() instances and tuple inputs triggers the old semantics, which disallow using updates and givens") raise NotImplementedError("In() instances and tuple inputs triggers the old semantics, which disallow using updates and givens")
fn = orig_function(inputs, outputs, fn = orig_function(inputs, outputs,
...@@ -102,7 +110,8 @@ def function(inputs, outputs=None, mode=None, updates=[], givens=[], ...@@ -102,7 +110,8 @@ def function(inputs, outputs=None, mode=None, updates=[], givens=[],
no_default_updates=no_default_updates, no_default_updates=no_default_updates,
accept_inplace=accept_inplace,name=name, accept_inplace=accept_inplace,name=name,
rebuild_strict=rebuild_strict, rebuild_strict=rebuild_strict,
allow_input_downcast=allow_input_downcast) allow_input_downcast=allow_input_downcast,
profile=profile)
# We need to add the flag check_aliased inputs if we have any mutable or # We need to add the flag check_aliased inputs if we have any mutable or
# borrowed used defined inputs # borrowed used defined inputs
fn._check_for_aliased_inputs = check_for_aliased_inputs fn._check_for_aliased_inputs = check_for_aliased_inputs
......
...@@ -5,6 +5,7 @@ __docformat__ = "restructuredtext en" ...@@ -5,6 +5,7 @@ __docformat__ = "restructuredtext en"
import copy import copy
import copy_reg import copy_reg
import cPickle
import itertools import itertools
import time import time
...@@ -15,7 +16,7 @@ from theano import gof ...@@ -15,7 +16,7 @@ from theano import gof
from theano.gof.python25 import partial from theano.gof.python25 import partial
import mode as mode_module import mode as mode_module
from io import In, SymbolicInput, SymbolicInputKit, SymbolicOutput from io import In, SymbolicInput, SymbolicInputKit, SymbolicOutput
from theano.configdefaults import config
import logging import logging
_logger = logging.getLogger('theano.compile.function_module') _logger = logging.getLogger('theano.compile.function_module')
...@@ -331,6 +332,7 @@ class Function(object): ...@@ -331,6 +332,7 @@ class Function(object):
self.unpack_single = unpack_single self.unpack_single = unpack_single
self.return_none = return_none self.return_none = return_none
self.maker = maker self.maker = maker
self.profile = None # reassigned in FunctionMaker.create
# We will be popping stuff off this `containers` object. It is a copy. # We will be popping stuff off this `containers` object. It is a copy.
containers = list(self.input_storage) containers = list(self.input_storage)
...@@ -495,6 +497,7 @@ class Function(object): ...@@ -495,6 +497,7 @@ class Function(object):
return cpy return cpy
def __call__(self, *args, **kwargs): def __call__(self, *args, **kwargs):
profile = self.profile
t0 = time.time() t0 = time.time()
# Reinitialize each container's 'provided' counter # Reinitialize each container's 'provided' counter
...@@ -536,8 +539,7 @@ class Function(object): ...@@ -536,8 +539,7 @@ class Function(object):
for k, arg in kwargs.iteritems(): for k, arg in kwargs.iteritems():
self[k] = arg self[k] = arg
if (not hasattr(self, '_check_for_aliased_inputs') or
if ( not hasattr(self, '_check_for_aliased_inputs') or
self._check_for_aliased_inputs): self._check_for_aliased_inputs):
## Collect aliased inputs among the storage space ## Collect aliased inputs among the storage space
args_share_memory = [] args_share_memory = []
...@@ -592,9 +594,21 @@ class Function(object): ...@@ -592,9 +594,21 @@ class Function(object):
self.inv_finder[c])) self.inv_finder[c]))
# Do the actual work # Do the actual work
if profile:
t0_fn = time.time() t0_fn = time.time()
try:
self.fn() self.fn()
dt_fn = time.time() - t0_fn except:
if hasattr(self.fn, 'position_of_error'):
# this is a new vm-provided function
# the C VM needs this because the exception manipulation
# done by raise_with_op is not implemented in C.
gof.vm.raise_with_op(self.fn.nodes[self.fn.position_of_error])
else:
# old-style linkers raise their own exceptions
raise
if profile:
profile.vm_call_time += time.time() - t0_fn
# Retrieve the values that were computed # Retrieve the values that were computed
outputs = [x.data for x in self.output_storage] outputs = [x.data for x in self.output_storage]
...@@ -626,20 +640,18 @@ class Function(object): ...@@ -626,20 +640,18 @@ class Function(object):
if isinstance(value, gof.Container): if isinstance(value, gof.Container):
value = value.storage[0] value = value.storage[0]
self[i] = value self[i] = value
# #
# NOTE: This logic needs to be replicated in # NOTE: This logic needs to be replicated in
# scan. # scan.
# grep for 'PROFILE_CODE' # grep for 'PROFILE_CODE'
# #
if profile:
dt_call=time.time()-t0 dt_call=time.time()-t0
if hasattr(self.maker.mode,'fct_call_time'): profile.fct_callcount += 1
self.maker.mode.fct_call_time[self] += dt_call profile.fct_call_time += dt_call
self.maker.mode.fct_call[self] += 1 if hasattr(self.fn, 'update_profile'):
self.fn.update_profile(profile)
self.maker.mode.call_time += dt_call
self.maker.mode.fn_time += dt_fn
if self.return_none: if self.return_none:
return None return None
...@@ -687,9 +699,10 @@ def _pickle_Function(f): ...@@ -687,9 +699,10 @@ def _pickle_Function(f):
if (i < j) and isinstance(d_i, numpy.ndarray) and isinstance(d_j, numpy.ndarray): if (i < j) and isinstance(d_i, numpy.ndarray) and isinstance(d_j, numpy.ndarray):
if numpy.may_share_memory(d_i, d_j): if numpy.may_share_memory(d_i, d_j):
if f.pickle_aliased_memory_strategy == 'warn': if f.pickle_aliased_memory_strategy == 'warn':
_logger.warning('aliased relationship between Function arguments ' _logger.warning(('aliased relationship between'
'will not be preserved by un-pickling operation') ' Function arguments %s, %s'
#_logger.debug(str([d_i, d_j, id(d_i), id(d_j)])) ' will not be preserved by un-pickling'
' operation') %(str(d_i), str(d_j)))
else: else:
raise AliasedMemoryError(d_i, d_j) raise AliasedMemoryError(d_i, d_j)
...@@ -893,7 +906,8 @@ class FunctionMaker(object): ...@@ -893,7 +906,8 @@ class FunctionMaker(object):
raise TypeError("Unknown output type: %s (%s)", type(output), output) raise TypeError("Unknown output type: %s (%s)", type(output), output)
def __init__(self, inputs, outputs, def __init__(self, inputs, outputs,
mode = None, accept_inplace = False, function_builder = Function): mode = None, accept_inplace = False, function_builder = Function,
profile=None):
""" """
:type inputs: a list of SymbolicInput instances :type inputs: a list of SymbolicInput instances
...@@ -908,10 +922,20 @@ class FunctionMaker(object): ...@@ -908,10 +922,20 @@ class FunctionMaker(object):
:param accept_inplace: True iff it is acceptable to have inplace operations :param accept_inplace: True iff it is acceptable to have inplace operations
in the graph from the inputs to the outputs in the graph from the inputs to the outputs
""" """
mode = mode_module.get_mode(mode) mode = mode_module.get_mode(mode)
# figure out which profile object to use (if any)
# to help with forward-porting ProfileMode,
# we allow ProfileMode to provide a ProfileStats object
# using this somewhat awkward mechanism.
mode_profile = getattr(mode, 'profile', None)
if (profile is not None) and (mode_profile is not None):
raise TypeError(
'profile passed via both "mode" and "profile" arguments')
self.profile = profile = profile or mode_profile
# Handle the case where inputs and/or outputs is a single Variable (not in a list) # Handle the case where inputs and/or outputs is a single Variable (not in a list)
self.orig_outputs = outputs
unpack_single = False unpack_single = False
return_none = False return_none = False
if outputs is None: if outputs is None:
...@@ -951,7 +975,8 @@ class FunctionMaker(object): ...@@ -951,7 +975,8 @@ class FunctionMaker(object):
end_optimizer = time.time() end_optimizer = time.time()
finally: finally:
theano.config.compute_test_value = compute_test_value_orig theano.config.compute_test_value = compute_test_value_orig
mode.optimizer_time += end_optimizer - start_optimizer if profile:
profile.optimizer_time += end_optimizer - start_optimizer
_logger.debug('Optimizing took %f seconds' % (end_optimizer - start_optimizer)) _logger.debug('Optimizing took %f seconds' % (end_optimizer - start_optimizer))
#Add deep copy to respect the memory interface #Add deep copy to respect the memory interface
...@@ -1031,36 +1056,39 @@ class FunctionMaker(object): ...@@ -1031,36 +1056,39 @@ class FunctionMaker(object):
_fn, _i, _o = self.linker.make_thunk(input_storage = input_storage_lists) _fn, _i, _o = self.linker.make_thunk(input_storage = input_storage_lists)
end_linker = time.time() end_linker = time.time()
_logger.debug('Linker took %f seconds' % (end_linker - start_linker)) _logger.debug('Linker took %f seconds' % (end_linker - start_linker))
self.mode.linker_time += end_linker - start_linker if self.profile:
self.profile.linker_time += end_linker - start_linker
_fn.time_thunks = profile.flag_time_thunks
fn = self.function_builder(_fn, _i, _o, self.indices, self.outputs, defaults, self.unpack_single, self.return_none, self) fn = self.function_builder(_fn, _i, _o, self.indices, self.outputs, defaults, self.unpack_single, self.return_none, self)
return fn return fn
def _pickle_FunctionMaker(self):
kwargs = dict(
inputs = self.inputs,
outputs = self.orig_outputs,
mode = self.mode,
accept_inplace = self.accept_inplace,
function_builder = self.function_builder,
profile = self.profile,
)
return (_constructor_FunctionMaker, (kwargs,))
def _pickle_FunctionMaker(fm): def _constructor_FunctionMaker(kwargs):
if fm.return_none: return FunctionMaker(**kwargs)
outputs = None
else:
if fm.unpack_single:
outputs = fm.outputs[0]
else:
outputs = fm.outputs
#backport
#outputs = None if fm.return_none else (fm.outputs[0] if fm.unpack_single else fm.outputs)
rval = (_constructor_FunctionMaker, (fm.inputs, outputs, fm.mode, fm.accept_inplace))
return rval
def _constructor_FunctionMaker(*args):
return FunctionMaker(*args)
copy_reg.pickle(FunctionMaker, _pickle_FunctionMaker) copy_reg.pickle(FunctionMaker, _pickle_FunctionMaker)
def _pickle_slice(s):
return (slice, (s.start, s.stop, s.step))
copy_reg.pickle(slice, _pickle_slice)
try:
# Someone wrote this at one point, and I'm guessing it's because the default
# pickling mechanism doesn't work... so I'm adding a try/except around it.
# This way if the default implementation works we can just use it.
cPickle.dumps(slice(0, 10, 100))
except:
def _pickle_slice(s):
return (slice, (s.start, s.stop, s.step))
copy_reg.pickle(slice, _pickle_slice)
__checkers = [] __checkers = []
...@@ -1077,7 +1105,7 @@ def check_equal(x, y): ...@@ -1077,7 +1105,7 @@ def check_equal(x, y):
def register_checker(checker): def register_checker(checker):
__checkers.insert(0, checker) __checkers.insert(0, checker)
def orig_function(inputs, outputs, mode=None, accept_inplace = False, name=None): def orig_function(inputs, outputs, mode=None, accept_inplace = False, name=None, profile=None):
""" """
Return a Function that will calculate the outputs from the inputs. Return a Function that will calculate the outputs from the inputs.
...@@ -1105,6 +1133,8 @@ def orig_function(inputs, outputs, mode=None, accept_inplace = False, name=None) ...@@ -1105,6 +1133,8 @@ def orig_function(inputs, outputs, mode=None, accept_inplace = False, name=None)
:param accept_inplace: True iff the graph can contain inplace operations prior to the :param accept_inplace: True iff the graph can contain inplace operations prior to the
optimization phase (default is False) optimization phase (default is False)
:param profile: None or ProfileStats instance
""" """
#Every element of the input list will be upgraded to an `In` instance if necessary, #Every element of the input list will be upgraded to an `In` instance if necessary,
...@@ -1130,8 +1160,16 @@ def orig_function(inputs, outputs, mode=None, accept_inplace = False, name=None) ...@@ -1130,8 +1160,16 @@ def orig_function(inputs, outputs, mode=None, accept_inplace = False, name=None)
if not mode: if not mode:
raise ValueError("Please provide at least one mode.") raise ValueError("Please provide at least one mode.")
elif len(mode) == 1: elif len(mode) == 1:
fn = FunctionMaker(inputs, outputs, mode[0], accept_inplace = accept_inplace).create(defaults) fn = FunctionMaker(
inputs,
outputs,
mode[0],
accept_inplace = accept_inplace,
profile=profile).create(
defaults)
else: else:
if profile:
raise NotImplementedError('profiling not implemented in this kind of mode')
#return a different kind of function #return a different kind of function
def dup_defaults(): def dup_defaults():
# TODO This may need to be changed to use containers as defaults. # TODO This may need to be changed to use containers as defaults.
...@@ -1153,19 +1191,18 @@ def orig_function(inputs, outputs, mode=None, accept_inplace = False, name=None) ...@@ -1153,19 +1191,18 @@ def orig_function(inputs, outputs, mode=None, accept_inplace = False, name=None)
fn = maker1.create(defaults) fn = maker1.create(defaults)
else: else:
Maker = getattr(mode, 'function_maker', FunctionMaker) Maker = getattr(mode, 'function_maker', FunctionMaker)
fn = Maker(inputs, outputs, mode, accept_inplace = accept_inplace).create(defaults) fn = Maker(inputs,
outputs,
mode,
accept_inplace = accept_inplace,
profile=profile).create(
defaults)
t2 = time.time() t2 = time.time()
if hasattr(mode, 'compile_time'): if profile:
mode.compile_time+=t2-t1 profile.compile_time+=t2-t1
fn.name = name fn.name = name
if hasattr(mode,'fct_call_time'):
mode.fct_call_time.setdefault(fn,0)
if hasattr(mode,'fct_call'):
mode.fct_call.setdefault(fn,0)
return fn return fn
......
...@@ -4,7 +4,9 @@ import os, logging ...@@ -4,7 +4,9 @@ import os, logging
import numpy, theano import numpy, theano
from theano import gof from theano import gof
from theano.configparser import config, AddConfigVar, StrParam import theano.gof.vm
from theano.configparser import config, AddConfigVar, StrParam, EnumStr
_logger = logging.getLogger('theano.compile.mode') _logger = logging.getLogger('theano.compile.mode')
...@@ -55,7 +57,11 @@ predefined_linkers = { ...@@ -55,7 +57,11 @@ predefined_linkers = {
'c' : gof.CLinker(), 'c' : gof.CLinker(),
'c|py' : gof.OpWiseCLinker(allow_gc=True), 'c|py' : gof.OpWiseCLinker(allow_gc=True),
'c|py_nogc' : gof.OpWiseCLinker(allow_gc=False), 'c|py_nogc' : gof.OpWiseCLinker(allow_gc=False),
'c&py' : gof.DualLinker(checker = check_equal) 'c&py' : gof.DualLinker(checker = check_equal),
'vm' : gof.vm.VM_Linker(allow_gc=True, use_cloop=False),
'cvm' : gof.vm.VM_Linker(allow_gc=True, use_cloop=True),
'vm_nogc' : gof.vm.VM_Linker(allow_gc=False, use_cloop=False),
'cvm_nogc': gof.vm.VM_Linker(allow_gc=False, use_cloop=True),
} }
...@@ -249,6 +255,7 @@ class Mode(object): ...@@ -249,6 +255,7 @@ class Mode(object):
self._optimizer = optimizer self._optimizer = optimizer
self.call_time = 0 self.call_time = 0
self.fn_time = 0 self.fn_time = 0
linker.mode = self #TODO: WHY IS THIS HERE?
self.optimizer_time = 0 self.optimizer_time = 0
self.linker_time = 0 self.linker_time = 0
...@@ -290,15 +297,27 @@ class Mode(object): ...@@ -290,15 +297,27 @@ class Mode(object):
FAST_COMPILE = Mode('py', 'fast_compile') FAST_COMPILE = Mode('py', 'fast_compile')
FAST_RUN = Mode('c|py', 'fast_run') FAST_RUN = Mode('c|py', 'fast_run')
FAST_RUN_NOGC = Mode("c|py_nogc", 'fast_run') FAST_RUN_NOGC = Mode("c|py_nogc", 'fast_run')
SANITY_CHECK = [Mode('c|py', None),
Mode('c|py', 'fast_run')]
STABILIZE = Mode("c|py", OPT_STABILIZE) STABILIZE = Mode("c|py", OPT_STABILIZE)
predefined_modes = {'FAST_COMPILE': FAST_COMPILE, predefined_modes = {'FAST_COMPILE': FAST_COMPILE,
'FAST_RUN': FAST_RUN, 'FAST_RUN': FAST_RUN,
'FAST_RUN_NOGC':FAST_RUN_NOGC, 'FAST_RUN_NOGC':FAST_RUN_NOGC,
'SANITY_CHECK': SANITY_CHECK, 'STABILIZE': STABILIZE,
'STABILIZE': STABILIZE} 'VM':Mode('vm', 'fast_run'),
'VM_NOGC':Mode('vm_nogc', 'fast_run'),
'CVM':Mode('cvm', 'fast_run'),
'CVM_NOGC':Mode('cvm_nogc', 'fast_run'),
}
#Don't add FAST_RUN_NOGC to this list(as well as other ALL CAPS short cut)
#The way to get FAST_RUN_NOGC is with the flag 'linker=c|py_nogc'
#The old all capital letter way of working is deprecated as it is not scalable.
AddConfigVar('mode',
"Default compilation mode",
EnumStr(*(predefined_modes.keys() + [
'Mode','DEBUG_MODE', 'PROFILE_MODE'])),
in_c_key=False)
instanciated_default_mode=None instanciated_default_mode=None
def get_mode(orig_string): def get_mode(orig_string):
...@@ -329,7 +348,7 @@ def get_mode(orig_string): ...@@ -329,7 +348,7 @@ def get_mode(orig_string):
ret = DebugMode(optimizer=config.optimizer) ret = DebugMode(optimizer=config.optimizer)
else: else:
# The import is needed in case string is ProfileMode # The import is needed in case string is ProfileMode
from profilemode import ProfileMode from profilemode import ProfileMode,prof_mode_instance_to_print
ret = eval(string+'(linker=config.linker, optimizer=config.optimizer)') ret = eval(string+'(linker=config.linker, optimizer=config.optimizer)')
elif predefined_modes.has_key(string): elif predefined_modes.has_key(string):
ret = predefined_modes[string] ret = predefined_modes[string]
...@@ -349,7 +368,6 @@ def get_mode(orig_string): ...@@ -349,7 +368,6 @@ def get_mode(orig_string):
#must tell python to print the summary at the end. #must tell python to print the summary at the end.
if string == 'ProfileMode': if string == 'ProfileMode':
#need to import later to break circular dependency. #need to import later to break circular dependency.
from profilemode import prof_mode_instance_to_print
prof_mode_instance_to_print.append(ret) prof_mode_instance_to_print.append(ret)
return ret return ret
...@@ -365,3 +383,4 @@ def register_mode(name, mode): ...@@ -365,3 +383,4 @@ def register_mode(name, mode):
if name in predefined_modes: if name in predefined_modes:
raise ValueError('Mode name already taken: %s' % name) raise ValueError('Mode name already taken: %s' % name)
predefined_modes[name] = mode predefined_modes[name] = mode
"""Provide a simple user friendly API """ """Provide a simple user friendly API """
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import numpy # for backport to 2.4, to get any().
from profiling import ProfileStats
from theano.gof import Container, Variable, generic, graph, Constant, Value from theano.gof import Container, Variable, generic, graph, Constant, Value
from theano.compile import orig_function, In, Out from theano.compile import orig_function, In, Out
from theano.compile.sharedvalue import SharedVariable, shared from theano.compile.sharedvalue import SharedVariable, shared
import numpy # for backport to 2.4, to get any(). from theano import config
def rebuild_collect_shared( outputs def rebuild_collect_shared( outputs
, inputs = None , inputs = None
...@@ -292,7 +295,8 @@ class Param(object): ...@@ -292,7 +295,8 @@ class Param(object):
def pfunc(params, outputs=None, mode=None, updates=[], givens=[], def pfunc(params, outputs=None, mode=None, updates=[], givens=[],
no_default_updates=False, accept_inplace=False, name=None, no_default_updates=False, accept_inplace=False, name=None,
rebuild_strict=True, allow_input_downcast=None): rebuild_strict=True, allow_input_downcast=None,
profile=None):
"""Function-constructor for graphs with shared variables. """Function-constructor for graphs with shared variables.
:type params: list of either Variable or Param instances. :type params: list of either Variable or Param instances.
...@@ -319,11 +323,9 @@ def pfunc(params, outputs=None, mode=None, updates=[], givens=[], ...@@ -319,11 +323,9 @@ def pfunc(params, outputs=None, mode=None, updates=[], givens=[],
If False (default), perform them all. Else, perform automatic updates on all Variables If False (default), perform them all. Else, perform automatic updates on all Variables
that are neither in "updates" nor in "no_default_updates". that are neither in "updates" nor in "no_default_updates".
:param name: an optional name for this fct. If used, the profile mode will print the time spent in this fct. :type name: None or string
:param name: attaches a name to the Profiling result of this function when
:rtype: theano.compile.Function using ProfileMode (will be deprecated).
:returns: a callable object that will compute the outputs (given the inputs)
and update the implicit function arguments according to the `updates`.
:type allow_input_downcast: Boolean :type allow_input_downcast: Boolean
:param allow_input_downcast: True means that the values passed as :param allow_input_downcast: True means that the values passed as
...@@ -333,6 +335,21 @@ def pfunc(params, outputs=None, mode=None, updates=[], givens=[], ...@@ -333,6 +335,21 @@ def pfunc(params, outputs=None, mode=None, updates=[], givens=[],
precise, type. None (default) is almost like False, but allows precise, type. None (default) is almost like False, but allows
downcasting of Python float scalars to floatX. downcasting of Python float scalars to floatX.
:type profile: None, True, str, or ProfileStats instance
:param profile: accumulate profiling information into a given ProfileStats
instance. None is the default, and means to use the value of
config.profile.
If argument is `True` then a new ProfileStats instance will be
used. If argument is a string, a new ProfileStats instance will be created
with that string as its `message` attribute. This profiling object will be
available via self.profile.
:rtype: theano.compile.Function
:returns: a callable object that will compute the outputs (given the inputs)
and update the implicit function arguments according to the `updates`.
:note: Regarding givens: Be careful to make sure that these substitutions are :note: Regarding givens: Be careful to make sure that these substitutions are
independent--behaviour when Var1 of one pair appears in the graph leading to Var2 in independent--behaviour when Var1 of one pair appears in the graph leading to Var2 in
another expression is undefined. Replacements specified with givens are different from another expression is undefined. Replacements specified with givens are different from
...@@ -354,6 +371,17 @@ def pfunc(params, outputs=None, mode=None, updates=[], givens=[], ...@@ -354,6 +371,17 @@ def pfunc(params, outputs=None, mode=None, updates=[], givens=[],
# Then it clones the outputs and the update expressions. This rebuilds a computation graph # Then it clones the outputs and the update expressions. This rebuilds a computation graph
# from the inputs and the givens. # from the inputs and the givens.
# #
if profile is None:
profile = config.profile
# profile -> True or False
if profile == True:
profile = ProfileStats(message=name)
# profile -> object
if type(profile) == str:
profile = ProfileStats(message=profile)
# profile is typically either False or an object at this point.
# No need to block other objects being passed through though. It might be
# useful.
if not isinstance(params,(list,tuple)): if not isinstance(params,(list,tuple)):
raise Exception("in pfunc() the first argument must be a list or a tuple") raise Exception("in pfunc() the first argument must be a list or a tuple")
...@@ -393,7 +421,7 @@ def pfunc(params, outputs=None, mode=None, updates=[], givens=[], ...@@ -393,7 +421,7 @@ def pfunc(params, outputs=None, mode=None, updates=[], givens=[],
inputs.append(si) inputs.append(si)
return orig_function(inputs, cloned_outputs, mode, return orig_function(inputs, cloned_outputs, mode,
accept_inplace=accept_inplace, name=name) accept_inplace=accept_inplace, name=name, profile=profile)
def _pfunc_param_to_in(param, strict=False, allow_downcast=None): def _pfunc_param_to_in(param, strict=False, allow_downcast=None):
......
...@@ -8,6 +8,8 @@ from theano.configparser import config, AddConfigVar, IntParam, BoolParam ...@@ -8,6 +8,8 @@ from theano.configparser import config, AddConfigVar, IntParam, BoolParam
from theano.compile.function_module import FunctionMaker from theano.compile.function_module import FunctionMaker
run_cthunk = None # Will be imported only when needed. run_cthunk = None # Will be imported only when needed.
from profiling import ProfileStats
import_time = time.time() import_time = time.time()
AddConfigVar('ProfileMode.n_apply_to_print', AddConfigVar('ProfileMode.n_apply_to_print',
...@@ -34,24 +36,53 @@ AddConfigVar('ProfileMode.profile_memory', ...@@ -34,24 +36,53 @@ AddConfigVar('ProfileMode.profile_memory',
class Profile_Maker(FunctionMaker): class Profile_Maker(FunctionMaker):
def create(self, input_storage=None, trustme=False): def create(self, input_storage=None, trustme=False):
ret = super(Profile_Maker,self).create(input_storage, trustme) ret = super(Profile_Maker,self).create(input_storage, trustme)
# create a function-specific storage container for profiling info
profile = ProfileStats(atexit_print=False)
self.mode.profile_stats[ret] = profile
ret.profile = profile
#initialize the timers
for i, node in enumerate(ret.maker.env.toposort()): for i, node in enumerate(ret.maker.env.toposort()):
self.mode.apply_time[(i,node)]=0.0 profile.apply_time[node]=0.0
assert len(ret.fn.thunk_groups[i])==1 profile.outputs_size[node]=[0.0] * len(node.outputs)
self.mode.op_cimpl[node.op] = hasattr(ret.fn.thunk_groups[i][0],'cthunk')
# a thunk_group is a list of the thunks from each linker
# corresponding to the i'th position in the toposort.
assert len(ret.fn.thunk_groups[i])==1
profile.apply_cimpl[node] = hasattr(
ret.fn.thunk_groups[i][0],
'cthunk')
# Here we replace the linker function.
# This ugliness makes WrapLinker (an object that *generates*
# functions and is not function-specific) work with ProfileStats
# objects which are function-specific.
#capture old fn in closure. This is important since new_fn is about to
#take its place as ret.fn.
ret_fn = ret.fn
def new_fn():
self.mode.apply_time = self.mode.profile_stats[ret].apply_time
self.mode.outputs_size = self.mode.profile_stats[ret].outputs_size
ret_fn()
# delete the old apply_time variable
# because it doesn't mean the same thing anymore.
# This prevents old code from looking like it still works.
del self.mode.apply_time
del self.mode.outputs_size
ret.fn = new_fn
return ret return ret
class ProfileMode(Mode): class ProfileMode(Mode):
def __init__(self, linker=config.linker, optimizer=config.optimizer): def __init__(self, linker=config.linker, optimizer=config.optimizer):
apply_time = {}
op_cimpl = {}
compile_time = 0 #time passed in theano.function()
fct_call_time = {}#time passed inside theano fct call including op time.
fct_call = {}
message="" message=""
outputs_size={} profile_stats={}
self.__setstate__((linker, optimizer, apply_time, op_cimpl, self.__setstate__((linker,
compile_time, fct_call_time, fct_call, message, outputs_size)) optimizer,
message,
profile_stats))
def function_maker(self, i,o,m, *args, **kwargs): def function_maker(self, i,o,m, *args, **kwargs):
"""Return an instance of `Profiler_Maker` which init the count""" """Return an instance of `Profiler_Maker` which init the count"""
...@@ -59,28 +90,24 @@ class ProfileMode(Mode): ...@@ -59,28 +90,24 @@ class ProfileMode(Mode):
assert m is self assert m is self
return Profile_Maker(i, o, self, *args, **kwargs) return Profile_Maker(i, o, self, *args, **kwargs)
local_time = property(lambda self: [sum(self.apply_time.values())]) def __get_local_time(self):
rval = 0
for ps in self.profile_stats.values():
rval += sum(ps.apply_time.values())
return rval
local_time = property(__get_local_time)
def __getstate__(self): def __getstate__(self):
#print "__getstate__",self.provided_linker,self.provided_optimizer #print "__getstate__",self.provided_linker,self.provided_optimizer
return (self.provided_linker, self.provided_optimizer, self.apply_time, return (self.provided_linker,
self.op_cimpl, self.compile_time, self.fct_call_time, self.provided_optimizer,
self.fct_call, self.message, self.outputs_size) self.message,
self.profile_stats)
def __setstate__(self, state): def __setstate__(self, state):
linker, optimizer, apply_time, op_cimpl, compile_time, \ linker, optimizer, message, profile_stats = state
fct_call_time, fct_call, message, outputs_size = state self.message = message
self.apply_time = apply_time self.profile_stats = profile_stats
self.op_cimpl = op_cimpl
self.compile_time = compile_time
self.fct_call_time = fct_call_time
self.fct_call = fct_call
self.call_time = 0
self.fn_time = 0
self.optimizer_time = 0
self.linker_time = 0
self.message = ""
self.outputs_size = outputs_size
def profile_thunk(i, node, th): def profile_thunk(i, node, th):
""" Profile only the execution time """ Profile only the execution time
...@@ -102,7 +129,7 @@ class ProfileMode(Mode): ...@@ -102,7 +129,7 @@ class ProfileMode(Mode):
th() th()
dt = time.time() - t0 dt = time.time() - t0
apply_time[(i,node)] += dt self.apply_time[node] += max(dt, 1e-14)
def profile_thunk2(i, node, th): def profile_thunk2(i, node, th):
...@@ -149,8 +176,8 @@ class ProfileMode(Mode): ...@@ -149,8 +176,8 @@ class ProfileMode(Mode):
else: else:
raise Exception("Can't determine the memory size of dtype",o[0].dtype) raise Exception("Can't determine the memory size of dtype",o[0].dtype)
size.append(s) size.append(s)
outputs_size[node]=size self.outputs_size[node]=size
apply_time[(i,node)] += dt self.apply_time[node] += max(dt, 1e-14)
self.provided_linker = linker self.provided_linker = linker
...@@ -182,22 +209,44 @@ class ProfileMode(Mode): ...@@ -182,22 +209,44 @@ class ProfileMode(Mode):
Currently there is n_apply_to_print, n_ops_to_print and min_memory_size Currently there is n_apply_to_print, n_ops_to_print and min_memory_size
that are accepted. that are accepted.
""" """
compile_time = sum([ps.compile_time for ps in self.profile_stats.values()])
fct_call = dict([(fn, ps.fct_callcount)
for (fn, ps) in self.profile_stats.items()])
fct_call_time = dict([(fn, ps.fct_call_time)
for (fn, ps) in self.profile_stats.items()])
apply_time = {}
for fn, ps in self.profile_stats.items():
for (i, node) in enumerate(fn.maker.env.toposort()):
apply_time[(i, node)] = ps.apply_time[node]
for (i,n),t in apply_time.items():
if t == 0:
print i, n
op_cimpl = {}
outputs_size = {}
for fn, ps in self.profile_stats.items():
op_cimpl.update(ps.apply_cimpl)
compile_time = self.compile_time
fct_call_time = self.fct_call_time
fct_call = self.fct_call
apply_time = self.apply_time
op_cimpl = self.op_cimpl
message = self.message message = self.message
outputs_size = self.outputs_size
other_time = {'linker_time':self.linker_time, outputs_size = {}
'optimizer_time':self.optimizer_time} for fn, ps in self.profile_stats.items():
outputs_size.update(ps.outputs_size)
other_time = dict(
linker_time = sum(
[ps.linker_time for ps in self.profile_stats.values()]),
optimizer_time = sum(
[ps.optimizer_time for ps in self.profile_stats.values()]))
self.print_summary_("print_summary", compile_time, fct_call_time, fct_call, self.print_summary_("print_summary", compile_time, fct_call_time, fct_call,
apply_time, op_cimpl, message, outputs_size, other_time, apply_time, op_cimpl, message, outputs_size,
self.local_time, other_time,
**kwargs) **kwargs)
def print_diff_summary(self, other, **kwargs): def print_diff_summary(self, other, **kwargs):
""" As print_summary, but print the difference on two different profile mode. """ As print_summary, but print the difference on two different profile mode.
TODO: Also we don't print the Apply-wise summary as it don't work for now. TODO: Also we don't print the Apply-wise summary as it don't work for now.
...@@ -240,7 +289,7 @@ class ProfileMode(Mode): ...@@ -240,7 +289,7 @@ class ProfileMode(Mode):
@staticmethod @staticmethod
def print_summary_(fct_name, compile_time, fct_call_time, fct_call, def print_summary_(fct_name, compile_time, fct_call_time, fct_call,
apply_time, op_cimpl, message, outputs_size, apply_time, op_cimpl, message, outputs_size,
other_time, local_time, other_time,
n_apply_to_print=config.ProfileMode.n_apply_to_print, n_apply_to_print=config.ProfileMode.n_apply_to_print,
n_ops_to_print=config.ProfileMode.n_ops_to_print, n_ops_to_print=config.ProfileMode.n_ops_to_print,
print_apply=True, print_apply=True,
...@@ -256,7 +305,6 @@ class ProfileMode(Mode): ...@@ -256,7 +305,6 @@ class ProfileMode(Mode):
whose outputs memory size is lower then that. whose outputs memory size is lower then that.
""" """
local_time = sum(apply_time.values())
total_time = time.time() - import_time total_time = time.time() - import_time
total_fct_time = sum(fct_call_time.values()) total_fct_time = sum(fct_call_time.values())
total_fct_call = sum(fct_call.values()) total_fct_call = sum(fct_call.values())
...@@ -312,7 +360,7 @@ class ProfileMode(Mode): ...@@ -312,7 +360,7 @@ class ProfileMode(Mode):
op_time[op]+=t op_time[op]+=t
nb_call = [v for k,v in fct_call.items() if k.maker.env is a.env][0] nb_call = [v for k,v in fct_call.items() if k.maker.env is a.env][0]
if t==0: if t==0:
assert nb_call == 0 assert nb_call == 0, nb_call
else: else:
op_call[op] += nb_call op_call[op] += nb_call
op_apply[op] += 1 op_apply[op] += 1
...@@ -429,8 +477,8 @@ class ProfileMode(Mode): ...@@ -429,8 +477,8 @@ class ProfileMode(Mode):
else: else:
fct_memory={}#env->dict(node->(outputs size)) fct_memory={}#env->dict(node->(outputs size))
var_mem = {} var_mem = {}
for node,val in outputs_size.items(): for node, val in outputs_size.items():
fct_memory.setdefault(node.env,{}) fct_memory.setdefault(node.env, {})
fct_memory[node.env][node]=val fct_memory[node.env][node]=val
for out,v in zip(node.outputs,val): for out,v in zip(node.outputs,val):
var_mem[out]=v var_mem[out]=v
...@@ -600,7 +648,7 @@ def atexit_print_default_profile_mode(): ...@@ -600,7 +648,7 @@ def atexit_print_default_profile_mode():
config.mode=PROFILE_MODE config.mode=PROFILE_MODE
""" """
for prof_mode in prof_mode_instance_to_print: for prof_mode in prof_mode_instance_to_print:
if sum(prof_mode.apply_time.values())>0: if prof_mode.local_time>0:
prof_mode.print_summary() prof_mode.print_summary()
#Register atexit_print_default_profile_mode to have the summary of the #Register atexit_print_default_profile_mode to have the summary of the
......
"""ProfileStats object for runtime and memory profiling.
"""
#
# TODO: measure memory usage like ProfileMode did
# TODO: put the optimization tips into a tips section??
# TODO: add tip to use specify_shape (is specify_shape even in library doc?)
# TODO: ensure field width for string fields makes columns line up
# TODO: what to do about 'diff summary'? (ask Fred?)
#
__authors__ = "James Bergstra"
__reviewer__ = "Razvan Pascanu"
__copyright__ = "(c) 2011, Universite de Montreal"
__license__ = "3-clause BSD License"
__contact__ = "theano-dev <theano-dev@googlegroups.com>"
__docformat__ = "restructuredtext en"
import atexit
import sys
import theano
from theano.configparser import AddConfigVar, StrParam, BoolParam
config = theano.config
_atexit_print_list = []
_atexit_print_file = sys.stderr
AddConfigVar('profiling.time_thunks',
"""Time individual thunks when profiling""",
BoolParam(True))
def _atexit_print_fn():
"""Print ProfileStat objects in _atexit_print_list to _atexit_print_file
"""
for ps in _atexit_print_list:
if ps.fct_callcount or ps.compile_time > 0:
ps.summary(file=_atexit_print_file)
else:
print 'Skipping empty Profile'
atexit.register(_atexit_print_fn)
class ProfileStats(object):
    """
    Object to store runtime and memory profiling information for all of
    Theano's operations: compilation, optimization, execution.
    """
    #
    # Note on implementation:
    # Class variables are used here so that each one can be
    # documented and initialized together.
    # dictionary variables are initialized with None.
    #

    compile_time = 0.0
    # Total time spent in body of orig_function,
    # dominated by graph optimization and compilation of C
    #

    fct_call_time = 0.0
    # The total time spent in Function.__call__
    #

    fct_callcount = 0
    # Number of calls to Function.__call__
    #

    vm_call_time = 0.0
    # Total time spent in Function.fn.__call__
    #

    apply_time = None
    # dict from node -> float runtime
    #

    apply_callcount = None
    # dict from node -> number of executions
    #

    apply_cimpl = None
    # dict from node -> bool (1 if c, 0 if py)
    #

    message = None
    # pretty string to print in summary, to identify this output
    #

    outputs_size = None
    # node -> size of allocated output
    #

    optimizer_time = 0.0
    # time spent optimizing graph (FunctionMaker.__init__)

    linker_time = 0.0
    # time spent linking graph (FunctionMaker.create)

    # param is called flag_time_thunks because most other attributes with time
    # in the name are times *of* something, rather than configuration flags.
    def __init__(self, atexit_print=True, flag_time_thunks=None, **kwargs):
        """
        :param atexit_print: bool. True means that this object will be
            printed to stderr (using .summary()) at the end of the program.
        :param flag_time_thunks: bool or None. Whether to time individual
            thunks; None means defer to the config.profiling.time_thunks
            flag.
        :param kwargs: misc initializers. These should (but need not) match
            the names of the class vars declared in this class.
        """
        self.apply_callcount = {}
        # NOTE(review): 'output_size' looks like a typo duplicate of
        # 'outputs_size' (initialized below); nothing visible here reads
        # it -- confirm against callers before removing.
        self.output_size = {}
        self.apply_time = {}
        self.apply_cimpl = {}
        self.outputs_size = {}
        if flag_time_thunks is None:
            self.flag_time_thunks = config.profiling.time_thunks
        else:
            self.flag_time_thunks = flag_time_thunks
        # Allow callers to override any of the class-level defaults above.
        self.__dict__.update(kwargs)
        if atexit_print:
            # Queue this object for the atexit summary dump
            # (see _atexit_print_fn).
            global _atexit_print_list
            _atexit_print_list.append(self)
def op_time(self):
"""dict op -> total time on thunks"""
# timing is stored by node, we compute timing by Op on demand
rval = {}
for node, t in self.apply_time.items():
rval.setdefault(node.op, 0)
rval[node.op] += t
return rval
def op_callcount(self):
"""dict op -> total number of thunk calls"""
# timing is stored by node, we compute timing by Op on demand
rval = {}
for node, count in self.apply_callcount.items():
rval.setdefault(node.op, 0)
rval[node.op] += count
return rval
def op_nodes(self):
"""dict op -> total number of nodes"""
# timing is stored by node, we compute timing by Op on demand
rval = {}
for node, count in self.apply_callcount.items():
rval.setdefault(node.op, 0)
rval[node.op] += 1
return rval
    def op_impl(self):
        """Return a dict mapping each Op to 'C ' if its nodes ran the C
        implementation, or 'Py' if they ran the Python one.
        """
        # Implementation info is stored by node; summarize by Op on demand.
        # If the same Op appears with both implementations, whichever node
        # is visited last wins (dict iteration order is arbitrary here).
        rval = {}
        for node in self.apply_callcount:
            if self.apply_cimpl[node]:
                rval[node.op] = 'C '
            else:
                rval[node.op] = 'Py'
        return rval
def op_flops(self):
"""dict op -> total number of flops"""
# timing is stored by node, we compute timing by Op on demand
rval = {}
return rval #TODO: continue here
for node, count in self.apply_callcount.items():
rval.setdefault(node.op, 0)
rval[node.op] += 1
return rval
for a,t in op_time.items():
if hasattr(a,'flops'):
op_flops[a]=a.flops*op_call[a]/t/1e6
flops_msg=''
if op_flops:
flops_msg=' <MFlops/s>'
print '\nHACK WARNING: we print the flops for some OP, but the logic don\' always work. You need to know the internal of Theano to make it work correctly. Otherwise don\'t use!'
print '\nOp-wise summary: <%% of local_time spent on this kind of Op> <cumulative %%> <self seconds> <cumulative seconds> <time per call> %s <nb_call> <nb apply> <Op name>'%(flops_msg)
def summary_ops(self, file=sys.stderr, N=None):
if self.apply_time:
local_time = sum(self.apply_time.values())
else:
local_time = 0
if local_time == 0:
print >> file, ('ProfileMode.summary_ops: total time 0'
' (did you forget to enable counters?)')
return
op_time = self.op_time()
op_call = self.op_callcount()
op_apply = self.op_nodes()
op_flops = self.op_flops()
op_impl = self.op_impl()
if N is None:
N = len(self.op_flops)
otimes = [(t*100/local_time,
t,
op,
op_impl.get(op, ' '),
op_call.get(op, 0),
op_apply.get(op,0))
for op, t in op_time.items()]
otimes.sort()
otimes.reverse()
tot=0
print >> file, 'Ops'
print >> file, '---'
print >> file, '<% time> <cumulative %%> <apply time> <cumulative seconds> <time per call> <nb_call> <Op name>'
for f,t,a,impl,nb_call,nb_apply in otimes[:N]:
if nb_call == 0:
assert t == 0
continue
tot+=t
ftot=tot*100/local_time
if op_flops:
print >>file, ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %s %7.1f %5d %2d %s' % (
f, ftot, t, tot, t/nb_call, impl, op_flops.get(a,-1), nb_call, nb_apply, a)
else:
print >>file, ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %s %5d %2d %s' % (
f, ftot, t, tot, t/nb_call, impl, nb_call, nb_apply, a)
print >>file, ' ... (remaining %i Ops account for %6.2f%%(%.2fs) of the runtime)'\
%(max(0, len(otimes)-N),
sum(f for f, t, a, ci, nb_call, nb_op in otimes[N:]),
sum(t for f, t, a, ci, nb_call, nb_op in otimes[N:]))
print >> file, ''
    def summary_nodes(self, file=sys.stderr, N=None):
        """Print an Apply-node-wise breakdown of thunk run time to `file`.

        :param file: open file object to print the report to.
        :param N: number of Apply nodes to print, worst offenders first;
            None means print all of them (``[:None]`` is a full slice).
        """
        if self.apply_time:
            local_time = sum(self.apply_time.values())
        else:
            local_time = 0
        if local_time == 0:
            print >> file, ('ProfileMode.summary_nodes: total time 0'
                    ' (did you forget to enable counters?)')
            return
        print >> file, 'Thunks'
        print >> file, '------'
        print >> file, '<% time> <cumulative %%> <apply time> <cumulative seconds> <time per call> <nb_call> <Apply Op name>'
        # One tuple per node: (percent of local_time, seconds, node,
        # nb calls); sorted so the worst offenders come first.
        atimes = [(
            t*100/local_time,
            t,
            a,
            self.apply_callcount[a])
            for a, t in self.apply_time.items()]
        atimes.sort()
        atimes.reverse()
        tot = 0
        for (f, t, a, nb_call) in atimes[:N]:
            # Note: t is folded into the cumulative total before the
            # nb_call==0 skip below.
            tot += t
            ftot = tot*100/local_time
            if nb_call == 0:
                continue
            print >> file, ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %i %s'%(
                f, ftot, t, tot, t/nb_call, nb_call, str(a))
        print >> file, ' ... (remaining %i Apply instances account for %.2f%%(%.2fs) of the runtime)'\
            %(max(0, len(atimes)-N),
            sum(f for f, t, a, nb_call in atimes[N:]),
            sum(t for f, t, a, nb_call in atimes[N:]))
        print >> file, ''
    def summary_function(self, file):
        """Print the function-level profile (call count and wall-clock
        times) to `file`."""
        print >> file, 'Function profiling'
        print >> file, '=================='
        print >> file, ' Message: %s'%self.message
        print >> file, ' Time in %i calls to Function.__call__: %es' % (
            self.fct_callcount, self.fct_call_time)
        if self.fct_call_time > 0:
            # Guarded so the percentage divisions below cannot divide by
            # zero when the function was never called/timed.
            print >> file, ' Time in Function.fn.__call__: %es (%.3f%%)' %(
                self.vm_call_time, 100*self.vm_call_time / self.fct_call_time)
            local_time = sum(self.apply_time.values())
            if local_time > 0:
                print >> file, ' Time in thunks: %es (%.3f%%)' %(
                    local_time, 100*local_time / self.fct_call_time)
        print >> file, ''
    def summary(self, file=sys.stderr, n_ops_to_print=20, n_applies_to_print=20):
        """Print the full profile report: function-level stats, then the
        per-Op and per-Apply-node breakdowns.

        The breakdowns are printed only when thunk timing was actually
        collected (see flag_time_thunks / config profiling.time_thunks).
        """
        self.summary_function(file)
        local_time = sum(self.apply_time.values())
        if local_time > 0:
            self.summary_ops(file, n_ops_to_print)
            self.summary_nodes(file, n_applies_to_print)
        else:
            print >> file, " No node time accumulated (hint: try config profiling.time_thunks=1)"
if 0: # old code still to be ported from ProfileMode
def long_print(self, file=sys.stderr, fct_name=None, message=None,
n_apply_to_print=15, n_ops_to_print=20, print_apply=False):
"""
Print a readable summary of the stats.
param: n_apply_to_print the number of apply to print. Default 15.
param: n_ops_to_print the number of ops to print. Default 20.
"""
local_time = sum(self.apply_time.values())
print ''
print 'ProfileMode.long_print()'
print 'name = %s'%fct_name
print 'msg = %s'%message
print '---------------------------'
print ''
print 'Total time spent running thunks: %.3fs'% local_time
sop_time={}
sop_call={}
sop_op = {}
sop_c={} #map each op class to Bool. True iff all applies were done in c.
for a,t in op_time.items():
typ = type(a)
sop_time.setdefault(typ,0)
sop_time[typ]+=t
sop_op.setdefault(typ,0)
sop_op[typ]+=1
sop_c.setdefault(typ,True)
sop_c[typ]=sop_c[typ] and op_cimpl.get(a, False)
sop_call[typ]=sop_call.get(typ,0)+op_call[a]
print '\nSingle Op-wise summary: <% of local_time spent on this kind of Op> <cumulative %%> <self seconds> <cumulative seconds> <time per call> <nb_call> <nb_op> <nb_op> <Op name>'
sotimes = [(t*100/local_time, t, a, sop_c[a], sop_call[a], sop_op[a]) for a, t in sop_time.items()]
sotimes.sort()
sotimes.reverse()
tot=0
for f,t,a,ci, nb_call, nb_op in sotimes[:n_ops_to_print]:
if nb_call == 0:
assert t == 0
continue
tot+=t
ftot=tot*100/local_time
if ci:
msg = '*'
else:
msg = ' '
print ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %s %5d %2d %s' % (f, ftot, t, tot, t/nb_call, msg, nb_call, nb_op, a)
print ' ... (remaining %i Ops account for %.2f%%(%.2fs) of the runtime)'\
%(max(0, len(sotimes)-n_ops_to_print),
sum(f for f, t, a, ci, nb_call, nb_op in sotimes[n_ops_to_print:]),
sum(t for f, t, a, ci, nb_call, nb_op in sotimes[n_ops_to_print:]))
total_time = time.time() - import_time
total_fct_time = sum(fct_call_time.values())
total_fct_call = sum(fct_call.values())
other_time = total_time - total_fct_time - compile_time
print
print 'Theano fct summary: <% total fct time> <total time> <time per call> <nb call> <fct name>'
for key in fct_call.keys():
if fct_call[key]>0:
print ' %4.1f%% %.3fs %.2es %d %s'%(fct_call_time[key]/total_fct_time*100 ,fct_call_time[key],
fct_call_time[key]/fct_call[key], fct_call[key], key.name)
else:
print ' NOT CALLED',key.name
if total_fct_time>0:
time_pr_in_fct=local_time/total_fct_time*100
time_per_call=total_fct_time/total_fct_call
else:
time_pr_in_fct=0
time_per_call=0
print
print 'Time since import %.3fs'%(total_time)
print 'Compile time: %.3fs %.1f%%'%(compile_time, compile_time/total_time*100)
print 'Theano fct call %.3fs %.1f%%'%(total_fct_time,total_fct_time/total_time*100)
print ' Theano Op time (included in fct call, Time spent running thunks) %.3fs %.1f%%(of total) %.1f%%(of fct call)'% (local_time,local_time/total_time*100, time_pr_in_fct)
print 'Other time since import %.3fs %.1f%%'%(other_time,other_time/total_time*100)
print '%i Theano fct call, %.3fs per call'%(total_fct_call, time_per_call)
print
print "List of apply that don't have float64 as input but have float64 in outputs. Usefull to know if we forgot some cast when using floatX=float32 or gpu code."
print '<Apply> <Apply position> <fct name> <inputs type> <outputs type>'
for fct in fct_call.keys():
for idx, node in enumerate(fct.maker.env.toposort()):
if any(hasattr(i,'dtype') and i.dtype=='float64' for i in node.outputs) and not any(hasattr(i,'dtype') and i.dtype=='float64' for i in node.inputs):
print str(node), idx, fct.name, str([getattr(i,'dtype',None) for i in node.inputs]),str([getattr(i,'dtype',None) for i in node.outputs])
if any([x[2].__name__.startswith("Gpu") for x in sotimes]):
cpu=[]
gpu=[]
trans=[]
for so in sotimes:
if so[2].__name__ in ["HostFromGpu", "GpuFromHost"]:
trans.append(so)
elif so[2].__name__.startswith("Gpu"):
gpu.append(so)
else:
cpu.append(so)
sum_cpu=sum(so[1] for so in cpu)
sum_gpu=sum(so[1] for so in gpu)
sum_trans=sum(so[1] for so in trans)
print
print "Spent %.3fs(%.3f%%) in cpu Op, %.3fs(%.3f%%) in gpu Op and %.3fs(%.3f%%) transfert Op"%(
sum_cpu, sum_cpu/local_time*100, sum_gpu, sum_gpu/local_time*100, sum_trans, sum_trans/local_time*100)
print "Theano function input that are float64"
print "<fct name> <input name> <input type> <str input>"
for fct in fct_call.keys():
for i in fct.input_storage:
if hasattr(i.type, 'dtype') and i.type.dtype=='float64':
print fct.name, i.name, i.type, i
if outputs_size:
fct_memory={}#env->dict(node->(outputs size))
var_mem = {}
for node,val in outputs_size.items():
fct_memory.setdefault(node.env,{})
fct_memory[node.env][node]=val
for out,v in zip(node.outputs,val):
var_mem[out]=v
print
print "Profile of Theano functions memory:"
for env,nodes_mem in fct_memory.iteritems():
print "Theano fct:", [fct for fct in fct_call.keys() if fct.maker.env is env][0].name
size_sum=sum([sum(val) for key,val in nodes_mem.iteritems()])
print " Max without gc, inplace and view (KB)",size_sum/1024
node_memory_size = 0
node_memory_saved_by_view = 0
node_memory_saved_by_inplace = 0
running_memory_size = 0
running_max_memory_size = 0
post_thunk_old_storage = []
items = nodes_mem.items()
items.sort(key=lambda a: a[1])
items.reverse()
order = env.toposort()
computed, last_user = gc_helper(order)
for node in order:
post_thunk_old_storage.append([ input_idx
for input_idx,input in enumerate(node.inputs)
if (input in computed) and (input not in env.outputs) and node == last_user[input]])
for node,val in items[:n_apply_to_print]:
dmap = getattr(node.op,'destroy_map',None)
vmap = getattr(node.op,'view_map',None)
for idx,v in enumerate(val):
if dmap and idx in dmap:#TODO check the op returned a view
node_memory_saved_by_inplace += v
elif vmap and idx in vmap:#TODO check the op returned a view
node_memory_saved_by_view += v
else:
node_memory_size += v
running_memory_size += v
if running_memory_size > running_max_memory_size:
running_max_memory_size = running_memory_size
old_storage = post_thunk_old_storage[order.index(node)]
for old_s in old_storage:
running_memory_size -= var_mem[node.inputs[old_s]]
pass
pass
print " Max FAST_RUN_NO_GC (KB)", node_memory_size/1024
print " Max FAST_RUN (KB)", running_max_memory_size/1024
print " Memory saved by view (KB)", node_memory_saved_by_view/1024
print " Memory saved by inplace (KB)", node_memory_saved_by_inplace/1024
print " Memory saved by GC (KB)", (node_memory_size-running_max_memory_size)/1024
n_apply_to_print+=10#TODO remove this line
print " <Sum apply outputs (bytes)> <Apply outputs memory size(bytes)> <created/inplace/view> <Apply node>"
print " <created/inplace/view> is taked from the op declaration, not the op exeuction. Use DebugMode to have warning about inplace/view declaration being respected."
for key,val in items[:n_apply_to_print]:
code = ['c']*len(node.outputs)
for out,inp in getattr(key.op,'destroy_map',{}).iteritems():
code[out] = "i"
for out,inp in getattr(key.op,'view_map',{}).iteritems():
code[out] = "v"
print ' %9dB %s %s %s' % (sum(val), str(val), ' '.join(code), key)
print ' ... (remaining %i Apply account for %.2f%%(%.2fs) of the runtime)'\
%(max(0, len(nodes_mem)-n_ops_to_print),
sum(sum(val) for key, val in items[n_ops_to_print:]),
sum(sum(val) for key, val in items[n_ops_to_print:])/size_sum)
print
print "Here are tips to potentially make your code run faster (if you think of new ones, suggest them on the mailing list). Test them first as they are not guaranteed to always provide a speedup."
from theano import tensor as T
from theano.tensor.raw_random import RandomFunction
import theano
import theano.scalar as scal
scalar_op_amdlibm_no_speed_up = [scal.LT, scal.GT, scal.LE, scal.GE, scal.EQ, scal.NEQ, scal.InRange, scal.Switch, scal.OR, scal.XOR, scal.AND, scal.Invert, scal.Maximum, scal.Minimum, scal.Add, scal.Mul, scal.Sub, scal.TrueDiv, scal.IntDiv, scal.Clip, scal.First, scal.Second, scal.Identity, scal.Cast, scal.Sgn, scal.Neg, scal.Inv, scal.Sqr ]
scalar_op_amdlibm_speed_up = [scal.Mod, scal.Pow, scal.Ceil, scal.Floor, scal.RoundHalfToEven, scal.RoundHalfAwayFromZero, scal.Log, scal.Log2, scal.Log10, scal.Log1p, scal.Exp, scal.Sqrt, scal.Abs, scal.Cos, scal.Sin, scal.Tan, scal.Tanh, scal.Cosh, scal.Sinh, T.nnet.sigm.ScalarSigmoid, T.nnet.sigm.ScalarSoftplus ]#Abs, Mod in float{32,64} only
def get_scalar_ops(s):
if isinstance(s, theano.scalar.Composite):
l = []
for node in s.env.toposort():
l+=get_scalar_ops(node.op)
return l
else: return [s]
def list_scalar_op(op):
if isinstance(op.scalar_op, theano.scalar.Composite):
return get_scalar_ops(op.scalar_op)
else: return [op.scalar_op]
def amdlibm_speed_up(op):
if not isinstance(op, T.Elemwise):
return False
else:
l = list_scalar_op(op)
for s_op in l:
if s_op.__class__ in scalar_op_amdlibm_speed_up:
return True
elif s_op.__class__ not in scalar_op_amdlibm_no_speed_up:
import pdb;pdb.set_trace()
print "We don't know if amdlibm will accelerate this scalar op.", s_op
return False
def exp_float32_op(op):
if not isinstance(op, T.Elemwise):
return False
else:
l = list_scalar_op(op)
return any([s_op.__class__ in [scal.Exp] for s_op in l])
#tip 1
if config.floatX=='float64':
print " - Try the Theano flag floatX=float32"
#tip 2
if not config.lib.amdlibm and any([amdlibm_speed_up(a.op) for i,a in apply_time]):
print " - Try installing amdlibm and set the Theano flag lib.amdlibm=True. This speed up only some Elemwise operation."
#tip 3
if not config.lib.amdlibm and any([exp_float32_op(a.op) and a.inputs[0].dtype=='float32' for i,a in apply_time]):
print " - With the default gcc libm, exp in float32 is slower then in float64! Try Theano flags floatX=float64 or install amdlibm and set the theano flags lib.amdlibm=True"
#tip 4
for a, t in apply_time.iteritems():
node = a
if isinstance(node.op, T.Dot) and all([ len(i.type.broadcastable)==2 for i in node.inputs]):
print " - You have a dot operation that was not optimized to dot22 that is faster. Make sure the inputs are float32 or 64 and are the same for both input. Currently they are:",[i.type for i in node.inputs]
#tip 5
for a, t in apply_time.iteritems():
node = a
if isinstance(node.op, RandomFunction):
print " - Replace the default random number generator by 'from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams' as this is is faster. It is still experimental, but seam to work correctly."
if config.device.startswith("gpu"):
print " - MRG_RandomStreams is the only random number supported on the GPU."
break
def print_summary(self,
n_apply_to_print=config.ProfileMode.n_apply_to_print,
n_ops_to_print=config.ProfileMode.n_ops_to_print):
""" Print 3 summary that show where the time is spend. The first show an Apply-wise summary, the second show an Op-wise summary, the third show an type-Op-wise summary.
The Apply-wise summary print the timing information for the worst offending Apply nodes. This corresponds to individual Op applications within your graph which take the longest to execute (so if you use dot twice, you will see two entries there).
The Op-wise summary print the execution time of all Apply nodes executing the same Op are grouped together and the total execution time per Op is shown (so if you use dot twice, you will see only one entry there corresponding to the sum of the time spent in each of them). If two Op have different hash value, they will be separate.
The type-Op-wise summary group the result by type of op. So event if two Op have different hash value, they will be merged.
Their is an hack with the Op-wise summary. Go see it if you want to know more.
:param n_apply_to_print: the number of apply to print. Default 15, or n_ops_to_print flag.
:param n_ops_to_print: the number of ops to print. Default 20, or n_apply_to_print flag.
"""
fct_call_time = self.mode.fct_call_time
fct_call = self.mode.fct_call
apply_time = self.apply_time
op_cimpl = self.op_cimpl
message = self.message
outputs_size = self.outputs_size
self.print_summary_("print_summary",
None,
None,
None,
apply_time,
op_cimpl,
message,
outputs_size,
n_apply_to_print,
n_ops_to_print)
def print_diff_summary(self, other, n_apply_to_print=15, n_ops_to_print=20):
""" As print_summary, but print the difference on two different profile mode.
TODO: Also we don't print the Apply-wise summary as it don't work for now.
TODO: make comparaison with gpu code.
:param other: the other instance of ProfileMode that we want to be compared to.
:param n_apply_to_print: the number of apply to print. Default 15.
:param n_ops_to_print: the number of ops to print. Default 20.
"""
def diff_dict(a_time,b_time_):
r = {}
b_time = copy.copy(b_time_)
for a,ta in a_time.items():
r.setdefault(a,0)
tb = b_time.pop(a,0)
r[a]+=ta-tb
#they are missing in a
for a,t in b_time.items():
r.setdefault(a,0)
r[a]+=t
return r
compile_time = self.compile_time-other.compile_time
fct_call_time = diff_dict(self.fct_call_time,other.fct_call_time)
fct_call = diff_dict(self.fct_call,other.fct_call)
apply_time = diff_dict(self.apply_time, other.apply_time)
op_cimpl = self.op_cimpl and other.op_cimpl
message = self.message
outputs_size = diff_dict(self.outputs_size,other.outputs_size)
self.print_summary_("print_diff_summary", compile_time, fct_call_time, fct_call,
apply_time, op_cimpl, message, outputs_size,
n_apply_to_print=n_apply_to_print,
n_ops_to_print=n_ops_to_print, print_apply=False)
"""
Test compilation modes
"""
from nose.plugins.skip import SkipTest
import unittest
import theano
import numpy
import random
import numpy.random
from theano.tests import unittest_tools as utt
import theano.tensor as T
class T_bunch_of_modes(unittest.TestCase):
    """Smoke-test every registered compilation mode."""

    def test1(self):
        # this is a quick test after the LazyLinker branch merge
        # to check that all the current modes can still be used.
        linker_classes_involved = []
        # Iterate every mode name registered on the 'mode' config variable.
        for modename in theano.config.__class__.__dict__['mode'].all:
            x = T.matrix()
            y = T.vector()
            f = theano.function([x, y], x + y, mode=modename)
            # test that it runs something
            f([[1, 2], [3, 4]], [5, 6])
            linker_classes_involved.append(f.maker.mode.linker.__class__)
            print 'MODE:', modename, f.maker.mode.linker, 'stop'
        # regression check:
        # there should be
        # - VM_Linker
        # - OpWiseCLinker (FAST_RUN)
        # - WrapLinker (PROFILE_MODE)
        # - PerformLinker (FAST_COMPILE)
        # - DebugMode's Linker (DEBUG_MODE)
        assert 5 == len(set(linker_classes_involved))
if __name__ == '__main__':
unittest.main()
...@@ -65,15 +65,6 @@ AddConfigVar('force_device', ...@@ -65,15 +65,6 @@ AddConfigVar('force_device',
BoolParam(False, allow_override=False), BoolParam(False, allow_override=False),
in_c_key=False) in_c_key=False)
#Don't add FAST_RUN_NOGC to this list(as well as other ALL CAPS short cut)
#The way to get FAST_RUN_NOGC is with the flag 'linker=c|py_nogc'
#The old all capital letter way of working is deprecated as it is not scalable.
AddConfigVar('mode',
"Default compilation mode",
EnumStr('Mode', 'ProfileMode', 'DebugMode', 'FAST_RUN',
'FAST_COMPILE', 'PROFILE_MODE', 'DEBUG_MODE'),
in_c_key=False)
# Test whether or not gcc is present: disable C code if it is not. # Test whether or not gcc is present: disable C code if it is not.
# Using the dummy file descriptor below is a workaround for a crash experienced # Using the dummy file descriptor below is a workaround for a crash experienced
# in an unusual Python 2.4.4 Windows environment with the default stdin=None. # in an unusual Python 2.4.4 Windows environment with the default stdin=None.
...@@ -84,13 +75,15 @@ try: ...@@ -84,13 +75,15 @@ try:
# Keep the default linker the same as the one for the mode FAST_RUN # Keep the default linker the same as the one for the mode FAST_RUN
AddConfigVar('linker', AddConfigVar('linker',
"Default linker used if the theano flags mode is Mode or ProfileMode", "Default linker used if the theano flags mode is Mode or ProfileMode",
EnumStr('c|py', 'py', 'c', 'c|py_nogc', 'c&py'), EnumStr('c|py', 'py', 'c', 'c|py_nogc', 'c&py',
'vm', 'cvm', 'vm_nogc', 'cvm_nogc'),
in_c_key=False) in_c_key=False)
except OSError: except OSError:
# gcc is not present, linker should default to python only # gcc is not present, linker should default to python only
AddConfigVar('linker', AddConfigVar('linker',
"Default linker used if the theano flags mode is Mode or ProfileMode", "Default linker used if the theano flags mode is Mode or ProfileMode",
EnumStr('py', 'c|py', 'c', 'c|py_nogc', 'c&py'), EnumStr('c|py', 'py', 'c', 'c|py_nogc', 'c&py',
'vm', 'cvm', 'vm_nogc', 'cvm_nogc'),
in_c_key=False) in_c_key=False)
warning('GCC not detected ! Theano will be unable to execute optimized '+ warning('GCC not detected ! Theano will be unable to execute optimized '+
'C-implementations (for both CPU and GPU) and will default to '+ 'C-implementations (for both CPU and GPU) and will default to '+
...@@ -145,10 +138,6 @@ AddConfigVar('op.set_flops', ...@@ -145,10 +138,6 @@ AddConfigVar('op.set_flops',
BoolParam(False), BoolParam(False),
in_c_key=False) in_c_key=False)
AddConfigVar('nvcc.fastmath',
"",
BoolParam(False))
AddConfigVar('gpuelemwise.sync', AddConfigVar('gpuelemwise.sync',
"when true, wait that the gpu fct finished and check it error code.", "when true, wait that the gpu fct finished and check it error code.",
BoolParam(True)) BoolParam(True))
......
...@@ -146,7 +146,7 @@ from link import \ ...@@ -146,7 +146,7 @@ from link import \
Container, Linker, LocalLinker, PerformLinker, WrapLinker, WrapLinkerMany Container, Linker, LocalLinker, PerformLinker, WrapLinker, WrapLinkerMany
from op import \ from op import \
Op Op, PureOp
from opt import (Optimizer, optimizer, SeqOptimizer, from opt import (Optimizer, optimizer, SeqOptimizer,
MergeOptimizer, MergeOptMerge, MergeOptimizer, MergeOptMerge,
......
...@@ -1312,6 +1312,7 @@ def gcc_module_compile_str(module_name, src_code, location=None, include_dirs=[] ...@@ -1312,6 +1312,7 @@ def gcc_module_compile_str(module_name, src_code, location=None, include_dirs=[]
#DSE Patch 1 for supporting OSX frameworks; add -framework Python #DSE Patch 1 for supporting OSX frameworks; add -framework Python
if sys.platform=='darwin' : if sys.platform=='darwin' :
preargs.extend(['-undefined','dynamic_lookup']) preargs.extend(['-undefined','dynamic_lookup'])
python_inc = distutils.sysconfig.get_python_inc()
# link with the framework library *if specifically requested* # link with the framework library *if specifically requested*
# config.mac_framework_link is by default False, since on some mac # config.mac_framework_link is by default False, since on some mac
# installs linking with -framework causes a Bus Error # installs linking with -framework causes a Bus Error
......
...@@ -311,6 +311,9 @@ class Env(utils.object2): ...@@ -311,6 +311,9 @@ class Env(utils.object2):
self.__import_r__([new_r]) self.__import_r__([new_r])
self.__add_clients__(new_r, [(node, i)]) self.__add_clients__(new_r, [(node, i)])
prune = self.__remove_clients__(r, [(node, i)], False) prune = self.__remove_clients__(r, [(node, i)], False)
# Precondition: the substitution is semantically valid
# However it may introduce cycles to the graph, in which case the
# transaction will be reverted later.
self.execute_callbacks('on_change_input', node, i, r, new_r, reason=reason) self.execute_callbacks('on_change_input', node, i, r, new_r, reason=reason)
if prune: if prune:
...@@ -438,16 +441,32 @@ class Env(utils.object2): ...@@ -438,16 +441,32 @@ class Env(utils.object2):
if len(self.nodes) < 2: if len(self.nodes) < 2:
# optimization # optimization
# when there are 0 or 1 nodes, no sorting is necessary # when there are 0 or 1 nodes, no sorting is necessary
# This special case happens a lot because the OpWiseCLinker produces
# 1-element graphs.
return list(self.nodes) return list(self.nodes)
env = self env = self
ords = {} ords = self.orderings()
for feature in env._features:
if hasattr(feature, 'orderings'):
for op, prereqs in feature.orderings(env).items():
ords.setdefault(op, []).extend(prereqs)
order = graph.io_toposort(env.inputs, env.outputs, ords) order = graph.io_toposort(env.inputs, env.outputs, ords)
return order return order
def orderings(self):
"""
Return dict d s.t. d[node] is a list of nodes that must be evaluated
before node itself can be evaluated.
This is used primarily by the destroy_handler feature to ensure that all
clients of any destroyed inputs have already computed their outputs.
"""
ords = {}
for feature in self._features:
if hasattr(feature, 'orderings'):
for node, prereqs in feature.orderings(self).items():
ords.setdefault(node, []).extend(prereqs)
# eliminate duplicate prereqs
for (node,prereqs) in ords.items():
ords[node] = list(set(prereqs))
return ords
def nclients(self, r): def nclients(self, r):
"""WRITEME Same as len(self.clients(r)).""" """WRITEME Same as len(self.clients(r))."""
return len(self.clients(r)) return len(self.clients(r))
......
#include <Python.h>
#include "structmember.h"
/**
TODO:
- Check max supported depth of recursion
- CLazyLinker should add context information to errors caught during evaluation. Say what node we were on, add the traceback attached to the node.
 - Clear containers of fully-used intermediate results if allow_gc is 1
- Add timers for profiling
- Add support for profiling space used.
*/
#include <time.h>
/* Convert a timeval to seconds, expressed as a double.
 * If `tv` is NULL, the current wall-clock time is sampled instead. */
static double pytime(const struct timeval * tv)
{
  struct timeval now;
  if (tv == NULL)
  {
    gettimeofday(&now, NULL);
    tv = &now;
  }
  return (double) tv->tv_sec + (double) tv->tv_usec / 1000000.0;
}
/**
CLazyLinker
*/
/* State of one compiled lazy-evaluation "program": Python-side lists plus
   flat C arrays describing the graph topology, filled in by
   CLazyLinker_init and released by CLazyLinker_dealloc. */
typedef struct {
  PyObject_HEAD
  /* Type-specific fields go here. */
  PyObject * nodes; // the python list of nodes
  PyObject * thunks; // python list of thunks
  PyObject * pre_call_clear; //list of cells to clear on call.
  int allow_gc; // nonzero: intermediate storage may be reclaimed
  Py_ssize_t n_applies; // number of apply nodes (== len(nodes))
  int n_vars; // number of variables in the graph
  int * var_computed; // 1 or 0 for every variable
  PyObject ** var_computed_cells; // per-variable [flag] compute-map cells (owned refs)
  Py_ssize_t n_output_vars;
  Py_ssize_t * output_vars; // variables that *must* be evaluated by call
  int * is_lazy; // 1 or 0 for every thunk
  Py_ssize_t * var_owner; // nodes[[var_owner[var_idx]]] is var[var_idx]->owner
  int * var_has_owner; // 1 or 0
  Py_ssize_t * node_n_inputs;  // per-node input count
  Py_ssize_t * node_n_outputs; // per-node output count
  Py_ssize_t ** node_inputs;   // per-node array of input variable indices
  Py_ssize_t ** node_outputs;  // per-node array of output variable indices
  Py_ssize_t * node_inputs_outputs_base; // node_inputs and node_outputs point into this
  Py_ssize_t * node_n_prereqs;
  Py_ssize_t ** node_prereqs; // per-node array of prerequisite variable indices
  void ** thunk_cptr_fn;   // C entry point of each thunk, NULL for pure-Python thunks
  void ** thunk_cptr_data; // data argument passed to thunk_cptr_fn[i]
  PyObject * call_times;  // per-node accumulated seconds (list of floats)
  PyObject * call_counts; // per-node call counter (list of ints)
  int do_timing; // nonzero: accumulate call_times/call_counts
  int position_of_error; // -1 for no error, otw the index into `thunks` that failed.
} CLazyLinker;
/* Free all C-side arrays and drop the Python references taken in init.
   Safe on a partially initialized instance: every pointer starts NULL,
   and free(NULL) / Py_XDECREF(NULL) are no-ops. */
static void
CLazyLinker_dealloc(PyObject* _self)
{
  CLazyLinker* self = (CLazyLinker *) _self;
  free(self->thunk_cptr_fn);
  free(self->thunk_cptr_data);
  free(self->is_lazy);
  // node_prereqs rows are malloc'd individually; free them before the table.
  if (self->node_n_prereqs)
  {
    for (int i = 0; i < self->n_applies; ++i)
    {
      free(self->node_prereqs[i]);
    }
  }
  free(self->node_n_prereqs);
  free(self->node_prereqs);
  // node_inputs[i]/node_outputs[i] point INTO node_inputs_outputs_base,
  // so only the base buffer and the two pointer tables are freed.
  free(self->node_inputs_outputs_base);
  free(self->node_n_inputs);
  free(self->node_n_outputs);
  free(self->node_inputs);
  free(self->node_outputs);
  free(self->var_owner);
  free(self->var_has_owner);
  free(self->var_computed);
  // Each compute-map cell was INCREF'd in init; release those references.
  if (self->var_computed_cells)
  {
    for (int i = 0; i < self->n_vars; ++i)
    {
      Py_DECREF(self->var_computed_cells[i]);
    }
  }
  free(self->var_computed_cells);
  free(self->output_vars);
  Py_XDECREF(self->nodes);
  Py_XDECREF(self->thunks);
  Py_XDECREF(self->call_times);
  Py_XDECREF(self->call_counts);
  Py_XDECREF(self->pre_call_clear);
  self->ob_type->tp_free((PyObject*)self);
}
/* Allocate a CLazyLinker and put every field into a safe default state,
 * so that dealloc is valid even if init never runs or fails early. */
static PyObject *
CLazyLinker_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
  CLazyLinker * self = (CLazyLinker *) type->tp_alloc(type, 0);
  if (self == NULL)
    return NULL;

  /* Python-object members. */
  self->nodes = NULL;
  self->thunks = NULL;
  self->pre_call_clear = NULL;
  self->call_times = NULL;
  self->call_counts = NULL;

  /* Graph-shape counters. */
  self->n_applies = 0;
  self->n_vars = 0;
  self->n_output_vars = 0;

  /* C-side arrays, all unallocated. */
  self->var_computed = NULL;
  self->var_computed_cells = NULL;
  self->output_vars = NULL;
  self->is_lazy = NULL;
  self->var_owner = NULL;
  self->var_has_owner = NULL;
  self->node_n_inputs = NULL;
  self->node_n_outputs = NULL;
  self->node_inputs = NULL;
  self->node_outputs = NULL;
  self->node_inputs_outputs_base = NULL;
  self->node_prereqs = NULL;
  self->node_n_prereqs = NULL;
  self->thunk_cptr_data = NULL;
  self->thunk_cptr_fn = NULL;

  /* Flags. */
  self->allow_gc = 1;
  self->do_timing = 0;
  self->position_of_error = -1;

  return (PyObject *) self;
}
/* Unpack the Python-level description of the graph into the C arrays of
   `self`.  Called from Python with parallel lists describing nodes,
   thunks, per-node input/output index ranges, variable ownership, lazy
   flags, prerequisite lists and the set of output variables.
   Returns 0 on success, -1 on failure. */
static int
CLazyLinker_init(CLazyLinker *self, PyObject *args, PyObject *kwds)
{
  static char *kwlist[] = {
    (char*)"nodes",
    (char*)"thunks",
    (char*)"pre_call_clear",
    (char*)"allow_gc",
    (char*)"call_counts",
    (char*)"call_times",
    (char*)"compute_map_list",
    (char*)"base_input_output_list",
    (char*)"node_n_inputs",
    (char*)"node_n_outputs",
    (char*)"node_input_offset",
    (char*)"node_output_offset",
    (char*)"var_owner",
    (char*)"is_lazy_list",
    (char*)"output_vars",
    (char*)"node_prereqs",
    (char*)"node_output_size",
    NULL};

  PyObject *compute_map_list=NULL,
    *base_input_output_list=NULL,
    *node_n_inputs=NULL,
    *node_n_outputs=NULL,
    *node_input_offset=NULL,
    *node_output_offset=NULL,
    *var_owner=NULL,
    *is_lazy=NULL,
    *output_vars=NULL,
    *node_prereqs=NULL,
    *node_output_size=NULL;

  assert(!self->nodes);
  // "O" codes store borrowed references directly into self; they are
  // promoted to owned references by the Py_INCREFs just below.
  if (! PyArg_ParseTupleAndKeywords(args, kwds, "OOOiOOOOOOOOOOOOO", kwlist,
                                    &self->nodes,
                                    &self->thunks,
                                    &self->pre_call_clear,
                                    &self->allow_gc,
                                    &self->call_counts,
                                    &self->call_times,
                                    &compute_map_list,
                                    &base_input_output_list,
                                    &node_n_inputs,
                                    &node_n_outputs,
                                    &node_input_offset,
                                    &node_output_offset,
                                    &var_owner,
                                    &is_lazy,
                                    &output_vars,
                                    &node_prereqs,
                                    &node_output_size
                                    ))
    return -1;
  Py_INCREF(self->nodes);
  Py_INCREF(self->thunks);
  Py_INCREF(self->pre_call_clear);
  Py_INCREF(self->call_counts);
  Py_INCREF(self->call_times);

  Py_ssize_t n_applies = PyList_Size(self->nodes);
  self->n_applies = n_applies;
  self->n_vars = PyList_Size(var_owner);

  // NOTE(review): these consistency checks return -1 WITHOUT setting a
  // Python exception, which CPython reports as a generic SystemError.
  if (PyList_Size(self->thunks) != n_applies) return -1;
  if (PyList_Size(self->call_counts) != n_applies) return -1;
  if (PyList_Size(self->call_times) != n_applies) return -1;

  // allocated and initialize thunk_cptr_data and thunk_cptr_fn
  // NOTE(review): malloc results are only checked via assert() here and
  // below; under -DNDEBUG a failed allocation would go unnoticed.
  if (n_applies)
  {
    self->thunk_cptr_data = (void**)malloc(n_applies * sizeof(void*));
    self->thunk_cptr_fn = (void**)malloc(n_applies * sizeof(void*));
    self->is_lazy = (int*)malloc(n_applies * sizeof(int));
    self->node_prereqs = (Py_ssize_t**)malloc(n_applies*sizeof(Py_ssize_t*));
    self->node_n_prereqs = (Py_ssize_t*)malloc(n_applies*sizeof(Py_ssize_t));
    assert(self->node_prereqs);
    assert(self->node_n_prereqs);
    assert(self->is_lazy);
    assert(self->thunk_cptr_fn);
    assert(self->thunk_cptr_data);

    // init these basic arrays
    for (int i = 0; i < n_applies; ++i)
    {
      self->thunk_cptr_data[i] = NULL;
      self->thunk_cptr_fn[i] = NULL;
      self->is_lazy[i] = 1;
      self->node_prereqs[i] = NULL;
      self->node_n_prereqs[i] = 0;
    }

    for (int i = 0; i < n_applies; ++i)
    {
      PyObject * thunk = PyList_GetItem(self->thunks, i);
      //thunk is borrowed
      // A thunk with a "cthunk" attribute wraps a compiled C function;
      // cache its entry point and data pointer for the c_call fast path.
      if (PyObject_HasAttrString(thunk, "cthunk"))
      {
        PyObject * cthunk = PyObject_GetAttrString(thunk, "cthunk");
        //new reference
        assert (cthunk && PyCObject_Check(cthunk));
        self->thunk_cptr_fn[i] = PyCObject_AsVoidPtr(cthunk);
        self->thunk_cptr_data[i] = PyCObject_GetDesc(cthunk);
        Py_DECREF(cthunk);
        // cthunk is kept alive by membership in self->thunks
      }
      else
      {
        self->thunk_cptr_fn[i] = NULL;
        self->thunk_cptr_data[i] = NULL;
      }

      PyObject * el_i = PyList_GetItem(is_lazy, i);
      self->is_lazy[i] = PyNumber_AsSsize_t(el_i, NULL);

      /* now get the prereqs */
      el_i = PyList_GetItem(node_prereqs, i);
      assert (PyList_Check(el_i));
      self->node_n_prereqs[i] = PyList_Size(el_i);
      if (self->node_n_prereqs[i])
      {
        self->node_prereqs[i] = (Py_ssize_t*)malloc(
                          PyList_Size(el_i)*sizeof(Py_ssize_t));
        for (int j = 0; j < PyList_Size(el_i); ++j)
        {
          PyObject * el_ij = PyList_GetItem(el_i, j);
          Py_ssize_t N = PyNumber_AsSsize_t(el_ij, PyExc_IndexError);
          if (PyErr_Occurred())
            return -1;
          // N < n. variables
          assert(N < PyList_Size(var_owner));
          self->node_prereqs[i][j] = N;
        }
      }
    }
  }
  // Flattened per-node input/output variable indices: node_inputs[i] and
  // node_outputs[i] are offsets into one shared base buffer.
  if (PyList_Check(base_input_output_list))
  {
    Py_ssize_t n_inputs_outputs_base = PyList_Size(base_input_output_list);
    self->node_inputs_outputs_base = (Py_ssize_t*)malloc(n_inputs_outputs_base*sizeof(Py_ssize_t));
    assert(self->node_inputs_outputs_base);
    for (int i = 0; i < n_inputs_outputs_base; ++i)
    {
      PyObject *el_i = PyList_GetItem(base_input_output_list, i);
      Py_ssize_t idx = PyNumber_AsSsize_t(el_i, PyExc_IndexError);
      if (PyErr_Occurred()) return -1;
      self->node_inputs_outputs_base[i] = idx;
    }
    self->node_n_inputs = (Py_ssize_t*)malloc(n_applies*sizeof(Py_ssize_t));
    assert(self->node_n_inputs);
    self->node_n_outputs = (Py_ssize_t*)malloc(n_applies*sizeof(Py_ssize_t));
    assert(self->node_n_outputs);
    self->node_inputs = (Py_ssize_t**)malloc(n_applies*sizeof(Py_ssize_t*));
    assert(self->node_inputs);
    self->node_outputs = (Py_ssize_t**)malloc(n_applies*sizeof(Py_ssize_t*));
    assert(self->node_outputs);
    for (int i = 0; i < n_applies; ++i)
    {
      Py_ssize_t N;
      N = PyNumber_AsSsize_t(PyList_GetItem(node_n_inputs, i),PyExc_IndexError);
      if (PyErr_Occurred()) return -1;
      assert (N <= n_inputs_outputs_base);
      self->node_n_inputs[i] = N;
      N = PyNumber_AsSsize_t(PyList_GetItem(node_n_outputs, i),PyExc_IndexError);
      if (PyErr_Occurred()) return -1;
      assert (N <= n_inputs_outputs_base);
      self->node_n_outputs[i] = N;
      N = PyNumber_AsSsize_t(PyList_GetItem(node_input_offset, i),PyExc_IndexError);
      if (PyErr_Occurred()) return -1;
      assert (N <= n_inputs_outputs_base);
      self->node_inputs[i] = &self->node_inputs_outputs_base[N];
      N = PyNumber_AsSsize_t(PyList_GetItem(node_output_offset, i),PyExc_IndexError);
      if (PyErr_Occurred()) return -1;
      assert (N <= n_inputs_outputs_base);
      self->node_outputs[i] = &self->node_inputs_outputs_base[N];
    }
  }
  else
  {
    PyErr_SetString(PyExc_TypeError, "base_input_output_list must be list");
    return -1;
  }

  // allocation for var_owner
  if (PyList_Check(var_owner))
  {
    self->var_owner = (Py_ssize_t*)malloc(self->n_vars*sizeof(Py_ssize_t));
    self->var_has_owner = (int*)malloc(self->n_vars*sizeof(int));
    self->var_computed = (int*)malloc(self->n_vars*sizeof(int));
    self->var_computed_cells = (PyObject**)malloc(self->n_vars*sizeof(PyObject*));
    for (int i = 0; i < self->n_vars; ++i)
    {
      PyObject * el_i = PyList_GetItem(var_owner, i);
      if (el_i == Py_None)
      {
        // graph inputs/constants have no owner node
        self->var_has_owner[i] = 0;
      }
      else
      {
        Py_ssize_t N = PyNumber_AsSsize_t(el_i, PyExc_IndexError);
        if (PyErr_Occurred()) return -1;
        assert (N <= n_applies);
        self->var_owner[i] = N;
        self->var_has_owner[i] = 1;
      }
      // keep an owned reference to each compute-map cell
      self->var_computed_cells[i] = PyList_GetItem(compute_map_list, i);
      Py_INCREF(self->var_computed_cells[i]);
    }
  }
  else
  {
    PyErr_SetString(PyExc_TypeError, "var_owner must be list");
    return -1;
  }

  //output vars
  if (PyList_Check(output_vars))
  {
    self->n_output_vars = PyList_Size(output_vars);
    self->output_vars = (Py_ssize_t*)malloc(self->n_output_vars*sizeof(Py_ssize_t));
    assert(self->output_vars);
    for (int i = 0; i < self->n_output_vars; ++i)
    {
      PyObject * el_i = PyList_GetItem(output_vars, i);
      Py_ssize_t N = PyNumber_AsSsize_t(el_i, PyExc_IndexError);
      if (PyErr_Occurred()) return -1;
      assert (N <= self->n_vars);
      self->output_vars[i] = N;
    }
  }
  else
  {
    PyErr_SetString(PyExc_TypeError, "output_vars must be list");
    return -1;
  }
  return 0;
}
/* Record the index of the first thunk that failed.  Later failures (for
 * example while unwinding the recursion) do not overwrite the original
 * position: only the first assignment after a reset to -1 sticks. */
static void set_position_of_error(CLazyLinker * self, int owner_idx)
{
  if (self->position_of_error != -1)
    return;
  self->position_of_error = owner_idx;
}
/* Invoke the Python thunk for node `node_idx` and return its result
 * (new reference, or NULL on exception).  The thunk itself is a borrowed
 * reference from self->thunks.  When timing is enabled, wall-clock time
 * is accumulated into call_times and call_counts is incremented. */
static PyObject * pycall(CLazyLinker * self, Py_ssize_t node_idx, int verbose)
{
  PyObject * thunk = PyList_GetItem(self->thunks, node_idx);
  if (!self->do_timing)
  {
    if (verbose) fprintf(stderr, "calling via Python (node %i)\n", (int)node_idx);
    return PyObject_CallObject(thunk, NULL);
  }
  double t_start = pytime(NULL);
  if (verbose) fprintf(stderr, "calling via Python (node %i)\n", (int)node_idx);
  PyObject * rval = PyObject_CallObject(thunk, NULL);
  double t_end = pytime(NULL);
  double prev = PyFloat_AsDouble(PyList_GetItem(self->call_times, node_idx));
  PyList_SetItem(self->call_times, node_idx,
                 PyFloat_FromDouble(prev + (t_end - t_start)));
  long n_calls = PyInt_AsLong(PyList_GetItem(self->call_counts, node_idx));
  PyList_SetItem(self->call_counts, node_idx, PyInt_FromLong(n_calls + 1));
  return rval;
}
/* Invoke the compiled C thunk for node `node_idx`.  Returns the thunk's
   error code (0 on success).  On failure, the exception that the
   CLinker-generated code stashed in its __ERROR list is re-raised as the
   current Python exception, and the failing position is recorded. */
static int c_call(CLazyLinker * self, Py_ssize_t node_idx, int verbose)
{
  void * ptr_addr = self->thunk_cptr_fn[node_idx];
  int (*fn)(void*) = (int (*)(void*))(ptr_addr);
  if (verbose) fprintf(stderr, "calling non-lazy shortcut (node %i)\n", (int)node_idx);
  int err = 0;
  if (self->do_timing)
  {
    double t0 = pytime(NULL);
    err = fn(self->thunk_cptr_data[node_idx]);
    double t1 = pytime(NULL);
    // accumulate wall time and bump the per-node call counter
    double ti = PyFloat_AsDouble(PyList_GetItem(self->call_times, node_idx));
    PyList_SetItem(self->call_times, node_idx, PyFloat_FromDouble(t1 - t0 + ti));
    PyObject * count = PyList_GetItem(self->call_counts, node_idx);
    long icount = PyInt_AsLong(count);
    PyList_SetItem(self->call_counts, node_idx, PyInt_FromLong(icount+1));
  }
  else
  {
    err = fn(self->thunk_cptr_data[node_idx]);
  }
  if (err)
  {
    // cast the argument to a PyList (as described near line 226 of cc.py)
    PyObject * __ERROR = ((PyObject**)self->thunk_cptr_data[node_idx])[0];
    assert (PyList_Check(__ERROR));
    assert (PyList_Size(__ERROR) == 3);
    PyObject * err_type = PyList_GetItem(__ERROR, 0); //stolen ref
    PyObject * err_msg = PyList_GetItem(__ERROR, 1); //stolen ref
    PyObject * err_trace = PyList_GetItem(__ERROR, 2); //stolen ref
    // Replace the slots with None so the list no longer owns the error
    // objects; their references are handed to PyErr_Restore below.
    PyList_SET_ITEM(__ERROR, 0, Py_None); Py_INCREF(Py_None); //clobbers old ref
    PyList_SET_ITEM(__ERROR, 1, Py_None); Py_INCREF(Py_None); //clobbers old ref
    PyList_SET_ITEM(__ERROR, 2, Py_None); Py_INCREF(Py_None); //clobbers old ref
    assert(!PyErr_Occurred()); // because CLinker hid the exception in __ERROR aka data
    PyErr_Restore(err_type, err_msg, err_trace); //steals refs to args
  }
  if (err) set_position_of_error(self, node_idx);
  return err;
}
/* Recursively ensure that variable `var_idx` has been computed.
 *
 * Walks backwards through the graph: first evaluates the owner node's
 * prerequisites, then the node itself.  Lazy thunks follow a protocol in
 * which the thunk returns a list of input positions it needs evaluated
 * before it can run again, or an exhausted list once output is written.
 * `one` and `zero` are the shared PyInt objects written into the
 * compute-map cells.  Returns 0 on success, nonzero on error (with a
 * Python exception set and position_of_error recorded).
 *
 * Fixes vs. previous revision:
 *  - "lazy thunk should list" error message completed;
 *  - rval no longer leaks when a non-lazy thunk returns a bad object;
 *  - position_of_error is only recorded when an error actually occurred,
 *    so a successful lazy re-evaluation no longer poisons it for later
 *    genuine errors in the same call.
 */
static
int lazy_rec_eval(CLazyLinker * self, Py_ssize_t var_idx, PyObject*one, PyObject*zero)
{
  int verbose = 0;
  if (verbose) fprintf(stderr, "lazy_rec computing %i\n", (int)var_idx);
  int err = 0;
  // Already computed, or a graph input with no owner: nothing to do.
  if (self->var_computed[var_idx] || !self->var_has_owner[var_idx])
  {
    return 0;
  }
  else
  {
    Py_ssize_t owner_idx = self->var_owner[var_idx];

    // STEP 1: compute the pre-requirements of the node
    for (int i = 0; i < self->node_n_prereqs[owner_idx]; ++i)
    {
      Py_ssize_t prereq_idx = self->node_prereqs[owner_idx][i];
      if (!self->var_computed[prereq_idx])
      {
        err = lazy_rec_eval(self, prereq_idx, one, zero);
        if (err) return err;
      }
      assert (self->var_computed[prereq_idx]);
    }

    // STEP 2: compute the node itself
    if (self->is_lazy[owner_idx])
    {
      // update the compute_map cells corresponding to the inputs of this thunk
      for (int i = 0; i < self->node_n_inputs[owner_idx] && (!err); ++i)
      {
        int in_idx = self->node_inputs[owner_idx][i];
        if (self->var_computed[in_idx])
        {
          Py_INCREF(one);
          err = PyList_SetItem(self->var_computed_cells[in_idx], 0, one);
        }
        else
        {
          Py_INCREF(zero);
          err = PyList_SetItem(self->var_computed_cells[in_idx], 0, zero);
        }
      }
      if (err)
      {
        set_position_of_error(self, owner_idx);
        return err;
      }
      PyObject * rval = pycall(self, owner_idx, verbose);
      // refcounting - rval is new ref
      //TODO: to prevent infinite loops
      // - consider check that a thunk does not ask for an input that is already computed
      if (rval) //call returned normally (no exception)
      {
        //update the computed-ness of any output cells
        for (int i = 0; i < self->node_n_outputs[owner_idx]; ++i)
        {
          int out_idx = self->node_outputs[owner_idx][i];
          PyObject * el_i = PyList_GetItem(self->var_computed_cells[out_idx], 0);
          Py_ssize_t N = PyNumber_AsSsize_t(el_i, PyExc_IndexError);
          if (PyErr_Occurred())
          {
            Py_DECREF(rval);
            set_position_of_error(self, owner_idx);
            return -1;
          }
          assert (N==0 || N==1);
          self->var_computed[out_idx] = N;
        }
        if (!self->var_computed[var_idx])
        {
          // The thunk did not compute our variable yet: it returned the
          // list of input positions it needs evaluated first.
          if (PyList_Check(rval))
          {
            if (PyList_Size(rval))
            {
              for (int i = 0; i < PyList_Size(rval) && (!err); ++i)
              {
                PyObject * el_i = PyList_GetItem(rval, i);
                Py_ssize_t N = PyNumber_AsSsize_t(el_i, PyExc_IndexError);
                if (PyErr_Occurred())
                {
                  err = 1;
                }
                else
                {
                  assert (N <= self->node_n_inputs[owner_idx]);
                  Py_ssize_t input_idx = self->node_inputs[owner_idx][N];
                  err = lazy_rec_eval(self, input_idx, one, zero);
                }
              }
              // requested inputs are now ready: retry this variable
              if (!err)
                err = lazy_rec_eval(self, var_idx, one, zero);
            }
            else
            {
              PyErr_SetString(PyExc_ValueError,
                              "lazy thunk returned empty list without computing output");
              err = 1;
              set_position_of_error(self, owner_idx);
            }
            Py_DECREF(rval);
            // BUGFIX: previously recorded unconditionally, which also ran
            // on the success path and froze position_of_error too early.
            if (err) set_position_of_error(self, owner_idx);
            return err;
          }
          else // don't know what it returned, but it wasn't right.
          {
            //TODO: More helpful error to help find *which node* made this
            // bad thunk
            PyErr_SetString(PyExc_TypeError,
                            "lazy thunk should return a list");
            Py_DECREF(rval);
            set_position_of_error(self, owner_idx);
            return 1;
          }
        }
        Py_DECREF(rval);
      }
      else // pycall returned NULL (internal error)
      {
        assert (PyErr_Occurred());
        set_position_of_error(self, owner_idx);
        return 1;
      }
    }
    else //owner is not a lazy op. Ensure all inputs are evaluated.
    {
      // loop over inputs to owner
      // call lazy_rec_eval on each one that is not computed.
      // if there's an error, pass it up the stack
      for (int i = 0; i < self->node_n_inputs[owner_idx]; ++i)
      {
        Py_ssize_t input_idx = self->node_inputs[owner_idx][i];
        if (!self->var_computed[input_idx])
        {
          err = lazy_rec_eval(self, input_idx, one, zero);
          if (err) return err;
        }
        assert (self->var_computed[input_idx]);
      }

      // call the thunk for this owner.
      if (self->thunk_cptr_fn[owner_idx])
      {
        err = c_call(self, owner_idx, verbose);
      }
      else
      {
        PyObject * rval = pycall(self, owner_idx, verbose);
        //rval is new ref
        if (rval) //pycall returned normally (no exception)
        {
          if (rval == Py_None)
          {
            Py_DECREF(rval); //ignore a return of None
          }
          else if (PyList_Check(rval))
          {
            PyErr_SetString(PyExc_TypeError,
                            "non-lazy thunk should return None, not list");
            err = 1;
            set_position_of_error(self, owner_idx);
            Py_DECREF(rval);
          }
          else // don't know what it returned, but it wasn't right.
          {
            PyErr_SetObject(PyExc_TypeError, rval);
            err = 1;
            set_position_of_error(self, owner_idx);
            // BUGFIX: PyErr_SetObject does not steal rval; drop our ref.
            Py_DECREF(rval);
          }
        }
        else // pycall returned NULL (internal error)
        {
          err = 1;
          set_position_of_error(self, owner_idx);
        }
      }
    }

    // loop over all outputs and mark them as computed
    for (int i = 0; i < self->node_n_outputs[owner_idx] && (!err); ++i)
    {
      self->var_computed[self->node_outputs[owner_idx][i]] = 1;
    }
  }
  return err;
}
/* tp_call entry point: run the whole program.
   Optional keyword `time_thunks` (int) toggles per-thunk timing.
   Clears the pre_call_clear cells and the computed flags of all owned
   variables, then lazily evaluates every output variable.
   Returns None on success, NULL (with exception set) on failure. */
PyObject *
CLazyLinker_call(PyObject *_self, PyObject *args, PyObject *kwds)
{
  CLazyLinker * self = (CLazyLinker*)_self;
  static char *kwlist[] = {(char*)"time_thunks", NULL};
  if (! PyArg_ParseTupleAndKeywords(args, kwds, "|i", kwlist,
                                    &self->do_timing))
    return NULL;
  int err = 0;
  self->position_of_error = -1;  // reset error marker for this call
  // shared flag objects written into the compute-map cells below
  PyObject * one = PyInt_FromLong(1);
  PyObject * zero = PyInt_FromLong(0);

  //clear storage of pre_call_clear elements
  // NOTE(review): PyList_Size is called before the PyList_Check assert;
  // order looks inverted but is harmless when the argument is a list.
  Py_ssize_t n_pre_call_clear = PyList_Size(self->pre_call_clear);
  assert(PyList_Check(self->pre_call_clear));
  for (int i = 0; i < n_pre_call_clear; ++i)
  {
    PyObject * el_i = PyList_GetItem(self->pre_call_clear, i);
    Py_INCREF(Py_None);
    PyList_SetItem(el_i, 0, Py_None);
  }
  //clear the computed flag out of all non-input vars
  for (int i = 0; i < self->n_vars; ++i)
  {
    // inputs (no owner) start computed; everything else must be evaluated
    self->var_computed[i] = !self->var_has_owner[i];
    if (self->var_computed[i])
    {
      Py_INCREF(one);
      PyList_SetItem(self->var_computed_cells[i], 0, one);
    }
    else
    {
      Py_INCREF(zero);
      PyList_SetItem(self->var_computed_cells[i], 0, zero);
    }
  }

  // evaluate every requested output, stopping at the first error
  for (int i = 0; i < self->n_output_vars && (!err); ++i)
  {
    err = lazy_rec_eval(self, self->output_vars[i], one, zero);
  }

  Py_DECREF(one);
  Py_DECREF(zero);
  if (err) return NULL;
  Py_INCREF(Py_None);
  return Py_None;
}
#if 0
// Disabled: no methods are currently exposed on CLazyLinker (tp_methods
// is 0 in the type object); kept as a template for future additions.
static PyMethodDef CLazyLinker_methods[] = {
  {
    //"name", (PyCFunction)CLazyLinker_accept, METH_VARARGS, "Return the name, combining the first and last name"
  },
  {NULL}  /* Sentinel */
};
#endif
/* Attributes exposed to Python on CLazyLinker instances. */
static PyMemberDef CLazyLinker_members[] = {
  {(char*)"nodes", T_OBJECT_EX, offsetof(CLazyLinker, nodes), 0,
   (char*)"list of nodes"},
  {(char*)"thunks", T_OBJECT_EX, offsetof(CLazyLinker, thunks), 0,
   (char*)"list of thunks in program"},
  {(char*)"call_counts", T_OBJECT_EX, offsetof(CLazyLinker, call_counts), 0,
   (char*)"number of calls of each thunk"},
  {(char*)"call_times", T_OBJECT_EX, offsetof(CLazyLinker, call_times), 0,
   (char*)"total runtime in each thunk"},
  {(char*)"position_of_error", T_INT, offsetof(CLazyLinker, position_of_error), 0,
   (char*)"position of failed thunk"},
  {(char*)"time_thunks", T_INT, offsetof(CLazyLinker, do_timing), 0,
   (char*)"bool: nonzero means call will time thunks"},
  {NULL}  /* Sentinel */
};
/* Type object for CLazyLinker; instances are created from Python and
   called like functions (tp_call runs the compiled program). */
static PyTypeObject lazylinker_ext_CLazyLinkerType = {
    PyObject_HEAD_INIT(NULL)
    0,                         /*ob_size*/
    "lazylinker_ext.CLazyLinker",             /*tp_name*/
    sizeof(CLazyLinker),             /*tp_basicsize*/
    0,                         /*tp_itemsize*/
    CLazyLinker_dealloc,       /*tp_dealloc*/
    0,                         /*tp_print*/
    0,                         /*tp_getattr*/
    0,                         /*tp_setattr*/
    0,                         /*tp_compare*/
    0,                         /*tp_repr*/
    0,                         /*tp_as_number*/
    0,                         /*tp_as_sequence*/
    0,                         /*tp_as_mapping*/
    0,                         /*tp_hash */
    CLazyLinker_call,          /*tp_call*/
    0,                         /*tp_str*/
    0,                         /*tp_getattro*/
    0,                         /*tp_setattro*/
    0,                         /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE, /*tp_flags*/
    "CLazyLinker object",      /* tp_doc */
    0,                         /* tp_traverse */
    0,                         /* tp_clear */
    0,                         /* tp_richcompare */
    0,                         /* tp_weaklistoffset */
    0,                         /* tp_iter */
    0,                         /* tp_iternext */
    0,//CLazyLinker_methods,   /* tp_methods */
    CLazyLinker_members,       /* tp_members */
    0,                         /* tp_getset */
    0,                         /* tp_base */
    0,                         /* tp_dict */
    0,                         /* tp_descr_get */
    0,                         /* tp_descr_set */
    0,                         /* tp_dictoffset */
    (initproc)CLazyLinker_init,/* tp_init */
    0,                         /* tp_alloc */
    // NOTE(review): this slot is overwritten with PyType_GenericNew in
    // initlazylinker_ext below, so CLazyLinker_new never actually runs.
    CLazyLinker_new,           /* tp_new */
};
/* No module-level functions: only the CLazyLinker type is exported. */
static PyMethodDef lazylinker_ext_methods[] = {
    {NULL}  /* Sentinel */
};
#ifndef PyMODINIT_FUNC  /* declarations for DLL import/export */
#define PyMODINIT_FUNC void
#endif
/* Python 2 module entry point: register the CLazyLinker extension type. */
PyMODINIT_FUNC
initlazylinker_ext(void)
{
    PyObject* m;

    // NOTE(review): this replaces the CLazyLinker_new set in the type
    // struct with PyType_GenericNew; tp_alloc still zero-fills the
    // instance, but the non-zero defaults of CLazyLinker_new (allow_gc=1,
    // position_of_error=-1) are skipped -- confirm this is intended.
    lazylinker_ext_CLazyLinkerType.tp_new = PyType_GenericNew;
    if (PyType_Ready(&lazylinker_ext_CLazyLinkerType) < 0)
        return;
    m = Py_InitModule3("lazylinker_ext", lazylinker_ext_methods,
                       "Example module that creates an extension type.");
    Py_INCREF(&lazylinker_ext_CLazyLinkerType);
    PyModule_AddObject(m, "CLazyLinker", (PyObject *)&lazylinker_ext_CLazyLinkerType);
}
import os
import theano
from theano import config
from theano.gof.compilelock import get_lock, release_lock
from theano.gof import cmodule
# Compile the lazylinker C extension under the compile lock, then import it.
get_lock()
try:
    dirname = 'lazylinker_ext'
    # Location of the C source shipped alongside this module.
    cfile = os.path.join(theano.__path__[0], 'gof', 'lazylinker_c.c')
    # Read the source; close the handle explicitly instead of leaking the
    # anonymous file object until garbage collection.
    cfile_fd = open(cfile)
    try:
        code = cfile_fd.read()
    finally:
        cfile_fd.close()
    loc = os.path.join(config.compiledir, dirname)
    if not os.path.exists(loc):
        os.mkdir(loc)
    cmodule.gcc_module_compile_str(dirname, code, location=loc)
    from lazylinker_ext.lazylinker_ext import *
finally:
    # Release lock on compilation directory.
    release_lock()
...@@ -3,18 +3,21 @@ ...@@ -3,18 +3,21 @@
The `Op` class is the base interface for all operations The `Op` class is the base interface for all operations
compatible with `gof`'s :doc:`graph` routines. compatible with `gof`'s :doc:`graph` routines.
""" """
__authors__ = "theano-dev"
__copyright__ = "(c) 2010, Universite de Montreal"
__license__ = "3-clause BSD License"
__contact__ = "theano-dev <theano-dev@googlegroups.com>"
__docformat__ = "restructuredtext en" __docformat__ = "restructuredtext en"
import logging
from theano import config from theano import config
import graph import graph
import numpy import numpy
import utils import utils
import warnings import warnings
import logging
from theano import config
from env import Env from env import Env
import graph
import cc import cc
......
from copy import deepcopy
import numpy
from theano.gof.op import PureOp
from theano.gof import Apply, generic, Container
from theano.gof.link import LocalLinker, map_storage, add_clear_storage
from theano import function, Mode
from theano.lazycond import ifelse
import theano.tensor as T
class IfElseIfElseIf(PureOp):
    """Lazy four-way conditional, used to exercise the lazy linker.

    Given inputs (c1, t1, c2, t2, c3, t3, f3) it computes::

        t1 if c1 else (t2 if c2 else (t3 if c3 else f3))

    evaluating only the condition and branch values actually needed.
    """

    def __init__(self, inplace=False):
        self.inplace=inplace # check destroyhandler and others to ensure that a view_map with
        #multiple inputs can work
        assert not self.inplace

    def make_node(self, c1, t1, c2,t2,c3,t3,f3):
        # All branch values must share one type; the output takes it too.
        assert t1.type == f3.type
        assert t2.type == t3.type
        assert t3.type == f3.type
        return Apply(self, [c1,t1,c2,t2,c3,t3,f3], [t1.type()])

    def make_thunk(self, node, storage_map, compute_map, no_recycling):
        # compute_map cells: cell[0] is 1 once the variable is computed.
        input_computed = [compute_map[v] for v in node.inputs]
        output_computed = [compute_map[v] for v in node.outputs]
        # storage_map cells: cell[0] holds the variable's value.
        input_registers = [storage_map[v] for v in node.inputs]
        output_registers = [storage_map[v] for v in node.outputs]

        outtype = node.outputs[0].type

        def thunk():
            # Lazy thunk protocol: return [i] to request that input i be
            # computed first; return [] once the output has been written.
            if not input_computed[0][0]:
                return [0]
            else:
                truthval = input_registers[0][0]
                if truthval:
                    if not input_computed[1][0]:
                        return [1]
                    else:
                        output_computed[0][0]=1
                        output_registers[0][0]=outtype.filter(deepcopy(input_registers[1][0]))
                        return []
                else:
                    if not input_computed[2][0]:
                        return [2]
                    else:
                        truthval = input_registers[2][0]
                        if truthval:
                            if not input_computed[3][0]:
                                return [3]
                            else:
                                output_computed[0][0] = 1
                                output_registers[0][0] = outtype.filter(deepcopy(input_registers[3][0]))
                                return []
                        else:
                            if not input_computed[4][0]:
                                return [4]
                            else:
                                truthval = input_registers[4][0]
                                if truthval:
                                    if not input_computed[5][0]:
                                        return [5]
                                    else:
                                        output_computed[0][0] = 1
                                        output_registers[0][0] = outtype.filter(deepcopy(input_registers[5][0]))
                                        return []
                                else:
                                    if not input_computed[6][0]:
                                        return [6]
                                    else:
                                        output_computed[0][0] = 1
                                        output_registers[0][0] = outtype.filter(deepcopy(input_registers[6][0]))
                                        return []
        thunk.lazy = True
        return thunk
class NotImplementedOp(PureOp):
    """Op whose thunk raises ``NotImplementedOp.E`` whenever it runs.

    Used to verify that lazy evaluation really skips untaken branches:
    if this Op's node is ever evaluated, the test sees the exception.
    """

    class E(Exception):
        """Raised when a NotImplementedOp node is evaluated."""

    def make_node(self, x):
        # Single input; the output has the same type.
        return Apply(self, [x], [x.type()])

    def make_thunk(self, node, storage_map, compute_map, no_recycling):
        op = self

        def thunk():
            raise op.E()

        # Non-lazy: the VM calls it eagerly once all inputs are ready.
        thunk.lazy = False
        return thunk
def test_ifelse():
a = generic()
b = generic()
c = generic()
notimpl = NotImplementedOp()
f = function([a,b,c], ifelse(a, notimpl(b), c),
mode=Mode(linker='vm', optimizer='fast_run'))
try:
print "case 1"
f( True, 'a', 'b')
assert False
except NotImplementedOp.E:
pass
print "... passed"
print "case 2"
print f( False, 'a', 'b')
assert f( False, 'a', 'b') == 'b'
print "... passed"
def more_complex_test():
notimpl = NotImplementedOp()
ifelseifelseif = IfElseIfElseIf()
x1 = T.scalar('x1')
x2 = T.scalar('x2')
c1 = generic('c1')
c2 = generic('c2')
t1 = ifelse(c1,x1,notimpl(x2))
t1.name = 't1'
t2 = t1*10
t2.name = 't2'
t3 = ifelse(c2,t2, x1+t1)
t3.name = 't3'
t4 = ifelseifelseif(T.eq(x1,x2), x1, T.eq(x1,5), x2, c2, t3, t3+0.5)
t4.name = 't4'
f = function([c1,c2,x1,x2], t4, mode=Mode(linker='vm', optimizer='fast_run'))
print f(1, 0, numpy.array(10,dtype=x1.dtype),0)
assert f(1,0,numpy.array(10,dtype=x1.dtype),0) == 20.5
print '... passed'
# Allow running this test module directly as a script.
if __name__ == '__main__':
    more_complex_test()
import gc
import sys
import time
try:
import line_profiler
except ImportError:
pass
import numpy
from theano import function
from theano.gof import vm,link, OpWiseCLinker
from theano.compile import Mode
from theano import tensor
from theano.lazycond import ifelse
import theano
def test_speed():
    """Benchmark (not a correctness test): print seconds per thousand
    additions for several linkers and for raw numpy."""

    def build_graph(x, depth=5):
        # Chain of `depth` symbolic additions: z = z + z, repeated.
        z = x
        for d in range(depth):
            z = (z + z)
        return z

    def numpy_version(x, depth):
        # Same chain evaluated eagerly with numpy, for a baseline.
        z = x
        for d in xrange(depth):
            z = (z+z)
        return z

    def time_numpy():
        steps_a = 5
        steps_b = 100
        x = numpy.asarray([2.0, 3.0], dtype=theano.config.floatX)

        # warm-up call, so the timed runs below exclude first-call costs
        numpy_version(x, steps_a)
        t0 = time.time()
        print numpy_version(x, steps_a)
        t1 = time.time()
        t2 = time.time()
        print numpy_version(x, steps_b)
        t3 = time.time()

        t_a = t1 - t0
        t_b = t3 - t2

        # Subtracting the short run removes the constant per-call
        # overhead, leaving the marginal cost per additional step.
        print "%s takes %f s/Kop" % (
            'numpy',
            (1000*(t_b-t_a) / (steps_b - steps_a)))

    def time_linker(name, linker):
        steps_a = 5
        steps_b = 100
        x = tensor.vector()
        a = build_graph(x,steps_a)
        b = build_graph(x,steps_b)

        f_a = function([x], a,
                mode=Mode(optimizer=None, linker=linker()),
                #profile='f_a speed test %s'%name,
                )
        f_b = function([x], b,
                mode=Mode(optimizer=None, linker=linker()),
                #profile='f_b speed test %s'%name,
                )

        # First call of each function is a warm-up; the second is timed.
        print f_a([2.0, 3.0])
        t0 = time.time()
        print f_a([2.0, 3.0])
        t1 = time.time()

        print f_b([2.0, 3.0])
        t2 = time.time()
        print f_b([2.0, 3.0])
        t3 = time.time()

        t_a = t1 - t0
        t_b = t3 - t2

        print "%s takes %f s/Kop" % (
            name,
            (1000*(t_b-t_a) / (steps_b - steps_a)))

    time_linker('c|py', OpWiseCLinker)
    time_linker('vmLinker', vm.VM_Linker)
    time_linker('vmLinker_nogc', lambda : vm.VM_Linker(allow_gc=False))
    time_linker('vmLinker_CLOOP', lambda : vm.VM_Linker(allow_gc=False,
                                                        use_cloop=True))
    time_numpy()
def test_speed_lazy():
    """Benchmark (not a correctness test): time the VM linkers on a chain
    of lazy ``ifelse`` nodes, printing seconds per thousand ops."""

    def build_graph(x, depth=5):
        # Chain of `depth` lazy conditionals: z = -z if z > 0 else z.
        z = x
        for d in range(depth):
            z = ifelse(z> 0, -z, z)
        return z

    def time_linker(name, linker):
        steps_a = 10
        steps_b = 100
        x = tensor.vector()
        a = build_graph(x, steps_a)
        b = build_graph(x, steps_b)

        f_a = function([x], a,
                mode=Mode(optimizer=None,
                    linker=linker()),
                #profile='f_a lazy ifelse %s'%name,
                )
        f_b = function([x], b,
                mode=Mode(optimizer=None,
                    linker=linker()),
                #profile='f_b lazy ifelse %s'%name,
                )

        # First call of each function is a warm-up; the second is timed.
        print f_a([2.0])
        t0 = time.time()
        print f_a([2.0])
        t1 = time.time()

        print f_b([2.0])
        t2 = time.time()
        print f_b([2.0])
        t3 = time.time()

        t_a = t1 - t0
        t_b = t3 - t2

        # Marginal cost per step, scaled to a thousand operations.
        print "%s takes %f s/Kop" % (
            name,
            (1000*(t_b-t_a) / (steps_b - steps_a)))

    time_linker('vmLinker', vm.VM_Linker)
    time_linker('vmLinker_nogc', lambda : vm.VM_Linker(allow_gc=False))
    time_linker('vmLinker_C', lambda : vm.VM_Linker(allow_gc=False,
                                                    use_cloop=True))
run_memory_usage_tests = False
if run_memory_usage_tests:
# these are not normal unit tests, do not run them as part of standard
# suite. I ran them while looking at top, and stopped when memory usage was
# stable.
    def test_leak2():
        # Manual leak check: wrap many numpy arrays in CudaNdarray and
        # verify the numpy refcount is unchanged; watch `top` for growth.
        import theano.sandbox.cuda as cuda
        for i in xrange(1000000):
            n = numpy.asarray([2.3, 4.5], dtype='f')
            c = sys.getrefcount(n)
            a = cuda.CudaNdarray(n)
            # wrapping must not change the refcount of the source array
            assert c == sys.getrefcount(n)
            # periodic progress marker + forced collections
            if not i % 1000:
                print '.',
                print gc.collect(),
                print gc.collect()
            sys.stdout.flush()
def test_no_leak_many_graphs():
# Verify no memory leaks when creating and deleting a lot of functions
# This isn't really a unit test, you have to run it and look at top to see
# if there's a leak
for i in xrange(10000):
x = tensor.vector()
z = x
for d in range(10):
z = tensor.sin(-z+ 1)
f = function([x], z, mode=Mode(optimizer=None, linker='cvm'))
if not i % 100:
print gc.collect()
sys.stdout.flush()
gc.collect()
if 1:
f([2.0])
f([3.0])
f([4.0])
f([5.0])
def test_no_leak_many_call_lazy():
# Verify no memory leaks when calling a function a lot of times
# This isn't really a unit test, you have to run it and look at top to see
# if there's a leak
def build_graph(x, depth=5):
z = x
for d in range(depth):
z = ifelse(z> 0, -z, z)
return z
def time_linker(name, linker):
steps_a = 10
x = tensor.vector()
a = build_graph(x, steps_a)
f_a = function([x], a,
mode=Mode(optimizer=None,
linker=linker()))
for i in xrange(100000):
f_a([2.0])
if 0: # this doesn't seem to work, prints 0 for everything
import resource
pre = resource.getrusage(resource.RUSAGE_SELF)
post = resource.getrusage(resource.RUSAGE_SELF)
print pre.ru_ixrss, post.ru_ixrss
print pre.ru_idrss, post.ru_idrss
print pre.ru_maxrss, post.ru_maxrss
time_linker('vmLinker_C', lambda : vm.VM_Linker(allow_gc=False, use_cloop=True))
def test_no_leak_many_call_nonlazy():
# Verify no memory leaks when calling a function a lot of times
# This isn't really a unit test, you have to run it and look at top to see
# if there's a leak
def build_graph(x, depth=5):
z = x
for d in range(depth):
z = tensor.sin(-z+1)
return z
def time_linker(name, linker):
steps_a = 10
x = tensor.vector()
a = build_graph(x,steps_a)
f_a = function([x], a,
mode=Mode(optimizer=None,
linker=linker()))
for i in xrange(500000):
f_a([2.0])
time_linker('vmLinker_C', lambda : vm.VM_Linker(allow_gc=False, use_cloop=True))
"""
VMs that run Theano graph computations.
"""
import sys
import time
import link
import traceback
from theano.gof.python25 import all
import theano
config = theano.config
from theano.configparser import config, AddConfigVar, BoolParam
from theano import config
AddConfigVar('profile',
"If VM should collect profile information",
BoolParam(False))
def raise_with_op(op, exc_info = None):
    """Re-raise the current (or given) exception, annotated with `op`.

    The exception value gains a ``__thunk_trace__`` attribute (the graph
    construction trace of `op`, when available), and `op` itself plus its
    position in the toposort are appended to ``exc_value.args`` so the
    failing Apply node can be identified from the traceback.

    :param op: the Apply node whose thunk raised
    :param exc_info: a ``sys.exc_info()`` triple; defaults to the exception
        currently being handled
    """
    if exc_info is None:
        exc_info = sys.exc_info()
    exc_type, exc_value, exc_trace = exc_info
    if exc_type == KeyboardInterrupt:
        # print a simple traceback from KeyboardInterrupt: do not annotate
        raise exc_type, exc_value, exc_trace
    try:
        trace = op.tag.trace
    except AttributeError:
        # the node was built without a recorded construction trace
        trace = ()
    exc_value.__thunk_trace__ = trace
    exc_value.args += (op, )
    if op in op.env.toposort():
        # also record where in the execution order the failure happened
        exc_value.args += ('Sequence id of Apply node='+str(op.env.toposort().index(op)),)
    # re-raise with the original traceback preserved (Python 2 syntax)
    raise exc_type, exc_value, exc_trace
class VM(object):
    """
    A VM object evaluates a Theano program with its __call__ method.

    Attributes:

    call_counts - list of integers, one for each thunk. call_counts[i] is
        the number of times thunks[i] was called in the course of
        computations performed by call_with_timers().
    call_times - list of floats, one for each thunk. call_times[i] is the
        amount of runtime spent on thunks[i] in the course of computations
        performed by call_with_timers().
    """
    def __init__(self, nodes, thunks, pre_call_clear):
        """
        Allocate a virtual machine.

        nodes - a list of nodes in toposort order
        thunks - a list of thunks to execute those nodes, in toposort order
        pre_call_clear - a list of containers to empty at the beginning of
            each call.
        """
        if len(nodes) != len(thunks):
            raise ValueError('nodes and thunks must have the same length')
        self.nodes = nodes
        self.thunks = thunks
        self.pre_call_clear = pre_call_clear
        self.call_counts = [0] * len(nodes)
        self.call_times = [0] * len(nodes)
        # subclasses only record per-thunk timing when this flag is set
        self.time_thunks = False

    def __call__(self):
        """
        Run the machine.

        Postcondition - all output variables have been computed.  VMs vary
        in what exactly this means and how it is done.
        """
        raise NotImplementedError('override me')

    def clear_storage(self):
        """
        Free any internal references to temporary variables.

        Free internal variables and outputs.  Essentially, free as much
        memory as possible without interfering with the ability to evaluate
        subsequent calls.
        """
        raise NotImplementedError('override me')

    def update_profile(self, profile):
        """Accumulate this VM's timing counters into `profile`, then reset
        them so the next round of calls starts from zero."""
        for node, thunk, t, c in zip(self.nodes, self.thunks,
                self.call_times, self.call_counts):
            profile.apply_time.setdefault(node, 0.0)
            profile.apply_time[node] += t
            profile.apply_callcount.setdefault(node, 0)
            # accumulate like apply_time does; plain assignment here
            # discarded the counts recorded by earlier update_profile calls
            profile.apply_callcount[node] += c
            profile.apply_cimpl[node] = hasattr(thunk, 'cthunk')
        # clear the timer info out of the buffers
        for i in range(len(self.call_times)):
            self.call_times[i] = 0.0
            self.call_counts[i] = 0
class Loop(VM):
    """
    Run every thunk once, first to last, in Python.

    Intermediate results are never garbage collected, so all storage stays
    allocated across the whole call.
    """
    def __call__(self):
        # empty the no-recycling containers before computing anything
        for container in self.pre_call_clear:
            container[0] = None
        node = None
        if self.time_thunks:
            try:
                for idx, (thunk, node) in enumerate(
                        zip(self.thunks, self.nodes)):
                    start = time.time()
                    thunk()
                    self.call_times[idx] += time.time() - start
                    self.call_counts[idx] += 1
            except:
                # `node` is the Apply whose thunk just raised
                raise_with_op(node)
        else:
            try:
                for thunk, node in zip(self.thunks, self.nodes):
                    thunk()
            except:
                raise_with_op(node)
class LoopGC(VM):
    """
    Run every thunk once, first to last, in Python, freeing each
    intermediate result as soon as its last consumer has executed.
    """
    def __init__(self, nodes, thunks, pre_call_clear, post_thunk_clear):
        """
        post_thunk_clear - for each node, the storage cells to empty right
        after that node's thunk has run.
        """
        super(LoopGC, self).__init__(nodes, thunks, pre_call_clear)
        self.post_thunk_clear = post_thunk_clear
        if not (len(nodes) == len(thunks) == len(post_thunk_clear)):
            raise ValueError()

    def __call__(self):
        # empty the no-recycling containers before computing anything
        for container in self.pre_call_clear:
            container[0] = None
        node = None
        steps = zip(self.thunks, self.nodes, self.post_thunk_clear)
        if self.time_thunks:
            try:
                for idx, (thunk, node, old_storage) in enumerate(steps):
                    start = time.time()
                    thunk()
                    self.call_counts[idx] += 1
                    self.call_times[idx] += time.time() - start
                    # this node was the last consumer of these cells
                    for cell in old_storage:
                        cell[0] = None
            except:
                raise_with_op(node)
        else:
            try:
                for thunk, node, old_storage in steps:
                    thunk()
                    for cell in old_storage:
                        cell[0] = None
            except:
                raise_with_op(node)
class Stack(VM):
    """
    Finish-to-start evaluation order of thunks.

    This supports lazy evaluation of subtrees and partial
    computations of graphs when only some inputs have changed.
    """
    def __init__(self, nodes, thunks, pre_call_clear,
            storage_map, compute_map,
            env, allow_gc):
        """
        storage_map - dict: variable -> one-element list holding its value
        compute_map - dict: variable -> one-element list holding an
            "already computed" flag
        env - the graph whose outputs must be computed
        allow_gc - free intermediate values once nothing else needs them
        """
        super(Stack, self).__init__(nodes, thunks, pre_call_clear)
        self.allow_gc = allow_gc
        self.message = ""
        # evaluation starts from the nodes that produce the graph outputs
        self.base_apply_stack = [o.owner for o in env.outputs if o.owner]
        self.outputs = env.outputs
        self.storage_map = storage_map
        self.apply_time = {}
        self.outputs_size = {}
        self.compute_map = compute_map
        self.node_idx = node_idx = {}
        ords = env.orderings()
        for i, node in enumerate(self.nodes):
            node_idx[node] = i
            self.apply_time[node] = 0
            self.outputs_size[node] = []
            # variables that must be computed before this node may run,
            # as imposed by e.g. the destroy handler
            node.destroy_dependencies = []
            if node in ords:
                for prereq in ords[node]:
                    node.destroy_dependencies += prereq.outputs
        # dependencies[k] lists the variables whose computation consumes k;
        # used by the garbage collector below to decide when k's storage
        # may be freed
        dependencies = self.dependencies = {}
        for k in storage_map:
            dependencies[k] = []
            if k.owner and k.clients:
                ls = []
                for cl in k.clients:
                    # 'output' is a pseudo-client marking graph outputs.
                    # Compare with != : using `is not` on a string literal
                    # relies on interning and is not guaranteed correct.
                    if cl[0] != 'output':
                        ls += cl[0].outputs
                dependencies[k] += ls
        # Bytes per element, keyed by the last 3 characters of the dtype
        # name: int8/uint8 -> 'nt8', int16/float16 -> 't16',
        # int32/float32 -> 't32', int64/float64 -> 't64',
        # complex64 -> 'x64', complex128 -> '128'.
        # (A registration of a nonexistent `atexit_print_all` method via
        # the never-imported `atexit` module was removed here: it crashed
        # whenever config.profile was enabled.)
        self.memory_size_map = {"nt8": 1, "t16": 2, "t32": 4, "t64": 8,
                "x64": 8, "128": 16}

    def _output_sizes(self, thunk):
        """Return the memory footprint, in bytes, of each output of
        `thunk`, with -1 when it cannot be determined.

        Note: for an inplace op this over-reports, since no new memory was
        actually requested.
        """
        sizes = []
        for o in thunk.outputs:
            if not hasattr(o[0], 'size'):
                sizes.append(-1)
                continue
            # look the element size up with .get() so an unknown dtype
            # yields the -1 sentinel instead of a KeyError
            nbytes = self.memory_size_map.get(str(o[0].dtype)[-3:])
            if nbytes is None:
                sizes.append(-1)
            else:
                sizes.append(o[0].size * nbytes)
        return sizes

    def _gc_inputs(self, current_apply):
        """Free the storage of `current_apply`'s inputs that no pending
        computation still needs (and that are not graph outputs)."""
        for i in current_apply.inputs:
            if (self.dependencies[i] and i.owner
                    and i not in self.outputs):
                empty_storage_map = True
                for x in self.dependencies[i]:
                    if not self.compute_map[x][0]:
                        empty_storage_map = False
                        break
                if empty_storage_map:
                    self.storage_map[i][0] = None

    def __call__(self):
        storage_map = self.storage_map
        compute_map = self.compute_map
        thunks = self.thunks
        for k in self.storage_map:
            compute_map[k][0] = (k.owner is None)

        # apply_stack contains nodes
        apply_stack = list(self.base_apply_stack)
        last_apply_stack_len = -1

        while apply_stack:
            # Make sure something happened last time round.  This is just
            # a safety check to make sure the op is written correctly:
            # apply_stack should either decrease in length by one (a thunk
            # successfully applied), or increase in length (added
            # dependencies over and above the original).
            # NB: this doesn't catch cycles (would be too expensive/slow),
            # just stalls.
            apply_stack_len = len(apply_stack)
            assert apply_stack_len != last_apply_stack_len
            last_apply_stack_len = apply_stack_len

            current_apply = apply_stack.pop()

            # Use these for loops + breaks to short circuit evaluation.
            # This is a significant performance point.
            computed_ins = True
            for i in current_apply.inputs:
                if not compute_map[i][0]:
                    computed_ins = False
                    break
            computed_outs = True
            for o in current_apply.outputs:
                if not compute_map[o][0]:
                    computed_outs = False
                    break
            if computed_ins:
                for d in current_apply.destroy_dependencies:
                    if not compute_map[d][0]:
                        computed_ins = False
                        break

            if not thunks[self.node_idx[current_apply]].lazy:
                # Check if all inputs are in place.  If so compute the
                # thunk and leave it off the apply_stack.  If not, put it
                # back and schedule the nodes that will produce the
                # missing inputs.
                if computed_ins and not computed_outs:
                    try:
                        t0 = time.time()
                        thunks[self.node_idx[current_apply]]()
                        if config.profile:
                            dt = time.time() - t0
                            self.apply_time[current_apply] += dt
                            # memory footprint of this op's outputs
                            self.outputs_size[current_apply] = \
                                    self._output_sizes(
                                        thunks[self.node_idx[current_apply]])
                    except Exception:
                        raise_with_op(current_apply)
                    for o in current_apply.outputs:
                        compute_map[o][0] = 1
                    # Garbage Collection -> check if anybody else uses
                    # this input
                    if self.allow_gc:
                        self._gc_inputs(current_apply)
                elif not computed_ins:
                    apply_stack.append(current_apply)
                    apply_stack.extend(inp.owner
                            for inp in current_apply.inputs if inp.owner)
                    apply_stack.extend(inp.owner
                            for inp in current_apply.destroy_dependencies
                            if inp.owner)
            elif not computed_outs:
                # A lazy thunk runs even with missing inputs: it returns
                # the indices of the inputs it still requires.
                try:
                    t0 = time.time()
                    requires = thunks[self.node_idx[current_apply]]()
                    dt = time.time() - t0
                    self.apply_time[current_apply] += dt
                except Exception:
                    raise_with_op(current_apply)
                if requires:
                    for r in requires:
                        # We are not done with this op: put it back and
                        # schedule the owners of the inputs we are missing.
                        apply_stack.append(current_apply)
                        if current_apply.inputs[r].owner:
                            apply_stack.append(
                                    current_apply.inputs[r].owner)
                else:
                    if config.profile:
                        self.outputs_size[current_apply] = \
                                self._output_sizes(
                                    thunks[self.node_idx[current_apply]])
                    if self.allow_gc:
                        self._gc_inputs(current_apply)
# The compiled (C) implementation of the lazy evaluation loop.  It is
# optional: when the extension module cannot be imported, only the pure
# Python VMs above are available.
try:
    import lazylinker_c

    class CVM(lazylinker_c.CLazyLinker, VM):
        def __init__(self, *args, **kwargs):
            # All evaluation state lives in the C struct: initialize only
            # the C base class.
            lazylinker_c.CLazyLinker.__init__(self, *args, **kwargs)
            # skip VM.__init__
except ImportError:
    pass
class VM_Linker(link.LocalLinker):
    """
    Class that satisfies the Linker interface by acting as a VM factory.

    From an accepted Env it builds either a compiled CVM (when
    ``use_cloop`` is set and the lazylinker_c extension is available) or
    one of the Python VMs above: Loop / LoopGC when no thunk is lazy,
    Stack otherwise.
    """
    def __init__(self, allow_gc=True, use_cloop = False):
        # allow_gc - free intermediate storage as soon as no later node
        #     needs it
        # use_cloop - drive evaluation with the compiled CVM loop
        self.env = None
        self.allow_gc = allow_gc
        self.use_cloop=use_cloop

    def accept(self, env, no_recycling = []):
        """
        :param env: a PerformLinker can have accepted one Env instance at a time.
        :param no_recycling: WRITEME
        :returns: self (TODO: WHY? Who calls this function?)
        """
        if self.env is not None and self.env is not env:
            # this linker is already bound to another Env: delegate the new
            # one to a fresh linker of the same type
            return type(self)().accept(env, no_recycling)
        self.env = env
        self.no_recycling = no_recycling
        return self

    def make_vm(self, nodes, thunks,
            input_storage, output_storage, storage_map,
            post_thunk_clear,
            computed,
            compute_map
            ):
        """Instantiate the VM that will evaluate `nodes` via `thunks`.

        Selects CVM, Loop, LoopGC or Stack depending on ``use_cloop``,
        ``allow_gc``, and whether any thunk is lazy.
        """
        pre_call_clear = [storage_map[v] for v in self.no_recycling]
        if self.use_cloop:
            # create a map from nodes to ints and vars to ints
            nodes_idx = {}
            vars_idx = {}
            for i, node in enumerate(nodes):
                nodes_idx[node] = i
                for v in node.inputs + node.outputs:
                    vars_idx.setdefault(v, len(vars_idx))
            for v in self.env.inputs + self.env.outputs:
                vars_idx.setdefault(v, len(vars_idx))
            nodes_idx_inv = {}
            vars_idx_inv = {}
            for (node,i) in nodes_idx.items():
                nodes_idx_inv[i] = node
            for (var,i) in vars_idx.items():
                vars_idx_inv[i] = var
            # put storage_map and compute_map into a int-based scheme
            n_applies = len(nodes)
            storage_map_list = [storage_map[vars_idx_inv[i]]
                    for i in range(len(vars_idx_inv))]
            compute_map_list = [compute_map[vars_idx_inv[i]]
                    for i in range(len(vars_idx_inv))]
            if nodes:
                assert type(storage_map_list[0]) is list
                assert type(compute_map_list[0]) is list
            # build the pointers to node inputs and offsets
            base_input_output_list = []
            node_n_inputs = []
            node_n_outputs = []
            node_input_offset = []
            node_output_offset = []
            for node in nodes:
                inputs_idx = [vars_idx[v] for v in node.inputs]
                outputs_idx = [vars_idx[v] for v in node.outputs]
                node_n_inputs.append(len(inputs_idx))
                node_n_outputs.append(len(outputs_idx))
                node_input_offset.append(len(base_input_output_list))
                base_input_output_list.extend(inputs_idx)
                node_output_offset.append(len(base_input_output_list))
                base_input_output_list.extend(outputs_idx)
            # build the var owner array
            var_owner = [None]*len(vars_idx)
            for (var,i) in vars_idx.items():
                if var.owner:
                    var_owner[i] = nodes_idx[var.owner]
            is_lazy_list = [int(th.lazy) for th in thunks]
            output_vars = [vars_idx[v] for v in self.env.outputs]
            # builds the list of prereqs induced by e.g. destroy_handler
            ords = self.env.orderings()
            node_prereqs = []
            node_output_size = []
            for i, node in enumerate(nodes):
                node_output_size.append(0)
                prereq_var_idxs = []
                for prereq_node in ords.get(node,[]):
                    prereq_var_idxs.extend(
                            [vars_idx[v] for v in prereq_node.outputs])
                prereq_var_idxs = list(set(prereq_var_idxs))
                prereq_var_idxs.sort() # TODO: why sort?
                node_prereqs.append(prereq_var_idxs)
            # sanity check: constructing the CVM must not change the
            # refcount of the lists handed over to it
            c0 = sys.getrefcount(node_n_inputs)
            vm = CVM(
                    nodes,
                    thunks,
                    pre_call_clear,
                    allow_gc=self.allow_gc,
                    call_counts=[0]*len(nodes),
                    call_times=[0.0]*len(nodes),
                    compute_map_list=compute_map_list,
                    base_input_output_list=base_input_output_list,
                    node_n_inputs=node_n_inputs,
                    node_n_outputs=node_n_outputs,
                    node_input_offset=node_input_offset,
                    node_output_offset=node_output_offset,
                    var_owner=var_owner,
                    is_lazy_list=is_lazy_list,
                    output_vars=output_vars,
                    node_prereqs=node_prereqs,
                    node_output_size=node_output_size,
                    )
            assert c0 == sys.getrefcount(node_n_inputs)
        else:
            if all([(not th.lazy) for th in thunks]):
                # there is no conditional in the graph
                if self.allow_gc:
                    vm = LoopGC(
                            nodes,
                            thunks,
                            pre_call_clear,
                            post_thunk_clear)
                else:
                    vm = Loop(
                            nodes,
                            thunks,
                            pre_call_clear)
            else:
                # lazy thunks require the finish-to-start evaluator
                vm = Stack(
                        nodes, thunks, pre_call_clear,
                        storage_map, compute_map,
                        self.env, self.allow_gc
                        )
        return vm

    def make_all(self, profiler = None, input_storage = None, output_storage = None):
        """Build storage, thunks and a VM for the accepted Env.

        Returns the (vm, input containers, output containers, thunks,
        order) tuple expected by the Linker interface.
        """
        env = self.env
        order = list(env.toposort())
        no_recycling = self.no_recycling
        input_storage, output_storage, storage_map = link.map_storage(
                env, order, input_storage, output_storage)
        # graph inputs are considered computed from the start
        compute_map = {}
        for k in storage_map:
            compute_map[k] = [k.owner is None]
        thunks = [node.op.make_thunk(node,
                    storage_map,
                    compute_map,
                    no_recycling)
                for node in order]
        computed, last_user = link.gc_helper(order)
        if self.allow_gc:
            post_thunk_clear = []
            for node in order:
                clear_after_this_thunk = []
                for input in node.inputs:
                    # free an input right after this node when this node is
                    # its last user and it is not a graph output
                    if ((input in computed)
                            and (input not in env.outputs)
                            and (node == last_user[input])):
                        clear_after_this_thunk.append(storage_map[input])
                post_thunk_clear.append(clear_after_this_thunk)
        else:
            post_thunk_clear = None
        vm = self.make_vm(order, thunks,
                input_storage, output_storage, storage_map,
                post_thunk_clear,
                computed,
                compute_map
                )
        return (vm,
                [link.Container(input, storage)
                    for input, storage in zip(env.inputs, input_storage)],
                [link.Container(output, storage, True)
                    for output, storage in zip(env.outputs, output_storage)],
                thunks,
                order)
"""
IfElse is an Op that works with the LazyLinker to support conditional graph evaluation.
:TODO: Add text to library documentation describing the IfElse Op.
"""
from copy import deepcopy
from theano.gof import PureOp, Apply, generic, Container
import theano.tensor
import gof
from compile import optdb
from tensor import opt
@gof.local_optimizer([None])
def ifelse_make_inplace(node):
    """Replace a copying IfElse node by its in-place (view) version.

    :returns: the outputs of a new ``IfElse(as_view=True)`` node when
        `node` is a non-view IfElse, otherwise False so the optimizer
        keeps looking.
    """
    op = node.op
    if isinstance(op, IfElse) and not op.as_view:
        # (an unconditional debug print that fired on every application of
        # this optimization was removed here)
        return IfElse(as_view=True,
                gpu=op.gpu, name=op.name).make_node(*node.inputs).outputs
    return False
# Run late (position 95) in fast_run, so the in-place substitution happens
# only after the rest of the graph has stabilized.
optdb.register('ifelse_make_inplace', opt.in2out(ifelse_make_inplace,
    ignore_newtrees=True), 95, 'fast_run', 'inplace')
class IfElse(PureOp):
    """
    Op that works with LazyLinker to support conditional graph evaluation.

    Example usage:

        ``rval = ifelse(tf, rval_if_true, rval_if_false)``

    :note:
        Other Linkers (ALL other linkers right now) are INCOMPATIBLE with
        this Op, they will produce functions that FAIL TO EXECUTE.
    """
    def __init__(self, as_view=False, gpu = False, name = None):
        if as_view:
            # check destroyhandler and others to ensure that a view_map
            # with multiple inputs can work
            self.view_map = {0: [1]}
            #raise NotImplementedError('IfElse must copy for now')
        self.as_view = as_view
        self.gpu = gpu
        self.name = name

    def make_node(self, c, t, f):
        # either branch may become the output, so both must share a type
        if t.type != f.type:
            raise TypeError(
                'IfElse requires same types for true and false args',
                (t.type, f.type))
        return Apply(self, [c, t, f], [t.type()])

    def make_thunk(self, node, storage_map, compute_map, no_recycling):
        out_type = node.outputs[0].type
        cond, true_in, false_in = node.inputs
        out = node.outputs[0]

        def thunk():
            # Lazy protocol: return the list of input indices still
            # required, or [] once the output has been stored.
            if not compute_map[cond][0]:
                return [0]
            if storage_map[cond][0]:
                if not compute_map[true_in][0]:
                    return [1]
                compute_map[out][0] = 1
                if self.as_view:
                    result = out_type.filter(storage_map[true_in][0])
                else:
                    result = out_type.filter(
                            deepcopy(storage_map[true_in][0]))
                storage_map[out][0] = result
                return []
            if not compute_map[false_in][0]:
                return [2]
            # can't view both outputs unless destroyhandler improves
            compute_map[out][0] = 1
            storage_map[out][0] = out_type.filter(
                    deepcopy(storage_map[false_in][0]))
            return []

        thunk.lazy = True
        thunk.inputs = [storage_map[v] for v in node.inputs]
        thunk.outputs = [storage_map[v] for v in node.outputs]
        return thunk
ifelse = IfElse()
...@@ -391,7 +391,7 @@ default_colorCodes = {'GpuFromHost' : 'red', ...@@ -391,7 +391,7 @@ default_colorCodes = {'GpuFromHost' : 'red',
'HostFromGpu' : 'red', 'HostFromGpu' : 'red',
'Scan' : 'yellow', 'Scan' : 'yellow',
'Shape' : 'cyan', 'Shape' : 'cyan',
'Cond' : 'magenta', 'IfElse' : 'magenta',
'Elemwise': '#FFAABB', 'Elemwise': '#FFAABB',
'Subtensor': '#FFAAFF'} 'Subtensor': '#FFAAFF'}
...@@ -473,10 +473,10 @@ def pydotprint(fct, outfile=None, ...@@ -473,10 +473,10 @@ def pydotprint(fct, outfile=None,
c3 = pd.Cluster('Middle') c3 = pd.Cluster('Middle')
cond = None cond = None
for node in fct_env.toposort(): for node in fct_env.toposort():
if node.op.__class__.__name__=='Cond' and node.op.name == cond_highlight: if node.op.__class__.__name__=='IfElse' and node.op.name == cond_highlight:
cond = node cond = node
if cond is None: if cond is None:
_warn("pydotprint: cond_highlight is set but there is no Cond node in the graph") _warn("pydotprint: cond_highlight is set but there is no IfElse node in the graph")
cond_highlight = None cond_highlight = None
if cond_highlight is not None: if cond_highlight is not None:
......
import atexit, logging, os, stat, sys import atexit, logging, os, stat, sys
from theano.compile import optdb from theano.compile import optdb
from theano import config
from theano.gof.cmodule import get_lib_extension from theano.gof.cmodule import get_lib_extension
from theano.configparser import config, AddConfigVar, StrParam
import nvcc_compiler import nvcc_compiler
_logger_name = 'theano.sandbox.cuda' _logger_name = 'theano.sandbox.cuda'
...@@ -20,6 +20,22 @@ def debug(*msg): ...@@ -20,6 +20,22 @@ def debug(*msg):
_logger.debug('DEBUG (%s): %s'% ( _logger_name, _logger.debug('DEBUG (%s): %s'% ( _logger_name,
' '.join(str(m) for m in msg))) ' '.join(str(m) for m in msg)))
AddConfigVar('cuda.root',
"""directory with bin/, lib/, include/ for cuda utilities.
This directory is included via -L and -rpath when linking dynamically
compiled modules. If AUTO, if nvcc is in the path, it will use one of
this parent directory. Otherwise /usr/local/cuda. Leave empty to
prevent extra linker directives.
Default: environment variable "CUDA_ROOT" or else "AUTO".
""",
StrParam(os.getenv('CUDA_ROOT', "AUTO")))
if config.cuda.root == "AUTO":
# set nvcc_path correctly and get the version
nvcc_compiler.set_cuda_root()
#is_nvcc_available called here to initialize global vars in nvcc_compiler module
nvcc_compiler.is_nvcc_available()
# Compile cuda_ndarray.cu # Compile cuda_ndarray.cu
# This need that nvcc (part of cuda) is installed. If it is not, a warning is # This need that nvcc (part of cuda) is installed. If it is not, a warning is
......
...@@ -7,20 +7,7 @@ import commands ...@@ -7,20 +7,7 @@ import commands
_logger=logging.getLogger("theano.sandbox.cuda.nvcc_compiler") _logger=logging.getLogger("theano.sandbox.cuda.nvcc_compiler")
_logger.setLevel(logging.WARN) _logger.setLevel(logging.WARN)
from theano.configparser import config, AddConfigVar, StrParam from theano.configparser import config, AddConfigVar, StrParam, BoolParam
AddConfigVar('nvcc.compiler_bindir',
"If defined, nvcc compiler driver will seek g++ and gcc in this directory",
StrParam(""))
AddConfigVar('cuda.nvccflags',
"Extra compiler flags for nvcc",
StrParam(""))
AddConfigVar('cuda.root',
"The directory with bin/, lib/, include/ for cuda utilities. Used to put this directory of nvidia lib in the compiled libraire. Usefull when people forget to update there LD_LIBRARY_PATH and LIBRARY_PATH environment variable. If AUTO, if nvcc is in the path, it will use one of this parent directory. Otherwise /usr/local/cuda. If empty, won't appen the directory in the compiled library",
StrParam(os.getenv('CUDA_ROOT', "AUTO")))
def error(*args): def error(*args):
#sys.stderr.write('ERROR:'+ ' '.join(str(a) for a in args)+'\n') #sys.stderr.write('ERROR:'+ ' '.join(str(a) for a in args)+'\n')
...@@ -35,6 +22,18 @@ def debug(*args): ...@@ -35,6 +22,18 @@ def debug(*args):
#sys.stderr.write('DEBUG:'+ ' '.join(str(a) for a in args)+'\n') #sys.stderr.write('DEBUG:'+ ' '.join(str(a) for a in args)+'\n')
_logger.debug("DEBUG: "+' '.join(str(a) for a in args)) _logger.debug("DEBUG: "+' '.join(str(a) for a in args))
AddConfigVar('nvcc.compiler_bindir',
"If defined, nvcc compiler driver will seek g++ and gcc in this directory",
StrParam(""))
AddConfigVar('nvcc.flags',
"Extra compiler flags for nvcc",
StrParam(""))
AddConfigVar('nvcc.fastmath',
"",
BoolParam(False))
nvcc_path = 'nvcc' nvcc_path = 'nvcc'
nvcc_version = None nvcc_version = None
def is_nvcc_available(): def is_nvcc_available():
...@@ -66,11 +65,6 @@ def set_cuda_root(): ...@@ -66,11 +65,6 @@ def set_cuda_root():
config.cuda.root = os.path.split(dir)[0] config.cuda.root = os.path.split(dir)[0]
return return
if config.cuda.root == "AUTO":
set_cuda_root()
is_nvcc_available()#to set nvcc_path correctly and get the version
rpath_defaults = [] rpath_defaults = []
def add_standard_rpath(rpath): def add_standard_rpath(rpath):
rpath_defaults.append(rpath) rpath_defaults.append(rpath)
...@@ -183,11 +177,9 @@ def nvcc_module_compile_str( ...@@ -183,11 +177,9 @@ def nvcc_module_compile_str(
if sys.platform != 'darwin': if sys.platform != 'darwin':
# the 64bit CUDA libs are in the same files as are named by the function above # the 64bit CUDA libs are in the same files as are named by the function above
rpaths.append(os.path.join(config.cuda.root,'lib64')) rpaths.append(os.path.join(config.cuda.root,'lib64'))
for rpath in rpaths: for rpath in rpaths:
cmd.extend(['-Xlinker',','.join(['-rpath',rpath])]) cmd.extend(['-Xlinker',','.join(['-rpath',rpath])])
nvccflags = [flag for flag in config.cuda.nvccflags.split(' ') if flag] cmd.extend([flag for flag in config.nvcc.flags.split(' ') if flag])
cmd.extend(nvccflags)
cmd.extend('-I%s'%idir for idir in include_dirs) cmd.extend('-I%s'%idir for idir in include_dirs)
cmd.extend(['-o',lib_filename]) cmd.extend(['-o',lib_filename])
cmd.append(os.path.split(cppfilename)[-1]) cmd.append(os.path.split(cppfilename)[-1])
......
...@@ -270,6 +270,48 @@ def local_gpu_dot_to_dot22(node): ...@@ -270,6 +270,48 @@ def local_gpu_dot_to_dot22(node):
shape_out))] shape_out))]
return False return False
@register_opt()
@local_optimizer([])
def local_gpu_lazy_ifelse(node):
    """Move a lazy IfElse onto the GPU when its data already lives there.

        gpu_from_host(ifelse(c, t, f)) -> gpu_ifelse(c, gpu(t), gpu(f))
        ifelse(c, host_from_gpu(...), ...)
            -> host_from_gpu(gpu_ifelse(c, gpu(t), gpu(f)))

    Both branches are moved to the GPU; the condition always stays on
    the host.  (The previous docstring was copied by mistake from the
    dot22 optimization.)
    """
    import theano
    # theano.lazycond is an optional module: do nothing when absent
    if hasattr(theano, "lazycond"):
        gpu_ifelse = theano.lazycond.IfElse(gpu=True)
        if node.op == gpu_from_host:
            # the result of an ifelse is being shipped to the GPU:
            # compute the ifelse there instead
            host_input = node.inputs[0]
            if (host_input.owner
                    and host_input.owner.op == theano.lazycond.ifelse):
                c, t, f = host_input.owner.inputs
                if not isinstance(f.type, CudaNdarrayType):
                    f = gpu_from_host(f)
                if not isinstance(t.type, CudaNdarrayType):
                    t = gpu_from_host(t)
                if isinstance(c.type, CudaNdarrayType):
                    # the condition is a host-side decision
                    c = host_from_gpu(c)
                return [gpu_ifelse(c, t, f)]
        if node.op == theano.lazycond.ifelse:
            # some input already comes from the GPU: compute there and
            # ship only the result back to the host
            if any(i.owner and i.owner.op == host_from_gpu
                    for i in node.inputs):
                c, t, f = node.inputs
                if not isinstance(f.type, CudaNdarrayType):
                    f = gpu_from_host(f)
                if not isinstance(t.type, CudaNdarrayType):
                    t = gpu_from_host(t)
                if isinstance(c.type, CudaNdarrayType):
                    c = host_from_gpu(c)
                return [host_from_gpu(gpu_ifelse(c, t, f))]
    return False
@register_opt() @register_opt()
@local_optimizer([]) @local_optimizer([])
......
...@@ -567,7 +567,7 @@ class ScanMerge(gof.Optimizer): ...@@ -567,7 +567,7 @@ class ScanMerge(gof.Optimizer):
def apply(self, env): def apply(self, env):
nodelist = list(env.toposort()) nodelist = list(env.toposort())
cond_nodes = [ x for x in nodelist if x.op.__class__.__name__=='Cond'] cond_nodes = [ x for x in nodelist if x.op.__class__.__name__=='IfElse']
scan_nodes = [ x for x in nodelist if x.op.__class__.__name__=='Scan'] scan_nodes = [ x for x in nodelist if x.op.__class__.__name__=='Scan']
# Having lazy ifs in the graph complicates a bit things, and for # Having lazy ifs in the graph complicates a bit things, and for
......
...@@ -133,6 +133,79 @@ def sp_ones_like(x): ...@@ -133,6 +133,79 @@ def sp_ones_like(x):
data, indices, indptr, shape = csm_properties(x) #TODO: don't restrict to CSM formats data, indices, indptr, shape = csm_properties(x) #TODO: don't restrict to CSM formats
return CSM(format=x.format)(tensor.ones_like(data), indices, indptr, shape) return CSM(format=x.format)(tensor.ones_like(data), indices, indptr, shape)
class _sparse_py_operators:
T = property(lambda self: transpose(self), doc = "Return aliased transpose of self (read-only)")
def __neg__(self): return neg(self)
def __add__(left, right): return add(left, right)
def __radd__(right, left): return add(left, right)
def __sub__(left, right): return sub(left, right)
def __rsub__(right, left): return sub(left, right)
def __mul__(left, right): return mul(left, right)
def __rmul__(left, right): return mul(left, right)
#extra pseudo-operator symbols
def __dot__(left, right): return structured_dot(left, right)
def __rdot__(right, left): return structured_dot(left, right)
#N.B. THIS IS COMMENTED OUT ON PURPOSE!!!
# Discussion with Fred & James (at least, and maybe others before)
# we decided that casting from a sparse to dense should be explicit
# because it's usually something you want to be pretty careful about,
# and not to do by accident.
#def _as_TensorVariable(self):
# return dense_from_sparse(self)
shape = property(lambda self: tensor.shape(dense_from_sparse(self))) # don't worry!
# ... the plan is that the ShapeFeature in tensor.opt will do shape propagation
# ... and remove the dense_from_sparse from the graph. This will *NOT* actually expand
# ... your sparse matrix just to get the shape.
ndim = property(lambda self: self.type.ndim)
dtype = property(lambda self: self.type.dtype)
class SparseVariable(gof.Variable, _sparse_py_operators):
dtype = property(lambda self: self.type.dtype)
format = property(lambda self: self.type.format)
def __str__(self):
return '%s{%s,%s}'%(
self.__class__.__name__,
self.format,
self.dtype)
def __repr__(self):
return str(self)
class SparseConstantSignature(tuple):
def __eq__(self, other):
(a, b), (x,y) = self, other
return a == x \
and (b.dtype == y.dtype)\
and (type(b) == type(y))\
and (b.shape == y.shape)\
and (abs(b-y).sum() < 1e-6 * b.nnz)
def __hash__(self):
(a,b) = self
return hash(type(self)) ^ hash(a) ^ hash(type(b))
class SparseConstant(gof.Constant, _sparse_py_operators):
dtype = property(lambda self: self.type.dtype)
format = property(lambda self: self.type.format)
def signature(self):
assert self.data is not None
return SparseConstantSignature((self.type, self.data))
def __str__(self):
return '%s{%s,%s,shape=%s,nnz=%s}'%(
self.__class__.__name__,
self.format,
self.dtype,
self.data.shape,
self.data.nnz)
def __repr__(self):
return str(self)
class SparseValue(gof.Value, _sparse_py_operators):
dtype = property(lambda self: self.type.dtype)
format = property(lambda self: self.type.format)
class SparseType(gof.Type): class SparseType(gof.Type):
""" """
...@@ -149,6 +222,9 @@ class SparseType(gof.Type): ...@@ -149,6 +222,9 @@ class SparseType(gof.Type):
dtype_set = set(['int', 'int8', 'int16','int32', 'int64', 'float32', 'float64', 'complex64','complex128']) dtype_set = set(['int', 'int8', 'int16','int32', 'int64', 'float32', 'float64', 'complex64','complex128'])
ndim = 2 ndim = 2
Variable = SparseVariable
Constant = SparseConstant
def __init__(self, format, dtype): def __init__(self, format, dtype):
""" """
Fundamental way to create a sparse node. Fundamental way to create a sparse node.
...@@ -248,65 +324,6 @@ csr_dmatrix = SparseType(format='csr', dtype='float64') ...@@ -248,65 +324,6 @@ csr_dmatrix = SparseType(format='csr', dtype='float64')
csc_fmatrix = SparseType(format='csc', dtype='float32') csc_fmatrix = SparseType(format='csc', dtype='float32')
csr_fmatrix = SparseType(format='csr', dtype='float32') csr_fmatrix = SparseType(format='csr', dtype='float32')
class _sparse_py_operators:
T = property(lambda self: transpose(self), doc = "Return aliased transpose of self (read-only)")
def __neg__(self): return neg(self)
def __add__(left, right): return add(left, right)
def __radd__(right, left): return add(left, right)
def __sub__(left, right): return sub(left, right)
def __rsub__(right, left): return sub(left, right)
def __mul__(left, right): return mul(left, right)
def __rmul__(left, right): return mul(left, right)
#extra pseudo-operator symbols
def __dot__(left, right): return structured_dot(left, right)
def __rdot__(right, left): return structured_dot(left, right)
#N.B. THIS IS COMMENTED OUT ON PURPOSE!!!
# Discussion with Fred & James (at least, and maybe others before)
# we decided that casting from a sparse to dense should be explicit
# because it's usually something you want to be pretty careful about,
# and not to do by accident.
#def _as_TensorVariable(self):
# return dense_from_sparse(self)
shape = property(lambda self: tensor.shape(dense_from_sparse(self))) # don't worry!
# ... the plan is that the ShapeFeature in tensor.opt will do shape propagation
# ... and remove the dense_from_sparse from the graph. This will *NOT* actually expand
# ... your sparse matrix just to get the shape.
ndim = property(lambda self: self.type.ndim)
dtype = property(lambda self: self.type.dtype)
class SparseVariable(gof.Variable, _sparse_py_operators):
dtype = property(lambda self: self.type.dtype)
format = property(lambda self: self.type.format)
class SparseConstantSignature(tuple):
    """Hashable signature for a sparse constant: a pair ``(type, data)``.

    Two signatures compare equal when the first elements compare equal and
    the two sparse matrices share dtype, concrete class, and shape, and
    their contents differ by at most a small tolerance.
    """

    def __eq__(self, other):
        (a, b), (x, y) = self, other
        # Cheap structural checks first; they also guard the subtraction
        # below against shape mismatches.
        if not (a == x
                and b.dtype == y.dtype
                and type(b) == type(y)
                and b.shape == y.shape):
            return False
        # Content comparison with a tolerance scaled by the number of
        # stored elements.  Use <= (not <) so that two matrices with no
        # stored elements (nnz == 0, hence a zero tolerance) still compare
        # equal when their difference is exactly zero.
        return abs(b - y).sum() <= 1e-6 * b.nnz

    def __hash__(self):
        # Hash only the type and the data's class; __eq__ performs the
        # expensive content comparison.
        (a, b) = self
        return hash(type(self)) ^ hash(a) ^ hash(type(b))
class SparseConstant(gof.Constant, _sparse_py_operators):
    """Graph constant wrapping a concrete sparse matrix value."""

    @property
    def dtype(self):
        return self.type.dtype

    @property
    def format(self):
        return self.type.format

    def signature(self):
        """Return a SparseConstantSignature built from (self.type, self.data)."""
        assert self.data is not None
        return SparseConstantSignature((self.type, self.data))
class SparseValue(gof.Value, _sparse_py_operators):
    """Graph value container for a sparse matrix (non-constant)."""

    @property
    def dtype(self):
        return self.type.dtype

    @property
    def format(self):
        return self.type.format
# CONSTRUCTION # CONSTRUCTION
class CSMProperties(gof.Op): class CSMProperties(gof.Op):
"""Extract all of .data .indices and .indptr""" """Extract all of .data .indices and .indptr"""
......
...@@ -937,6 +937,9 @@ def _gemm_from_node2(node): ...@@ -937,6 +937,9 @@ def _gemm_from_node2(node):
lst = _factor_canonicalized(lst) lst = _factor_canonicalized(lst)
rval = _gemm_from_factored_list(lst) rval = _gemm_from_factored_list(lst)
#print "RVAL", rval #print "RVAL", rval
# THIS GOT COMMENTED OUT AT SOME POINT - ASK P.Lamblin maybe why?
#if rval:
# assert rval[0].type == node.outputs[0].type, (rval[0].type, node.outputs[0].type)
if rval and (rval[0].type == node.outputs[0].type): if rval and (rval[0].type == node.outputs[0].type):
return rval return rval
......
...@@ -3057,30 +3057,33 @@ def constant_folding(node): ...@@ -3057,30 +3057,33 @@ def constant_folding(node):
for input in node.inputs: for input in node.inputs:
if not isinstance(input, Constant): if not isinstance(input, Constant):
return False return False
try: #condition: all inputs are constant
storage = [[None] for output in node.outputs]
node.op.perform(node, [x.data for x in node.inputs], storage) storage_map=dict([(i,[i.data]) for i in node.inputs])
except MethodNotDefined: compute_map=dict([(i,[True]) for i in node.inputs])
tmp_inputs = [x.type() for x in node.inputs] for o in node.outputs:
f = compile.function( storage_map[o] = [None]
inputs=tmp_inputs, compute_map[o] = [False]
outputs=node.op.make_node(*tmp_inputs).outputs,
mode=compile.Mode(linker='c|py',optimizer=None)) thunk = node.op.make_thunk(node, storage_map, compute_map,
xvals = f(*[x.data for x in node.inputs]) no_recycling=[])
storage = [[xv] for xv in xvals]
required = thunk()
msg = [] assert not required # a node whose inputs are all provided should always
assert len(storage) == len(node.outputs) # return successfully
for s, output in zip(storage, node.outputs):
rval = []
for output in node.outputs:
assert compute_map[output][0], (output, storage_map[output][0])
try: try:
constant = output.type.Constant constant = output.type.Constant
except: except AttributeError:
constant = Constant constant = Constant
msg += [constant(output.type, s[0])] rval.append(constant(output.type, storage_map[output][0]))
return msg return rval
register_canonicalize(constant_folding, 'fast_compile') register_canonicalize(constant_folding, 'fast_compile')
register_stabilize(constant_folding) # because register_stabilize(constant_folding)
register_specialize(constant_folding) register_specialize(constant_folding)
def _is_1(expr): def _is_1(expr):
......
...@@ -20,7 +20,7 @@ def test_no_reuse(): ...@@ -20,7 +20,7 @@ def test_no_reuse():
return return
assert not 'should not get here' assert not 'should not get here'
def test_gc(): def test_gc_never_pickles_temporaries():
x = T.dvector() x = T.dvector()
#print >> sys.stderr, 'BUILDING GRAPH' #print >> sys.stderr, 'BUILDING GRAPH'
...@@ -32,32 +32,63 @@ def test_gc(): ...@@ -32,32 +32,63 @@ def test_gc():
optimizer=None optimizer=None
optimizer='fast_run' optimizer='fast_run'
for f_linker, g_linker in [ for f_linker, g_linker in [
(theano.PerformLinker(allow_gc = True), theano.PerformLinker(allow_gc=False)), (theano.PerformLinker(allow_gc = True), theano.PerformLinker(allow_gc=False)),
(theano.OpWiseCLinker(allow_gc = True), theano.OpWiseCLinker(allow_gc=False))]: (theano.OpWiseCLinker(allow_gc = True), theano.OpWiseCLinker(allow_gc=False))]:
#f_linker has garbage collection
#g_linker has no garbage collection
#print >> sys.stderr, 'COMPILING' #print >> sys.stderr, 'COMPILING'
f = theano.function([x], r,mode=theano.Mode(optimizer=optimizer, linker=f_linker)) f = theano.function([x], r,mode=theano.Mode(optimizer=optimizer, linker=f_linker))
g = theano.function([x], r,mode=theano.Mode(optimizer=optimizer, linker=g_linker))
len_pre_f = len(cPickle.dumps(f))
len_pre_g = len(cPickle.dumps(g))
# should be no difference at first
# In future, FunctionMaker might pickle linker-dependent stuff and make
# this assertion fail.
assert len_pre_f == len_pre_g
def a(fn):
return len(cPickle.dumps(fn.maker))
assert a(f) == a(f) # some sanity checks on the pickling mechanism
assert a(g) == a(g) # some sanity checks on the pickling mechanism
g = theano.function([x], r,mode=theano.Mode(optimizer=optimizer, linker=f_linker)) def b(fn):
return len(
cPickle.dumps(
theano.compile.function_module._pickle_Function(
fn)))
assert b(f) == b(f) # some sanity checks on the pickling mechanism
pre_f = cPickle.dumps(f) def c(fn):
pre_g = cPickle.dumps(g) return len(cPickle.dumps(fn))
assert c(f) == c(f) # some sanity checks on the pickling mechanism
assert c(g) == c(g) # some sanity checks on the pickling mechanism
#print >> sys.stderr, 'RUNNING'
# now run the function once to create temporaries within the no-gc
# linker
f(numpy.ones(100, dtype='float64')) f(numpy.ones(100, dtype='float64'))
g(numpy.ones(100, dtype='float64')) g(numpy.ones(100, dtype='float64'))
# serialize the functions again
post_f = cPickle.dumps(f) post_f = cPickle.dumps(f)
post_g = cPickle.dumps(g) post_g = cPickle.dumps(g)
#because allow_gc should leave the function un-changed by calling
assert len(pre_f) == len(post_f)
#because temporaries that weren't collected shouldn't be pickled anyway
len_post_f = len(post_f) len_post_f = len(post_f)
len_post_g = len(post_g) len_post_g = len(post_g)
assert len_post_f == len_post_g
#assert that f() didn't cause the function to grow
# allow_gc should leave the function un-changed by calling
assert len_pre_f == len_post_f
#assert that g() didn't cause g to grow
# because temporaries that weren't collected shouldn't be pickled anyway
assert len_post_f == len_post_g, (f_linker, len_post_f, len_post_g)
def test_merge_opt_runtime(): def test_merge_opt_runtime():
......
...@@ -49,11 +49,14 @@ class T_random_function(unittest.TestCase): ...@@ -49,11 +49,14 @@ class T_random_function(unittest.TestCase):
rng_R = random_state_type() rng_R = random_state_type()
# use make_node to override some of the self.args # use make_node to override some of the self.args
post_r2, out2 = rf2(rng_R, (4,), -2, 2) post_r2, out2 = rf2(rng_R, (4,), -2, 2) # NOT INPLACE
post_r2_4, out2_4 = rf2(rng_R, (4,), -4.0, 2) post_r4, out4 = rf4(rng_R, (4,), -4, 4) # INPLACE
post_r2_4_4, out2_4_4 = rf2(rng_R, (4,), -4.0, 4.0) post_r2_4, out2_4 = rf2(rng_R, (4,), -4.0, 2) # NOT INPLACE
post_r4, out4 = rf4(rng_R, (4,), -4, 4) post_r2_4_4, out2_4_4 = rf2(rng_R, (4,), -4.0, 4.0) # NOT INPLACE
# configure out4 to be computed inplace
# The update expression means that the random state rng_R will
# be maintained by post_r4
f = compile.function( f = compile.function(
[compile.In(rng_R, [compile.In(rng_R,
value=numpy.random.RandomState(utt.fetch_seed()), value=numpy.random.RandomState(utt.fetch_seed()),
...@@ -65,9 +68,25 @@ class T_random_function(unittest.TestCase): ...@@ -65,9 +68,25 @@ class T_random_function(unittest.TestCase):
f2, f4, f2_4, f2_4_4 = f() f2, f4, f2_4, f2_4_4 = f()
f2b, f4b, f2_4b, f2_4_4b = f() f2b, f4b, f2_4b, f2_4_4b = f()
assert numpy.allclose(f2*2, f4) print f2
assert numpy.allclose(f2_4_4, f4) print f4
assert not numpy.allclose(f4, f4b) print f2_4
print f2_4_4
#print f2b
#print f4b
#print f2_4b
#print f2_4_4b
# setting bounds is same as multiplying by 2
assert numpy.allclose(f2*2, f4), (f2, f4)
# retrieving from non-inplace generator
# is same as inplace one for first call
assert numpy.allclose(f2_4_4, f4), (f2_4_4, f4)
# f4 changes from call to call, that the update has worked
assert not numpy.allclose(f4, f4b), (f4, f4b)
def test_inplace_optimization(self): def test_inplace_optimization(self):
"""Test that FAST_RUN includes the random_make_inplace optimization""" """Test that FAST_RUN includes the random_make_inplace optimization"""
......
...@@ -13,19 +13,32 @@ from theano.tests import unittest_tools as utt ...@@ -13,19 +13,32 @@ from theano.tests import unittest_tools as utt
should ensure that it will remain operational should ensure that it will remain operational
''' '''
class T_diverse(unittest.TestCase): class T_scipy(unittest.TestCase):
def setUp(self): def setUp(self):
utt.seed_rng() utt.seed_rng()
self.orig_floatX = theano.config.floatX
def tearDown(self):
theano.config.floatX = self.orig_floatX
def scipy_paper_example1(self): def test_scipy_paper_example1(self):
a = theano.tensor.vector('a') # declare variable a = theano.tensor.vector('a') # declare variable
b = a + a**10 # build expression b = a + a**10 # build expression
f = theano.function([a], b) # compile function f = theano.function([a], b) # compile function
assert numpy.all(f([0,1,2]) == numpy.array([0,2,1026])) assert numpy.all(f([0,1,2]) == numpy.array([0,2,1026]))
def scipy_papaer_example2(self): def test_scipy_paper_example2(self):
''' This just sees if things compile well and if they run ''' ''' This just sees if things compile well and if they run '''
# PREAMPBLE
T = theano.tensor
shared = theano.shared
function = theano.function
rng = numpy.random
theano.config.floatX='float64'
#
# ACTUAL SCRIPT FROM PAPER
x = T.matrix() x = T.matrix()
y = T.vector() y = T.vector()
w = shared(rng.randn(100)) w = shared(rng.randn(100))
...@@ -52,6 +65,7 @@ class T_diverse(unittest.TestCase): ...@@ -52,6 +65,7 @@ class T_diverse(unittest.TestCase):
for i in range(training_steps): for i in range(training_steps):
pred, err = train(D[0], D[1]) pred, err = train(D[0], D[1])
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论