提交 95a48402 authored 作者: Razvan Pascanu's avatar Razvan Pascanu

Added gradient computation to scan

上级 8d1929c2
...@@ -37,8 +37,9 @@ import gradient ...@@ -37,8 +37,9 @@ import gradient
from gof.python25 import all from gof.python25 import all
import copy import copy
import tensor.elemwise as elemwise import tensor.elemwise as elemwise
import printing
import numpy import numpy
import theano
# Logging function for sending warning or info # Logging function for sending warning or info
import logging import logging
...@@ -49,8 +50,7 @@ def info(*msg): ...@@ -49,8 +50,7 @@ def info(*msg):
_logger.info('INFO theano.scan: '+' '.join(msg)) _logger.info('INFO theano.scan: '+' '.join(msg))
# Hashing a dictionary or a list or a tuple or any type that is hashable with # Hashing a dictionary/list/tuple by going and hasing each element
# the hash() function
def hash_listsDictsTuples(x): def hash_listsDictsTuples(x):
hash_value = 0 hash_value = 0
if type(x) == dict : if type(x) == dict :
...@@ -133,6 +133,7 @@ def reduce(fn, sequences, outputs_info, non_sequences = [], go_backwards = False ...@@ -133,6 +133,7 @@ def reduce(fn, sequences, outputs_info, non_sequences = [], go_backwards = False
outs_info[i] = dict(initial = out_info, return_steps = 1) outs_info[i] = dict(initial = out_info, return_steps = 1)
else: else:
# we tell scan to store only the last step # we tell scan to store only the last step
# this will implicitly tell scan to also return just that
outs_info[i]['store_steps'] = 1 outs_info[i]['store_steps'] = 1
# NOTE : Maybe some errors can be detected here and # NOTE : Maybe some errors can be detected here and
# we could give more meaningfull error messages then in scan ? # we could give more meaningfull error messages then in scan ?
...@@ -236,17 +237,16 @@ def scan(fn, sequences=[], outputs_info=[], non_sequences=[], ...@@ -236,17 +237,16 @@ def scan(fn, sequences=[], outputs_info=[], non_sequences=[],
* all time slices of the second otuput (as given in the * all time slices of the second otuput (as given in the
``initial_state`` list) ordered in the same fashion as the time taps provided ``initial_state`` list) ordered in the same fashion as the time taps provided
* ... * ...
* all other parameters over which scan doesn't iterate given * all other parameters over which scan doesn't iterate ordered accordingly
in the same order as in ``non_sequences`` If you are using shared If you are using shared variables over which you do not want to iterate,
variables over which you do not want to iterate, you do not need to you do not need to provide them as arguments to ``fn``, though you can if you
provide them as arguments to ``fn``, though you can if you wish so. The wish so. The function should return the outputs after each step plus the updates for
function should return the outputs after each step plus the updates for
any of the shared variables. You can either return only outputs or only any of the shared variables. You can either return only outputs or only
updates. If you have both outputs and updates the function should return updates. If you have both outputs and updates the function should return
them as a tuple : (outputs, updates) or (updates, outputs). them as a tuple : (outputs, updates) or (updates, outputs).
Outputs can be just a theano expression if you have only one outputs or Outputs can be just a theano expression if you have only one output or
a list of theano expressions. Updates can be given either as a list of tuples or a list of theano expressions. Updates can be given either as a list of tuples or
as a dictionary. If you have a list of outputs, the order of these as a dictionary. If you have a list of outputs, the order of these
should match that of their ``initial_states``. should match that of their ``initial_states``.
...@@ -283,10 +283,10 @@ def scan(fn, sequences=[], outputs_info=[], non_sequences=[], ...@@ -283,10 +283,10 @@ def scan(fn, sequences=[], outputs_info=[], non_sequences=[],
* ``inplace`` -- theano variable pointing to one of the input sequences; this * ``inplace`` -- theano variable pointing to one of the input sequences; this
flag tells scan that the output should be computed in the memory spaced occupied flag tells scan that the output should be computed in the memory spaced occupied
by that input sequence. Note that scan will only do this if allowed by the by that input sequence. Note that scan will only do this if allowed by the
rest of your computational graph. rest of your computational graph and if you are not using past taps of the input.
* ``return_steps`` how many steps to return from your output. If not given, or * ``return_steps`` how many steps to return from your output. If not given, or
0 scan will return all steps, otherwise it will return the last ``return_steps``. 0 scan will return all steps, otherwise it will return the last ``return_steps``.
Note that if you set this to something else then 0, scan will always be smart Note that if you set this to something else then 0, scan will try to be smart
about the amount of memory it allocates for a given input. about the amount of memory it allocates for a given input.
If the function applied recursively uses only the If the function applied recursively uses only the
...@@ -308,16 +308,16 @@ def scan(fn, sequences=[], outputs_info=[], non_sequences=[], ...@@ -308,16 +308,16 @@ def scan(fn, sequences=[], outputs_info=[], non_sequences=[],
* if you do not wrap an output in a dictionary, scan will wrap it for you * if you do not wrap an output in a dictionary, scan will wrap it for you
assuming that you use only the last step of the output ( i.e. it makes your tap assuming that you use only the last step of the output ( i.e. it makes your tap
value list equal to [-1]) and that it is not computed inplace value list equal to [-1]) and that it is not computed inplace
* if you wrap an output in a dictionary but you do not provide any taps, but * if you wrap an output in a dictionary and you do not provide any taps but
you provide an initial state it will assume that you are using only a tap value you provide an initial state it will assume that you are using only a tap value
of -1 of -1
* if you wrap an output in a dictionary but you do not provide any initial state, * if you wrap an output in a dictionary but you do not provide any initial state,
it assumes that you are not using any form of taps it assumes that you are not using any form of taps
* if you provide a ``None`` scan assumes that you will not use any taps for this * if you provide a ``None`` scan assumes that you will not use any taps for this
output output (this would be the case for map )
If you did not provide any information for your outputs, scan will assume by default If you did not provide any information for your outputs, scan will assume by default
that you are not using any taps for any of the input. If you provide information for that you are not using any taps for any of the outputs. If you provide information for
just a subset of outputs, scan will not know to which outputs these information just a subset of outputs, scan will not know to which outputs these information
corresponds and will raise an error. corresponds and will raise an error.
...@@ -332,7 +332,7 @@ def scan(fn, sequences=[], outputs_info=[], non_sequences=[], ...@@ -332,7 +332,7 @@ def scan(fn, sequences=[], outputs_info=[], non_sequences=[],
the input sequences. If the value is 0, the outputs will have 0 rows. If the the input sequences. If the value is 0, the outputs will have 0 rows. If the
value is negative, scan will run backwards (or if the flag go_backwards is value is negative, scan will run backwards (or if the flag go_backwards is
already set to true it will run forward in time). If n_steps is not provided, already set to true it will run forward in time). If n_steps is not provided,
or evaluetes not None, scan will figure out the maximal amount of steps it can or evaluetes to None, inf or nan, scan will figure out the maximal amount of steps it can
take and do that. take and do that.
:param truncate_gradient: :param truncate_gradient:
...@@ -346,17 +346,18 @@ def scan(fn, sequences=[], outputs_info=[], non_sequences=[], ...@@ -346,17 +346,18 @@ def scan(fn, sequences=[], outputs_info=[], non_sequences=[],
Flag indicating if you should go backwards through the sequences Flag indicating if you should go backwards through the sequences
:param name: :param name:
The name of the theano fct compiled by the Scan op. It will show in the The name of the theano function compiled by the Scan op. It will show in the
profiler output. profiler output.
:param mode: :param mode:
The mode used when compiling the theano fct in the Scan op. The mode used when compiling the theano function in the Scan op.
If None will use the config mode. If None will use the config mode.
If None and the config mode is a a profile mode, we will create a new instance If None and the config mode is a a profile mode, we will create a new instance
to compute correctly the timming. to compute correctly the timming.
Otherwise we the time of the Scan op will show into the Scan op and the Otherwise the time spend in Scan will show up twice in the profiling, once
time spent inside the Scan op fct will also show op. The new profiler instance as the time taken by scan, and a second time as taken by the individial ops
will be printed when python exit. that scan calls to do a iteration step.
The new profiler instance will be printed when python exits.
:rtype: tuple :rtype: tuple
:return: tuple of the form (outputs, updates); ``outputs`` is either a :return: tuple of the form (outputs, updates); ``outputs`` is either a
...@@ -366,6 +367,10 @@ def scan(fn, sequences=[], outputs_info=[], non_sequences=[], ...@@ -366,6 +367,10 @@ def scan(fn, sequences=[], outputs_info=[], non_sequences=[],
operation; this dictionary should be pass to ``theano.function`` operation; this dictionary should be pass to ``theano.function``
""" """
# General observation : this code is executed only once, at creation
# of the computational graph, so we don't yet need to be smart about
# anything ( to speed things up)
# check if inputs are just single variables instead of lists # check if inputs are just single variables instead of lists
if not (type(sequences) in (list, tuple)): if not (type(sequences) in (list, tuple)):
seqs = [sequences] seqs = [sequences]
...@@ -383,25 +388,70 @@ def scan(fn, sequences=[], outputs_info=[], non_sequences=[], ...@@ -383,25 +388,70 @@ def scan(fn, sequences=[], outputs_info=[], non_sequences=[],
non_seqs = non_sequences non_seqs = non_sequences
# If we provided a known number of steps ( before compilation)
# and if that number is 1 or -1, then we can skip the Scan Op,
# and just apply the inner function once
# To do that we check here to see the nature of n_steps
if type(n_steps) in (float,int):
n_fixed_steps = int(n_steps)
else:
# also check if this value happens to be a constant,
# then we could do the same
try :
n_fixed_steps = opt.get_constant_value(n_steps)
except:
n_fixed_steps = None
# compute number of sequences and number of outputs # compute number of sequences and number of outputs
n_seqs = len(seqs) n_seqs = len(seqs)
n_outs = len(outs_info) n_outs = len(outs_info)
# initialize the inplace map, sequences map and
# outputs map
''' Details:
The scan op identifies different properties attached
to input tensors by their order in the input list.
These maps ( inplace, sequence_taps, output_taps,
store_steps, return_steps) go from the index of an input to
its properties. Note that inputs are always first, followed
by outputs. Since we always know the number of inputs we
index the outputs from 0 ( so sometimes you will need to
do something like outputs_taps[i-n_ins]
'''
inplace_map = {} inplace_map = {}
sequences_taps = {} sequences_taps = {}
outputs_taps = {} outputs_taps = {}
# wrap sequences in a dictionary if they are not already
# Assume that for any output we want to store everythin that it produces
store_steps = []
return_steps = {}
# wrap sequences in a dictionary if they are not already dictionaries
# in the same pass create a sequences_taps dictionary # in the same pass create a sequences_taps dictionary
for i in xrange(n_seqs): for i in xrange(n_seqs):
if not type(seqs[i]) == dict : if not type(seqs[i]) == dict :
# if it is not a dictionary make it into one
seqs[i] = dict(input=seqs[i], taps=[0]) seqs[i] = dict(input=seqs[i], taps=[0])
# see if taps values are provided as a list # see if taps values are provided as a list
elif seqs[i].get('taps',None): elif seqs[i].get('taps',None):
# users can optionally provide the past value (if is just
# one) as a number instead of a list. Wrap it in a list
# to have a uniform way of dealing with inputs later on
if not type(seqs[i]['taps']) in (tuple,list): if not type(seqs[i]['taps']) in (tuple,list):
seqs[i]['taps'] = [seqs[i]['taps']] seqs[i]['taps'] = [seqs[i]['taps']]
else: else:
# See if the user actually provided the None value to taps,
# which would indicate that the sequence was provided but
# not used by the internal function; Only if the user has
# not provided anything add the defaul [0]
# Possible reason to provide a squence and not use it is
# if you want to compute the output
# inplace of this input; it is a very unlikely behaviour but
# we do want to cover it for completeness
if not seqs[i].has_key('taps'):
seqs[i][taps] = [0] seqs[i][taps] = [0]
# Now that our input is well behaved, collect the taps in the
# sequences_taps map that we will use later in the body of scan
# since inputs will be just tensors there
if seqs[i].get('taps',None): if seqs[i].get('taps',None):
sequences_taps[i] = seqs[i]['taps'] sequences_taps[i] = seqs[i]['taps']
...@@ -409,21 +459,58 @@ def scan(fn, sequences=[], outputs_info=[], non_sequences=[], ...@@ -409,21 +459,58 @@ def scan(fn, sequences=[], outputs_info=[], non_sequences=[],
# in the same pass create a init_outs_taps dictionary and a inplace map # in the same pass create a init_outs_taps dictionary and a inplace map
for i in xrange(n_outs): for i in xrange(n_outs):
if outs_info[i]: if outs_info[i]:
# If output is a dictionary, collect the number of steps the
# user would like scan to return
if type(outs_info[i]) == dict:
if outs_info[i].get('return_steps', None):
return_steps[i] = outs_info[i]['return_steps']
# If you provide the number of steps to store internally,
# (not advocated in the user documentation), then also
# make sure you are returning only those number of steps
if outs_info[i].get('store_steps', None):
store_steps += [outs_info[i].get('store_steps',None)]
return_steps[i] = outs_info[i].get('store_steps',None)
else:
store_steps += [0]
else:
store_steps += [0]
# trying to collect taps of the output
if not type(outs_info[i]) == dict: if not type(outs_info[i]) == dict:
# by default any output has a tap value of -1
outs_info[i] = dict(initial=outs_info[i], taps = [-1]) outs_info[i] = dict(initial=outs_info[i], taps = [-1])
# if there is no initial state but there are taps # if there is no initial state but there are taps
# then return an error because it makes no sense
elif (not outs_info[i].get('initial',None)) and(outs_info[i].get('taps',None)): elif (not outs_info[i].get('initial',None)) and(outs_info[i].get('taps',None)):
raise ValueError('If you are using slices of an output you need to '\ raise ValueError('If you are using slices of an output you need to '\
'provide a initial state for it', outs_info[i]) 'provide a initial state for it', outs_info[i])
elif outs_info[i].get('initial',None) and (not outs_info[i].get('taps',None)): # if there is an intial state but no tap, we will add the default value for
# taps, namely [-1] ( previous value); not that this will happen even though
# you have provided for taps the value None, which is a bit strange (why would
# one provide an initial state but tell scan not to use it ? ), just that
# in that case we will throw in a warning message pointing out this inconsistency
elif outs_info[i].get('initial',None) and ( not outs_info[i].get('taps',None)):
if outs_info[i].has_key('taps'):
warning('You are providing a initial state for an output, but yet tell scan'
'not to use it. Why? Scan will overwrite this setting and use the previous'
'value of the provided initial state. If this is not what you wanted, check'
'your code and do not provide the initial state')
outs_info[i]['taps'] = [-1] outs_info[i]['taps'] = [-1]
else: else:
# if the output is a None then replace it with an empty dictionary for easing
# up dealing with this case later one ( we can directly call .has_key and things
# like this
outs_info[i] = dict() outs_info[i] = dict()
store_steps += [0]
if outs_info[i].get('taps', None): if outs_info[i].get('taps', None):
# Create a separate outputs_taps dictionary with all the outputs taps; This
# is how the Scan Op expects this information, separeted from the variables
outputs_taps[i] = outs_info[i]['taps'] outputs_taps[i] = outs_info[i]['taps']
if outs_info[i].get('inplace', None): if outs_info[i].get('inplace', None):
# look for that variable to get the index # The same is true for the inplace info; it has to go into a separate dictionary
# based on index; Note that the input we're replacing should also come as an
# index, therefore we have to look for it here
found = None found = None
for k in xrange(n_seqs): for k in xrange(n_seqs):
if seqs[k].get('input', None) == outs_info[i].get('inplace',None): if seqs[k].get('input', None) == outs_info[i].get('inplace',None):
...@@ -440,52 +527,108 @@ def scan(fn, sequences=[], outputs_info=[], non_sequences=[], ...@@ -440,52 +527,108 @@ def scan(fn, sequences=[], outputs_info=[], non_sequences=[],
# note : this is a first batch of possible inputs that will # note : this is a first batch of possible inputs that will
# be compiled in a dummy function; we used this dummy # be compiled in a dummy function; we used this dummy
# function to detect shared variables and their updates # function to detect shared variables and their updates
# and to construct a new list of possible inputs # and to construct a new and complete list of inputs and outputs
args = [] args = [] # list of arguments
dummy_notshared_ins = 0 dummy_notshared_ins = 0 # number of arguments corresponding to input sequences
dummy_notshared_init_outs = 0 dummy_notshared_init_outs = 0 # number of arguments corresponding to output sequences
slice_to_seqs = [] slice_to_seqs = [] # for each slice index of the corresponding input
# go through sequences picking up time slices as needed # go through sequences picking up time slices as needed
for i,seq in enumerate(seqs): for i,seq in enumerate(seqs):
# Note that you can have something like no taps for
# a sequence, though is highly unlikely in practice
if seq.get('taps', None): if seq.get('taps', None):
# go through the indicated slice
mintap = numpy.min(seq['taps'])
for k in seq['taps']: for k in seq['taps']:
# create one slice of the input
'''
Later on, if we decide not to use scan because we are going
for just one step, it makes things easier if we compute the
correct outputs here. This way we can use the output of the
lambda expression directly to replace the output of scan.
If not we need to use copies, that will be replaced at each
frame by the corresponding slice
'''
if n_fixed_steps not in [1,-1]:
nw_slice = seq['input'][0].type() nw_slice = seq['input'][0].type()
elif n_fixed_steps == 1:
nw_slice = seq['input'][k-mintap]
else:
nw_slice = seq['input'][-1+mintap-k]
# Add names to slices for debugging and pretty printing .. # Add names to slices for debugging and pretty printing ..
# that is if the input already has a name
if seq['input'].name: if seq['input'].name:
nw_slice.name = seq['input'].name + '[%d]'%seq['taps'][k] if seq['taps'][k] > 0:
nw_slice.name = seq['input'].name + '[t+%d]'%seq['taps'][k]
elif seq['taps'][k] == 0:
nw_slice.name = seq['input'].name + '[t]'
else:
nw_slice.name = seq['input'].name + '[t%d]'%seq['taps'][k]
args.append(nw_slice) args.append(nw_slice)
# Specify to whom this slice belongs
slice_to_seqs.append(i) slice_to_seqs.append(i)
# Any slice is not a shared variable, even though the sequence
# from where we pick the slices is shared, therefore we should
# increase the number of notshared inputs to the dummy function
# by the number of slices
dummy_notshared_ins += len(seq['taps']) dummy_notshared_ins += len(seq['taps'])
# go through outputs picking up time slices as needed # go through outputs picking up time slices as needed
for i,init_out in enumerate(outs_info): for i,init_out in enumerate(outs_info):
# Note that our convention dictates that if an output uses
# just the previous time step, as a initial state we will only provide
# a tensor of the same dimension as one time step; This makes code
# much cleaner for those who do not use taps. Otherwise they would
# always had to shape_pad_left the initial state .. which is ugly
if init_out.get('taps', None) == [-1]: if init_out.get('taps', None) == [-1]:
if n_fixed_steps in [-1,1]:
args += [init_out['initial']]
else:
args += [init_out['initial'].type()] args += [init_out['initial'].type()]
# Added name to slices for debugging and pretty printing # Added name to slices for debugging and pretty printing
if init_out['initial'].name: if init_out['initial'].name:
args[-1].name = init_out['initial'].name+'[-1]' args[-1].name = init_out['initial'].name+'[t-1]'
if slice_to_seqs: # we need to specify in slice_seqs to which output this
val = slice_to_seqs[-1] # slice belongs; Because we might get confused afterwards
else: # if a number is an index of a sequence or an output, and
val = -1 # because we do not want to create yet another list, we will
slice_to_seqs += [ val+1 ] # add the number of sequences + the current output. This makes
# decoding easy and spares us from writing a lot of lines
slice_to_seqs += [ i+n_seqs ]
dummy_notshared_init_outs += 1 dummy_notshared_init_outs += 1
elif init_out.get('taps',None): elif init_out.get('taps',None):
if numpy.any(numpy.array(init_out.get('taps',[])) > 0): if numpy.any(numpy.array(init_out.get('taps',[])) > 0):
# Make sure we do not have requests for future values of a sequence
# we can not provide such values
raise ValueError('Can not use future taps of outputs', init_out) raise ValueError('Can not use future taps of outputs', init_out)
if slice_to_seqs: # go through the taps
val = slice_to_seqs[-1] minstep = abs(numpy.min(init_out['taps']))
else:
val = -1
for k in init_out['taps']: for k in init_out['taps']:
# create a new slice
if n_fixed_steps in [1,-1]:
nw_slice = init_out['initial'][k+minstep]
else:
nw_slice = init_out['initial'][0].type() nw_slice = init_out['initial'][0].type()
# give it a name or debugging and pretty printing
if init_out['initial'].name: if init_out['initial'].name:
nw_slice.name = init_out['initial'].name + '[%d]'%init_out['taps'][k] if k > 0:
nw_slice.name = init_out['initial'].name + '[t+%d]'%k
elif k == 0:
nw_slice.name = init_out['initial'].name + '[t]'
else:
nw_slice.name = init_out['initial'].name + '[t%d]'%k
args.append(nw_slice) args.append(nw_slice)
slice_to_seqs.append(val+1) # indicate the output index + n_seqs ( see above why)
slice_to_seqs.append(i + n_seqs)
# add as many slices as there are taps
dummy_notshared_init_outs += len(init_out['taps']) dummy_notshared_init_outs += len(init_out['taps'])
#NOTE: there is another case, in which we do not want to provide any previous
# value of the output to the inner case; in this case we do not have to do
# anything ..
# remove shared variables from the non sequences list # remove shared variables from the non sequences list
# such that we can compile the function ( the user has the option to add them when writing
# scan, because in some situations this might make the code more readable)
notshared_other_args = [] notshared_other_args = []
for non_seq in non_seqs: for non_seq in non_seqs:
if not isinstance(non_seq, SharedVariable): if not isinstance(non_seq, SharedVariable):
...@@ -493,12 +636,22 @@ def scan(fn, sequences=[], outputs_info=[], non_sequences=[], ...@@ -493,12 +636,22 @@ def scan(fn, sequences=[], outputs_info=[], non_sequences=[],
# add only the not shared variables to the arguments of the dummy # add only the not shared variables to the arguments of the dummy
# function [ a function should not get shared variables as input ] # function [ a function should not get shared variables as input ]
dummy_args = args + notshared_other_args dummy_args = []
for arg in args:
if not isinstance(arg, SharedVariable):
dummy_args += [arg]
dummy_args += notshared_other_args
# arguments for the lambda expression that gives us the output # arguments for the lambda expression that gives us the output
# of the inner function # of the inner function
args += non_seqs args += non_seqs
# when we apply the lambda expression we get a mixture of update rules
# and outputs that needs to be separated
outputs_updates = fn(*args) outputs_updates = fn(*args)
# The code that follows tries to be as flexible as possible allowing the
# user to return the output and updates in any order, and giving the updates
# however he wants ( as a dictionary or a list o pairs ..)
# Is there a way to compress all this by writing it in a more python/functional way?
outputs = [] outputs = []
updates = {} updates = {}
# we will try now to separate the outputs from the updates # we will try now to separate the outputs from the updates
...@@ -534,99 +687,139 @@ def scan(fn, sequences=[], outputs_info=[], non_sequences=[], ...@@ -534,99 +687,139 @@ def scan(fn, sequences=[], outputs_info=[], non_sequences=[],
else: else:
outputs = outputs_updates outputs = outputs_updates
updates = {} updates = {}
# in case you return a tuple .. convert it to a list (there are certain # in case you return a tuple .. convert it to a list (there are certain
# operation that are not permited on tuples, like element assignment) # operation that are not permited on tuples, like element assignment)
outputs = list(outputs) outputs = list(outputs)
# If you return numbers (highly unlikely) this will not go well for theano
# We need to convert them to Theano constants
for i,out in enumerate(outputs): for i,out in enumerate(outputs):
outputs[i] = tensor.as_tensor(out) outputs[i] = tensor.as_tensor(out)
# Wo compile a dummy function just to see what shared variable
# we have and what are their update rules
dummy_f = function(dummy_args, outputs, updates = updates, mode = \ # We can now compile a dummy function just to see what shared variable
# we have and what are their update rules (note that the user has
# the option not to pass the shared variable to scan, so we need to
# pick them manually and add them to scan)
# make the compilation as fast as possible by not applying any optimization
# or conversion to C [ note this region is not important for performance
# so we can do stuff as unoptimal as we wish ]
dummy_f = function(filter(lambda x: isinstance(x,gof.Variable) and \
not isinstance(x,SharedVariable) and not isinstance(x,gof.Constant), \
reversed(gof.graph.inputs(dummy_args))), outputs, updates = updates, mode = \
compile.mode.Mode(linker = 'py', optimizer = None) ) compile.mode.Mode(linker = 'py', optimizer = None) )
inner_fn_out_states = [ out.variable for out in dummy_f.maker.outputs] # We now look at what outputs our function returns
inner_fn_outs = [ out.variable for out in dummy_f.maker.outputs]
update_map = {} update_map = {}
shared_outs = [] shared_outs = []
shared_non_seqs = [] shared_non_seqs = []
givens = {} givens = {}
# if the number of outputs to the function does not match the number of # if the number of outputs to the function does not match the number of
# assumed outputs # assumed outputs until now (provided by the initial case) there can be
# only one explanation that we now how to deal with. Namely no information
if len(inner_fn_out_states) != n_outs: # is provided for any outputs which will indicate that we deal with a map,
# i.e. we never use previous values of outputs
if len(inner_fn_outs) != n_outs:
if outs_info == []: if outs_info == []:
# We know how to deal with this case, assume that none of the outputs # We know how to deal with this case, assume that none of the outputs
# are required to have any sort of time taps # are required to have any sort of time taps
# we just need to update the number of actual outputs # we just need to update the number of actual outputs
n_outs = len(inner_fn_out_states) n_outs = len(inner_fn_outs)
# other updates : # other updates :
for i in xrange(n_outs): for i in xrange(n_outs):
outs_info += [ dict() ] outs_info += [ dict() ]
# we also need to re-initialize the store_steps list to match the
# number of outputs
store_steps = [ 0 for i in xrange(n_outs)]
else: else:
# Otherwise there is a bit of confusion, since Scan works on the index of
# a sequence /output. There are maybe corner cases that could be added here
# or defult behaviour ( like always add the extra outputs at the end !?)
# But I did not bother implementing this, I leave it to the user to clearly
# express what he/she wants to do
raise ValueError('There has been a terrible mistake in our input arguments' raise ValueError('There has been a terrible mistake in our input arguments'
' and scan is totally lost. Make sure that you indicate for every ' ' and scan is totally lost. Make sure that you indicate for every '
' output what taps you want to use, or None, if you do not want to ' ' output what taps you want to use, or None, if you do not want to '
' use any !') ' use any !')
inner_fn_inputs=[input.variable for input in \ inner_fn_inputs=[input.variable for input in \
dummy_f.maker.expanded_inputs[:dummy_notshared_ins+dummy_notshared_init_outs]] dummy_f.maker.expanded_inputs[:dummy_notshared_ins+dummy_notshared_init_outs]]
fromIdx = dummy_notshared_ins + dummy_notshared_init_outs
store_steps = [ 0 for i in xrange(n_outs)]
for i in xrange(n_outs): # Keep track of the range (place) where you insert shared variables with updates
if outs_info[i].get('return_steps', None): # Because we will not be able to compute the gradient with respect to those variables
store_steps[i] = outs_info[i]['return_steps'] # inner_fn_notshared_ins_idx is from where these shared variables with updates start
inner_fn_notshared_ins_idx = dummy_notshared_ins + dummy_notshared_init_outs
# add shared variable that act as outputs # Because scan is particularly sensitive at the order in which it gets its
# # arguments, we need to separete the shared variables that act as outputs
# from those that are not outputs of the network as well
n_extended_outs = n_outs n_extended_outs = n_outs
# Skip the slices that we've added to the inner_fn which will be the first elements
# of f.maker.epanded_inputs and which we know that are not shared
fromIdx = dummy_notshared_ins + dummy_notshared_init_outs
for input in dummy_f.maker.expanded_inputs[fromIdx:] : for input in dummy_f.maker.expanded_inputs[fromIdx:] :
# If input is a shared variable that gets updated, then
# this shared variable will be an output of our inner function
if isinstance(input.variable, SharedVariable) and input.update: if isinstance(input.variable, SharedVariable) and input.update:
# Create a copy of it
new_var = input.variable.type() new_var = input.variable.type()
inner_fn_inputs.append(new_var) inner_fn_inputs.append(new_var)
if slice_to_seqs: # add it to the slices at the end
val = slice_to_seqs[-1] slice_to_seqs += [ n_extended_outs ]
else: val = -1 inner_fn_outs += [input.update]
slice_to_seqs += [ val+1 ]
inner_fn_out_states += [input.update]
update_map[ input.variable ] = n_extended_outs update_map[ input.variable ] = n_extended_outs
# We know that we only have access to the last step
outputs_taps[ n_extended_outs ] = [-1] outputs_taps[ n_extended_outs ] = [-1]
n_extended_outs += 1 n_extended_outs += 1
# we shouldn't try to store more then the last step
# this might not even be a tensor ! ( RandomState )
store_steps += [1] store_steps += [1]
return_steps[n_extended_outs -1] = 1
shared_outs += [input.variable] shared_outs += [input.variable]
givens[input.variable] = inner_fn_inputs[-1] givens[input.variable] = inner_fn_inputs[-1]
# inner_fn_shared_ins_idx stores where we stop having shared variables with updates
inner_fn_shared_ins_idx = len(inner_fn_inputs) - inner_fn_notshared_ins_idx
# add the rest: # Now that we took out the shared variables that have an update rule
# we need to take care of all the other shared variables
for input in dummy_f.maker.expanded_inputs[fromIdx:] : for input in dummy_f.maker.expanded_inputs[fromIdx:] :
# make sure that we do not add the same shared variable twice
if isinstance(input.variable, SharedVariable) and not input.update: if isinstance(input.variable, SharedVariable) and not input.update:
shared_non_seqs += [input.variable] shared_non_seqs += [input.variable]
inner_fn_inputs += [input.variable.type() ] inner_fn_inputs += [input.variable.type() ]
if slice_to_seqs: slice_to_seqs += [ n_extended_outs]
val = slice_to_seqs[-1]
else: val = -1
slice_to_seqs += [val +1]
givens[input.variable] = inner_fn_inputs[-1] givens[input.variable] = inner_fn_inputs[-1]
elif not isinstance(input.variable, SharedVariable): elif not isinstance(input.variable, SharedVariable):
# also add the normal tensor that are non sequences at the
# end of the inputs intertwingled with the shared variables
inner_fn_inputs.append(input.variable) inner_fn_inputs.append(input.variable)
if type(n_steps) in (float,int):
n_fixed_steps = int(n_steps)
else:
# check if it is actually a Theano constant
try :
n_fixed_steps = opt.get_constant_value(n_steps)
except:
n_fixed_steps = None
# If we haven't provided a number of steps nor did we provide a sequence
# scan will not know how long to iterate
if (n_steps == None or n_steps == numpy.inf or n_steps == numpy.nan) and n_seqs == 0 : if (n_steps == None or n_steps == numpy.inf or n_steps == numpy.nan) and n_seqs == 0 :
raise ValueError('Scan does not know for how many steps to iterate. ' raise ValueError('Scan does not know for how many steps to iterate. '
'You need to provide the number of steps through the ' 'You need to provide the number of steps through the '
' ``n_steps`` argument if you do not iterate over any sequence') ' ``n_steps`` argument if you do not iterate over any sequence')
# Create the Scan op object # We can now create the Scan Op Object
local_op = Scan( (inner_fn_inputs,inner_fn_out_states, givens, slice_to_seqs ), n_seqs,
n_extended_outs, inplace_map, sequences_taps, outputs_taps, truncate_gradient, if n_fixed_steps not in [1,-1]:
go_backwards, store_steps, mode, n_fixed_steps = n_fixed_steps, name = name)
if n_steps != None:
n_steps = tensor.as_tensor(n_steps)
else:
n_steps = gof.Constant(gof.generic, 'unknown', '?_steps')
local_op = Scan( (inner_fn_inputs,inner_fn_outs, givens, slice_to_seqs ), n_seqs,
n_extended_outs, inplace_map, sequences_taps, outputs_taps, n_steps,truncate_gradient,
# n_outs, inner_fn_notshared_ins_idx and inner_fn_shared_ins_idx are used by the gradient
# to figure out where in the input are shared variables with updates, for whom I can't compute
# a gradient
n_outs, inner_fn_notshared_ins_idx, inner_fn_shared_ins_idx,
go_backwards, store_steps, return_steps, mode, name = name )
# Call the object on the input sequences, initial values for outs, # Call the object on the input sequences, initial values for outs,
# and non sequences # and non sequences
...@@ -636,12 +829,6 @@ def scan(fn, sequences=[], outputs_info=[], non_sequences=[], ...@@ -636,12 +829,6 @@ def scan(fn, sequences=[], outputs_info=[], non_sequences=[],
unwrapped_seqs = [ seq.get('input',tensor.as_tensor(0.)) for seq in seqs ] unwrapped_seqs = [ seq.get('input',tensor.as_tensor(0.)) for seq in seqs ]
unwrapped_outs = [ out.get('initial',tensor.as_tensor(0.)) for out in outs_info ] unwrapped_outs = [ out.get('initial',tensor.as_tensor(0.)) for out in outs_info ]
if n_steps != None:
n_steps = tensor.as_tensor(n_steps)
else:
#n_steps = tensor.constant(numpy.inf,'?_steps')
n_steps = gof.Constant(gof.generic, 'unknown', '?_steps')
values = local_op( *( [n_steps] values = local_op( *( [n_steps]
+ unwrapped_seqs + unwrapped_seqs
+ unwrapped_outs + unwrapped_outs
...@@ -649,16 +836,65 @@ def scan(fn, sequences=[], outputs_info=[], non_sequences=[], ...@@ -649,16 +836,65 @@ def scan(fn, sequences=[], outputs_info=[], non_sequences=[],
+ notshared_other_args + notshared_other_args
+ shared_non_seqs)) + shared_non_seqs))
else:
# If we do not actually need scan
for pos, inner_out in enumerate(inner_fn_outs):
# check if we are suppose to return just the last step
# we treat this case differently because the tensor we return
# in this case is different (it has one dimension less)
if return_steps.has_key(pos):
if return_steps[pos] != 1:
# if we return more then one step, we need to add
# one more dimension to our output and make it
# unbroadcastable
inner_fn_outs[pos] = tensor.unbroadcast(
tensor.shape_padleft(inner_out),0)
else:
# same if we do not have any information about how many
# steps we should return (to read return everything in this
# case
inner_fn_outs[pos] = tensor.unbroadcast(
tensor.shape_padleft(inner_out),0)
values = inner_fn_outs
if not type(values) in (tuple, list): if not type(values) in (tuple, list):
values = [values] values = [values]
# take out the updates of shared variable and build the dictionary
# that tells what to update and with what value
for val in update_map.keys(): for val in update_map.keys():
update_map[val] = values [ update_map[val] ] update_map[val] = values [ update_map[val] ]
# Now we need to check the values returned
# if it just one strip the list around it
if n_outs == 1: if n_outs == 1:
# if we need to return just one step or several steps
# note that when we return one step we have two cases, in
# the first one store_steps is set to 1, case in which we don't
# need to take a slice of the output (is already of the right
# dimension) and case 2 when we store more then one step,
# and we actually need to take a slice
if return_steps.has_key(0):
if return_steps[0] > 1:
values = values[0][-return_steps[0]:]
else:
if store_steps[0] == 1:
values = values[0]
else:
values = values[0][-1]
else:
values = values[0] values = values[0]
else: else:
values = values[:n_outs] values = values[:n_outs]
for idx,val in enumerate(values):
if return_steps.has_key(idx):
if return_steps[idx] > 1:
values[idx] = val[-return_steps[idx]:]
else:
if store_steps[idx] == 1:
values[idx] = val
else:
values[idx] = val[-1]
return (values, update_map) return (values, update_map)
...@@ -669,10 +905,11 @@ class Scan(Op): ...@@ -669,10 +905,11 @@ class Scan(Op):
def __init__(self,(inputs, outputs, givens, slice_to_seqs),n_seqs, n_outs, def __init__(self,(inputs, outputs, givens, slice_to_seqs),n_seqs, n_outs,
inplace_map={}, seqs_taps={}, outs_taps={}, inplace_map={}, seqs_taps={}, outs_taps={},
truncate_gradient = -1, n_steps = gof.Constant(gof.generic, 'unknown', '?_steps'),
truncate_gradient = -1, n_outs_not_shared =0,
inner_fn_start_shared = 0, inner_fn_end_shared = 0,
go_backwards = False, store_steps = {}, go_backwards = False, store_steps = {},
mode = None, n_fixed_steps = None, inplace=False, return_steps={}, mode = None, inplace=False, name = None):
name = None):
''' '''
:param (inputs,outputs, givens,slice_to_seqs): :param (inputs,outputs, givens,slice_to_seqs):
inputs and outputs Theano variables that describe the function that is inputs and outputs Theano variables that describe the function that is
...@@ -694,27 +931,9 @@ class Scan(Op): ...@@ -694,27 +931,9 @@ class Scan(Op):
steps (from the end towards the begining) of the outputs you really need and should steps (from the end towards the begining) of the outputs you really need and should
return; given this information, scan can know (if possible) to allocate only return; given this information, scan can know (if possible) to allocate only
the amount of memory needed to compute that many entries the amount of memory needed to compute that many entries
:param n_fixed_steps: this is a number if n_steps in the scan function
received a number or None otherwise. The value is used to optimize
the graph, since a scan that has n_steps fixed to 1 or 0 is not
really needed in the graph. (? could we use tag hints ?)
:param name: see scan fct :param name: see scan fct
:param mode: see scan fct :param mode: see scan fct
''' '''
#check sequences past taps
for k,v in seqs_taps.iteritems():
if k > n_seqs:
raise ValueError(('Sequences past taps dictionary reffers to '
'an unexisting sequence %d')%k)
#check outputs past taps
for k,v in outs_taps.iteritems():
if k > n_outs:
raise ValueError(('Output past taps dictionary reffers to '
'an unexisting sequence %d')%k)
if v and (max(v) > -1):
raise ValueError(('Can not require future value %d of output' \
' %d')%(k,max(v)))
# build a list of output types for any Apply node using this op. # build a list of output types for any Apply node using this op.
self.apply_output_types = [] self.apply_output_types = []
for i, o in enumerate(outputs): for i, o in enumerate(outputs):
...@@ -726,10 +945,11 @@ class Scan(Op): ...@@ -726,10 +945,11 @@ class Scan(Op):
dtype=o.type.dtype) dtype=o.type.dtype)
self.apply_output_types.append(expanded_otype) self.apply_output_types.append(expanded_otype)
self.destroy_map = {} self.destroy_map = {}
if inplace: if inplace:
for i in inplace_map.keys(): for i in inplace_map.keys():
# the n_steps is always the first argument of scan's perform,
# so we need to shift everything by 1
self.destroy_map.update({i: [inplace_map[i]+1] } ) self.destroy_map.update({i: [inplace_map[i]+1] } )
# make all inplace inputs mutable for the inner function for extra efficency # make all inplace inputs mutable for the inner function for extra efficency
for idx in xrange(len(inputs)): for idx in xrange(len(inputs)):
...@@ -750,13 +970,17 @@ class Scan(Op): ...@@ -750,13 +970,17 @@ class Scan(Op):
self.store_steps = store_steps self.store_steps = store_steps
self.inplace = inplace self.inplace = inplace
self.inputs = inputs self.inputs = inputs
self.return_steps = return_steps
self.givens = givens self.givens = givens
self.n_outs_not_shared = n_outs_not_shared
self.inner_fn_start_shared = inner_fn_start_shared
self.inner_fn_end_shared = inner_fn_end_shared
self.outputs = outputs self.outputs = outputs
self.n_steps = n_steps # It will be computed at runtime
# This is here just for an optimization to be able to pick up if # This is here just for an optimization to be able to pick up if
# scan is really needed in the graph; if the number of steps # scan is really needed in the graph; if the number of steps
# scan does is a constant of 1, -1 or 0 then we can remove scan # scan does is a constant of 1, -1 or 0 then we can remove scan
# from the graph # from the graph
self.n_fixed_steps = n_fixed_steps
self.mode = mode self.mode = mode
self.truncate_gradient = truncate_gradient self.truncate_gradient = truncate_gradient
self.go_backwards = go_backwards self.go_backwards = go_backwards
...@@ -777,6 +1001,8 @@ class Scan(Op): ...@@ -777,6 +1001,8 @@ class Scan(Op):
if name is None: name = 'scan_fn' if name is None: name = 'scan_fn'
self.fn = function(inputs,outputs, mode = mode_instance, givens = givens, self.fn = function(inputs,outputs, mode = mode_instance, givens = givens,
name = name) name = name)
# asert that we don't have shasred variables anymore ( we replaced them
# with non shared versions)
assert not numpy.any([isinstance(x.variable,SharedVariable) for x in assert not numpy.any([isinstance(x.variable,SharedVariable) for x in
self.fn.maker.inputs]) self.fn.maker.inputs])
...@@ -799,13 +1025,16 @@ class Scan(Op): ...@@ -799,13 +1025,16 @@ class Scan(Op):
(self.seqs_taps == other.seqs_taps) and \ (self.seqs_taps == other.seqs_taps) and \
(self.outs_taps == other.outs_taps) and \ (self.outs_taps == other.outs_taps) and \
(self.inplace_map == other.inplace_map) and \ (self.inplace_map == other.inplace_map) and \
(self.return_steps == other.return_steps) and \
(self.n_outs_not_shared == other.n_outs_not_shared) and \
(self.inner_fn_start_shared == other.inner_fn_start_shared) and\
(self.inner_fn_end_shared == other.inner_fn_end_shared) and \
(self.mode == other.mode) and \ (self.mode == other.mode) and \
(self.n_seqs == other.n_seqs) and\ (self.n_seqs == other.n_seqs) and\
(self.inplace == other.inplace) and\ (self.inplace == other.inplace) and\
(self.go_backwards == other.go_backwards) and\ (self.go_backwards == other.go_backwards) and\
(self.truncate_gradient == other.truncate_gradient) and\ (self.truncate_gradient == other.truncate_gradient) and\
(self.n_outs == other.n_outs) and\ (self.n_outs == other.n_outs) and\
(self.n_fixed_steps == other.n_fixed_steps) and\
(self.n_args == other.n_args) (self.n_args == other.n_args)
return rval return rval
...@@ -816,17 +1045,20 @@ class Scan(Op): ...@@ -816,17 +1045,20 @@ class Scan(Op):
return hash(type(self)) ^ \ return hash(type(self)) ^ \
hash(self.n_seqs) ^ \ hash(self.n_seqs) ^ \
hash(self.n_outs) ^ \ hash(self.n_outs) ^ \
hash(self.n_outs_not_shared) ^ \
hash(self.inner_fn_start_shared) ^\
hash(self.inner_fn_end_shared) ^\
hash(self.inplace) ^\ hash(self.inplace) ^\
hash(self.go_backwards) ^\ hash(self.go_backwards) ^\
hash(self.truncate_gradient) ^\ hash(self.truncate_gradient) ^\
hash(self.n_args) ^ \ hash(self.n_args) ^ \
hash(self.mode) ^\ hash(self.mode) ^\
hash(self.n_fixed_steps) ^\
hash_listsDictsTuples(self.outputs) ^ \ hash_listsDictsTuples(self.outputs) ^ \
hash_listsDictsTuples(self.inputs) ^ \ hash_listsDictsTuples(self.inputs) ^ \
hash_listsDictsTuples(self.givens) ^ \ hash_listsDictsTuples(self.givens) ^ \
hash_listsDictsTuples(self.seqs_taps) ^\ hash_listsDictsTuples(self.seqs_taps) ^\
hash_listsDictsTuples(self.outs_taps) ^\ hash_listsDictsTuples(self.outs_taps) ^\
hash_listsDictsTuples(self.return_steps) ^\
hash_listsDictsTuples(self.store_steps) hash_listsDictsTuples(self.store_steps)
...@@ -1075,90 +1307,115 @@ class Scan(Op): ...@@ -1075,90 +1307,115 @@ class Scan(Op):
except: except:
y[j] = numpy.empty( (self.store_steps[j],)+something[j].shape, \ y[j] = numpy.empty( (self.store_steps[j],)+something[j].shape, \
dtype = something[j].dtype) dtype = something[j].dtype)
y[j][idx_sotre_steps[j]] = something[j] y[j][idx_store_steps[j]] = something[j]
self.idx_store_steps[j] = (self.idx_store_steps[j] + 1) % self.store_steps[j] self.idx_store_steps[j] = (self.idx_store_steps[j] + 1) % self.store_steps[j]
return y return y
def grad(self, args, g_outs): def grad(self, args, g_outs):
raise NotImplementedError('This will be implemented in the near future'); # forward pass - get the outputs after applying scan
''' scan_outputs = self(*args)
if True: # make sure they are given as a list
#((self.updates.keys() != []) or (self.inplace_map.keys() != [])\ if not( type(scan_outputs) in (list,tuple)):
# or numpy.any(self.store_steps)): scan_outputs = [scan_outputs]
# warning('Can not compute gradients if inplace or updates ' \ # get a list of clean inputs ( against which one can compute
# 'are used or if you do not keep past value of outputs.'\ # gradients ) [ everything except shared variables with updates ]
# 'Use force_gradient if you know for sure '\ clean_inputs = self.inputs[:self.inner_fn_start_shared] + \
# 'that the gradient can be computed automatically.') self.inputs[self.inner_fn_start_shared + \
warning('Gradient not fully tested yet !') self.inner_fn_end_shared:]
return [None for i in args] # function that computes the gradient (we sum over the gradients
else: # with respect to all outputs
# forward pass
y = self(*args)
if not( type(y) in (list,tuple)):
y = [y]
g_y = [outputs[0].type()]
def compute_gradient(y, g_y): def compute_gradient(y, g_y):
gmap = gradient.grad_sources_inputs( \ gmap = gradient.grad_sources_inputs( \
[(y,g_y)], gof.graph.inputs([y]), False) [(y,g_y)], clean_inputs, False)
def zero(p): def zero(p):
try:
use_dtype = p.type.dtype
except:
use_dtype = theano.config.floatX
return tensor.TensorConstant(tensor.TensorType(\ return tensor.TensorConstant(tensor.TensorType(\
dtype=p.type.dtype, broadcastable=[]), dtype=use_dtype, broadcastable=[]),
safe_asarray._asarray(0,dtype = p.type.dtype)) safe_asarray._asarray(0,dtype = use_dtype))
return [gmap.get(p, zero(p)) for p in inputs] return [gmap.get(p, zero(p)) for p in self.inputs]
i = 0 # this are g_outs for the inner function (that computes the gradients)
while inner_g_outs = []
g_args = compute_gradient( outputs[0], g_y[-1]) # the outs of the gradient computting inner function
# for all outputs compute gradients and then sum them up inner_gfn_outs = []
for y in outputs[1:]: inner_gfn_ins = []
g_y += [y.type()] # Go through the outputs that don't represent update rules
g_args_y = compute_gradient( y,g_y[-1]) for out in self.outputs[:self.n_outs_not_shared]:
for i in xrange(len(g_args)): inner_g_out = out.type()
g_args[i] += g_args_y[i] if out.name:
# for debugging add names to all variables I'm creating
g_y.name = 'g_'+out.name
inner_g_outs.append(inner_g_out)
_grad_outs = compute_gradient(out, inner_g_out)
grad_outs = _grad_outs[:self.n_seqs+self.n_outs_not_shared] + \
_grad_outs[self.n_seqs+self.n_outs:]
if not inner_gfn_outs :
inner_gfn_outs = grad_outs
else:
# safety check, some of this inputs might still not be differentiable,
# for those we don't add them to the mix (assume their gradient is 0)
for i,(x,y) in enumerate(zip(grad_outs, inner_gfn_outs)):
if x and y:
inner_gfn_outs[i] = x+y
elif y:
inner_gfn_outs[i] = y
else:
inner_gfn_outs[i] = x
self.g_ins = g_y+inputs
self.g_outs = g_args
# backwards pass # backwards pass
for i in xrange(len(y)): for i in xrange(len(inner_gfn_outs)):
if inner_gfn_outs[i] == None:
inner_gfn_outs[i] = tensor.zeros_like(clean_inputs[i])
for i in xrange(self.n_outs_not_shared):
# Safety check
if g_outs[i] == None: if g_outs[i] == None:
g_outs[i] = tensor.zeros_like(y[i]) try:
# this try is for catching non ndarray inputs (random states)
g_args = [self.n_steps]+g_outs + y # it is more of a safety check ( all random states should be
# check if go_backwards is true # after n_outs_not_shared ...
if self.go_backwards: g_outs[i] = tensor.zeros_like(scan_outputs[i])
for seq in args[1:self.n_seqs]: except:
g_args += [seq[::-1]] g_outs[i] = theano.tensor.constant(numpy.array(0,dtype=theano.config.floatX))
else: inner_gfn_ins = inner_g_outs + self.inputs
g_args += args[1:self.n_seqs]
g_args += args[1+self.n_seqs: ]
g_args = [self.n_steps] + g_outs[:self.n_outs_not_shared] \
+ scan_outputs + args[1:]
g_scan = ScanGrad((self.g_ins,self.g_outs), self.n_seqs, \ g_scan = ScanGrad((inner_gfn_ins, inner_gfn_outs),
self.n_outs,self.seqs_taps, self.outs_taps, self.n_seqs, self.n_outs, self.n_outs_not_shared,
self.go_backwards, self.seqs_taps, self.outs_taps,
self.truncate_gradient) self.truncate_gradient)
g_scan_outs = g_scan(g_args)
return g_scan(g_args) # We need to add several None's fpr shared vars with updates
''' gradients = [None] + g_scan_outs[:self.n_seqs+self.n_outs_not_shared]
gradients += [None for i in xrange(self.n_outs-self.n_outs_not_shared)]
gradients += g_scan_outs[self.n_seqs+self.n_outs_not_shared:]
return gradients
'''
class ScanGrad(Op): class ScanGrad(Op):
"""Gradient Op for Scan""" """Gradient Op for Scan"""
def __init__(self,(g_ins, g_outs) , n_seqs, n_outs, def __init__(self,(g_ins, g_outs) , n_seqs, n_outs,
seqs_taps = {}, outs_taps= {}, truncate_gradient = -1): n_outs_not_shared,
go_backwards = False, seqs_taps = {}, outs_taps= {},
truncate_gradient = -1):
self.grad_fn = function(g_ins, g_outs) self.grad_fn = function(g_ins, g_outs)
self.inputs = g_ins self.inputs = g_ins
self.outputs = g_outs self.outputs = g_outs
self.n_outs_not_shared = n_outs_not_shared
self.n_seqs = n_seqs self.n_seqs = n_seqs
self.go_backwards = go_backwards
self.truncate_gradient = truncate_gradient self.truncate_gradient = truncate_gradient
self.n_outs = n_outs self.n_outs = n_outs
self.seqs_taps = seqs_taps self.seqs_taps = seqs_taps
...@@ -1173,6 +1430,8 @@ class ScanGrad(Op): ...@@ -1173,6 +1430,8 @@ class ScanGrad(Op):
(self.outputs == other.outputs) and \ (self.outputs == other.outputs) and \
(self.n_seqs == other.n_seqs) and \ (self.n_seqs == other.n_seqs) and \
(self.n_outs == other.n_outs) and \ (self.n_outs == other.n_outs) and \
(self.go_backwards == other.go_backwards) and \
(self.n_outs_not_shared == other.n_outs_not_shared) and\
(self.truncate_gradient == other.truncate_gradient) and\ (self.truncate_gradient == other.truncate_gradient) and\
(self.seqs_taps == other.seqs_taps) and \ (self.seqs_taps == other.seqs_taps) and \
(self.outs_taps == other.outs_taps) (self.outs_taps == other.outs_taps)
...@@ -1182,11 +1441,12 @@ class ScanGrad(Op): ...@@ -1182,11 +1441,12 @@ class ScanGrad(Op):
return hash(type(self)) ^ \ return hash(type(self)) ^ \
hash(self.n_seqs) ^ \ hash(self.n_seqs) ^ \
hash(self.n_outs) ^ \ hash(self.n_outs) ^ \
hash(self.go_backwards) ^\
hash(self.truncate_gradient) ^\ hash(self.truncate_gradient) ^\
hash_list(self.inputs) ^ \ hash_listsDictsTuples(self.inputs) ^ \
hash_list(self.outputs) ^ \ hash_listsDictsTuples(self.outputs) ^ \
hash_dict(self.seqs_taps) ^ \ hash_listsDictsTuples(self.seqs_taps) ^ \
hash_dict(self.outs_taps) hash_listsDictsTuples(self.outs_taps)
def make_node(self, *args): def make_node(self, *args):
# input of the gradient op : # input of the gradient op :
...@@ -1195,36 +1455,96 @@ class ScanGrad(Op): ...@@ -1195,36 +1455,96 @@ class ScanGrad(Op):
# return # return
# | grad of seqs | grad of outs | grad of non_seqs | # | grad of seqs | grad of outs | grad of non_seqs |
# | n_seqs | n_outs | unknown | # | n_seqs | n_outs | unknown |
return Apply(self, list(args),
[i.type() for i in args[1+2*self.n_outs:] ]) scan_inputs = args[0][1+self.n_outs_not_shared+self.n_outs:]
outputs_grad = scan_inputs[:self.n_seqs+self.n_outs_not_shared]
outputs_grad += scan_inputs[self.n_seqs+self.n_outs:]
return Apply(self, list(args[0]),
[i.type() for i in outputs_grad ])
def perform(self, node, args, storage): def perform(self, node, args, storage):
# get scan inputs # get scan inputs
n_steps = args[0] n_steps = args[0]
inputs = args[2*self.n_outs+1:]
if n_steps != 'unknown':
n_steps = int(n_steps)
if n_steps < 0:
n_steps = abs(n_steps)
go_backwards = not self.go_backwards
else:
go_backwards = self.go_backwards
else:
n_steps = None
go_backwards = self.go_backwards
inputs = args[self.n_outs_not_shared+self.n_outs+1:]
seqs = inputs[:self.n_seqs] seqs = inputs[:self.n_seqs]
seeds = inputs[self.n_seqs:self.n_seqs+self.n_outs] outInfo = inputs[self.n_seqs:self.n_seqs+self.n_outs]
non_seqs = inputs[self.n_outs+self.n_seqs:] non_seqs = inputs[self.n_outs+self.n_seqs:]
# generate space for gradient if (self.n_seqs == 0 ) and (not numpy.isfinite(n_steps) ):
g_seqs = [numpy.zeros_like(k) for k in seqs] raise ValueError('Scan does not know how many steps it '
g_seeds = [numpy.zeros_like(k) for k in seeds] 'should iterate! Either provide some input sequences from '
g_non_seqs = [numpy.zeros_like(k) for k in non_seqs] 'which scan could find out the number of steps, or directly'
# get gradient from above 'the number of steps you want through the n_steps argument.')
g_outs = args[:self.n_outs]
# get the output of the scan operation for i in xrange(self.n_seqs):
outs = args[self.n_outs:2*self.n_outs] if self.seqs_taps.has_key(i):
# compute actual length of the sequence ( we need to see what
# past taps this sequence has, and leave room for them
seq_len = seqs[i].shape[0] + min(self.seqs_taps[i])
if max( self.seqs_taps[i]) > 0:
# using future values, so need to end the sequence earlier
seq_len -= max(self.seqs_taps[i])
if n_steps == None :
# length of the sequences, leaving room for the largest
n_steps = seq_len
if seq_len != n_steps :
if seq_len > n_steps:
warning('Input sequence is longer then required. '
'Extra values will be ignored')
else:
warning(' Input sequence is shorter then the number '
'of steps scan was suppose to do. Readjusting'
'the number of steps scan will iterate ... ')
n_steps = min(seq_len,n_steps)
# go back through time to 0 or n_steps - truncate_gradient # go back through time to 0 or n_steps - truncate_gradient
lower_limit = n_steps - self.truncate_gradient lower_limit = n_steps - self.truncate_gradient
length = n_steps
if lower_limit > n_steps-1: if lower_limit > n_steps-1:
the_range = xrange(n_steps-1,-1,-1) the_range = xrange(n_steps-1,-1,-1)
lower_limit = 0
elif lower_limit < -1: elif lower_limit < -1:
the_range = xrange(n_steps-1,-1,-1) the_range = xrange(n_steps-1,-1,-1)
lower_limit = 0
else:
the_range = xrange(n_steps-1, lower_limit-1,-1)
lower_limit = lower_limit + 1
# generate space for gradient
if lower_limit != 0 :
length = len(the_range)
g_seqs = []
# Check for taps ==> you need to enlarge the sequence length
for j in xrange(self.n_seqs):
if self.seqs_taps.has_key(j):
length = length - min(self.seqs_taps[j])
length = length + max(self.seqs_taps[j])
g_seqs += [ numpy.zeros_like(seqs[j][:length]) ]
else: else:
the_range = xrange(n_steps-1, lower_limit,-1) g_seqs = [numpy.zeros_like(k) for k in seqs]
g_outInfo = [numpy.zeros_like(k) \
for k in outInfo[:self.n_outs_not_shared]]
g_non_seqs = [numpy.zeros_like(k) for k in non_seqs]
# get gradient on the outputs
g_outs = args[1:self.n_outs_not_shared+1]
# get the output of the scan operation
outs = args[1+self.n_outs_not_shared:self.n_outs_not_shared+self.n_outs+1]
...@@ -1234,46 +1554,62 @@ class ScanGrad(Op): ...@@ -1234,46 +1554,62 @@ class ScanGrad(Op):
seqs_mins.update({j: min(self.seqs_taps[j])}) seqs_mins.update({j: min(self.seqs_taps[j])})
outs_mins = {} outs_mins = {}
seed_size = {} initOuts_size = {}
for j in xrange(self.n_outs): for j in xrange(self.n_outs):
if self.outs_taps.has_key(j): if j >= self.n_outs_not_shared:
outs_mins.update({j:-1})
initOuts_size.update({j:0})
elif self.outs_taps.has_key(j):
outs_mins.update({j: min(self.outs_taps[j])}) outs_mins.update({j: min(self.outs_taps[j])})
seed_size.update({j: g_seeds[j].shape[0]}) if self.outs_taps[j] != [-1]:
initOuts_size.update({j:g_outInfo[j].shape[0]})
else:
initOuts_size.update({j:0})
for i in the_range: for i in the_range:
# time slice of inputs # time slice of inputs
_ins = [] _ins = []
_i = i
if go_backwards:
_i = n_steps -1 -i
for j in xrange(self.n_seqs): for j in xrange(self.n_seqs):
if self.seqs_taps.has_key(j): if self.seqs_taps.has_key(j):
ls_taps = self.seqs_taps[j] ls_taps = self.seqs_taps[j]
min_tap = seqs_mins[j] min_tap = seqs_mins[j]
for tap_value in ls_taps: for tap_value in ls_taps:
k = i - min_tap + tap_value k = _i - min_tap + tap_value
_ins += [ins[j][k]] _ins += [seqs[j][k]]
# time slice of outputs + taps # time slice of outputs + taps
_outs = [] _outs = []
for j in xrange(self.n_outs): for j in xrange(self.n_outs):
if self.outs_taps.has_key(j): if self.outs_taps.has_key(j):
ls_taps = self.outs_taps[j] ls_taps = self.outs_taps[j]
min_tap = outs_mins[j] min_tap = outs_mins[j]
seed_sz = seed_size[j] seed_sz = initOuts_size[j]
for tap_value in ls_taps: for tap_value in ls_taps:
if i + tap_value < 0: if i + tap_value < 0:
if seed_sz < 1:
_outs += [outInfo[j]]
else:
k = i + seed_sz + tap_value k = i + seed_sz + tap_value
if k < 0 : if k < 0 :
#past value not provided .. issue a warning and use 0 #past value not provided .. issue a warning and use 0
_outs += [numpy.zeros(seeds[j][0].shape)] _outs += [numpy.zeros(outInfo[j][0].shape)]
warning('Past value %d for output $d not given' \ warning('Past value %d for output $d not given' \
%(j,tap_value)) %(j,tap_value))
else: else:
_outs += [seeds[j][k]] _outs += [outInfo[j][k]]
else:
if j>= self.n_outs_not_shared:
_outs += [outs[j] ]
else: else:
_outs += [outs[j][i + tap_value]] _outs += [outs[j][i + tap_value]]
g_out = []
g_out = [arg[i] for arg in g_outs] g_out = [ arg[i] for arg in g_outs]
grad_args = g_out + _ins + _outs + non_seqs grad_args = g_out + _ins + _outs + non_seqs
grads=self.grad_fn(*grad_args) grads=self.grad_fn(*grad_args)
# get gradient for inputs # get gradient for inputs
pos = 0 pos = 0
for j in xrange(self.n_seqs): for j in xrange(self.n_seqs):
...@@ -1281,22 +1617,29 @@ class ScanGrad(Op): ...@@ -1281,22 +1617,29 @@ class ScanGrad(Op):
ls_taps = self.seqs_taps[j] ls_taps = self.seqs_taps[j]
min_tap = seqs_mins[j] min_tap = seqs_mins[j]
for tap_value in ls_taps : for tap_value in ls_taps :
k = i - min_tap + tap_value k = _i - min_tap + tap_value
g_ins[j][k] += grads[pos] print k, lower_limit, k-lower_limit
print g_seqs[j].shape
g_seqs[j][k-lower_limit] += grads[pos]
pos += 1 pos += 1
# get gradient for outputs # get gradient for outputs
for j in xrange(self.n_outs): for j in xrange(self.n_outs_not_shared):
if self.outs_taps.has_key(j): if self.outs_taps.has_key(j):
ls_taps = self.outs_taps[j] ls_taps = self.outs_taps[j]
min_tap = outs_mins[j] min_tap = outs_mins[j]
seed_sz = seed_size[j] seed_sz = initOuts_size[j]
for tap_value in ls_taps: for tap_value in ls_taps:
if i+tap_value < 0 : if i+tap_value < 0 :
k = i + seed_sz + tap_value k = i + seed_sz + tap_value
if k > 0 : if k >= 0 :
g_seeds[j][k] += grads[pos] g_outInfo[j][k] += grads[pos]
else:
g_outInfo[j] += grads[pos]
else:
g_outs[j][i+tap_value] += grads[pos]
pos += 1 pos += 1
for j in xrange(len(g_non_seqs)): for j in xrange(len(g_non_seqs)):
g_non_seqs[j] += grads[j+pos] g_non_seqs[j] += grads[j+pos]
...@@ -1304,9 +1647,8 @@ class ScanGrad(Op): ...@@ -1304,9 +1647,8 @@ class ScanGrad(Op):
# return the gradient # return the gradient
for i,v in enumerate(g_ins + g_seeds+ g_non_seqs): for i,v in enumerate(g_seqs + g_outInfo+ g_non_seqs):
storage[i][0] = v storage[i][0] = v
'''
...@@ -1369,9 +1711,10 @@ class ScanSpaceOptimizer(Optimizer): ...@@ -1369,9 +1711,10 @@ class ScanSpaceOptimizer(Optimizer):
if numpy.any(store_steps!= op.store_steps): if numpy.any(store_steps!= op.store_steps):
new_scan = Scan((op.inputs, op.outputs, op.givens, new_scan = Scan((op.inputs, op.outputs, op.givens,
op.slice_to_seqs),op.n_seqs, op.n_outs, op.slice_to_seqs),op.n_seqs, op.n_outs,
op.inplace_map, op.seqs_taps, op.outs_taps, op.inplace_map, op.seqs_taps, op.outs_taps, op.n_steps,
op.truncate_gradient, op.go_backwards, op.truncate_gradient, op.n_outs_not_shared, op.inner_fn_start_shared,
store_steps, op.mode,op.n_fixed_steps, op.inner_fn_end_shared, op.go_backwards,
store_steps, op.return_steps, op.mode,
op.inplace, name = op.fn.name).make_node(*node.inputs) op.inplace, name = op.fn.name).make_node(*node.inputs)
# we not need to replace the outputs of scan # we not need to replace the outputs of scan
for i,out in enumerate(node.outputs): for i,out in enumerate(node.outputs):
...@@ -1397,9 +1740,10 @@ def scan_make_inplace(node): ...@@ -1397,9 +1740,10 @@ def scan_make_inplace(node):
op = node.op op = node.op
if isinstance(op, Scan) and (not op.inplace) and (op.inplace_map.keys() != []): if isinstance(op, Scan) and (not op.inplace) and (op.inplace_map.keys() != []):
return Scan((op.inputs, op.outputs, op.givens, op.slice_to_seqs ) , op.n_seqs, return Scan((op.inputs, op.outputs, op.givens, op.slice_to_seqs ) , op.n_seqs,
op.n_outs, op.inplace_map, op.seqs_taps, op.outs_taps, op.n_outs, op.inplace_map, op.seqs_taps, op.outs_taps, op.n_steps,
op.truncate_gradient, op.go_backwards, op.store_steps, op.mode, op.truncate_gradient, op.n_outs_not_shared, op.inner_fn_start_shared,
op.n_fixed_steps, inplace=True, name = op.fn.name).make_node(*node.inputs).outputs op.inner_fn_end_shared, op.go_backwards, op.store_steps, op.return_steps, op.mode,
inplace=True, name = op.fn.name).make_node(*node.inputs).outputs
return False return False
...@@ -1407,138 +1751,4 @@ optdb.register('scanOp_make_inplace', opt.in2out(scan_make_inplace, ...@@ -1407,138 +1751,4 @@ optdb.register('scanOp_make_inplace', opt.in2out(scan_make_inplace,
ignore_newtrees=True), 75, 'fast_run', 'inplace') ignore_newtrees=True), 75, 'fast_run', 'inplace')
class ScanRemoveFromGraph(Optimizer):
    ''' Graph Optmizer that removes scan if you just do a loop of 1 '''
    # NOTE(review): disabled at the bottom of the file (the optdb.register
    # call is commented out); it mutates node.inputs / Variable.owner
    # directly instead of using env.replace -- see my_replace below.
    def __init__(self):
        Optimizer.__init__(self)

    def add_requirements(self, env):
        # we call env.replace at the end, which requires the
        # ReplaceValidate feature on the env
        env.extend(toolbox.ReplaceValidate())

    def apply(self,env):
        """Inline every Scan node that runs for a single fixed step:
        substitute the scan node's inputs into the inner function's graph
        and replace the scan outputs with the (shape-padded) inner outputs.
        """
        nodelist = list(env.toposort())
        for node in nodelist:
            op = node.op
            # If it is a scan Op
            if isinstance(op, Scan) and op.n_fixed_steps != None:
                if abs(op.n_fixed_steps) < 2:
                    # Step 1 replace the inputs of the inner function
                    # with the inputs of scan
                    # Start replacing
                    # idx_curr_inp -> index that goes through the extended
                    # inputs of the op (includes shared variables) that are
                    # not provided to the node as inputs !!
                    idx_curr_inp = -1
                    # keeps track of what slice of the current input we are
                    # currently dealing with
                    # (note: shadows the builtin `slice` inside this scope)
                    slice = -1
                    # keeps track of the index that goes through the actual
                    # inputs of the node
                    idx_node_inp = 0
                    # pairs of variables that we need to replace in the end
                    replace_pairs = {}
                    # go through the inputs of the inner function
                    for i,inp in enumerate(op.inputs):
                        # figure what what slice of what node input this represents
                        if i < len(op.slice_to_seqs):
                            # slice_to_seqs is an array of the form [1 1 2 3 3 3 ],
                            # meaning that the 1st input of the inner function is a
                            # slice of the 1st input of scan, 2nd input of the inner
                            # function is a slice of the 1st input of scan and so on..
                            arg = op.slice_to_seqs[i]
                            # check if this is a slice of the current input
                            if arg == idx_curr_inp:
                                # if so increase the number of the current slice
                                slice+= 1
                            else:
                                # if not reset slice, make this the new current
                                # input
                                slice = 0
                                idx_curr_inp = arg
                                # and check if it is a shared variables
                                # scan deals with shared variables by replacing them
                                # with copies using the given argument of theano.function
                                # so if we have a shared variable it should appear in
                                # op.givens !!
                                if inp not in op.givens:
                                    # if it is not a shared variable increase the index
                                    # of the current input
                                    # note that we will jump to 1; this is fine since
                                    # node.inputs[0] is the number of steps, which we
                                    # should not consider here .. we care of what follows
                                    # namely the sequences, initial states, non sequences...
                                    idx_node_inp += 1
                            if inp not in op.givens:
                                # This is not a shared variable so we can replace it
                                # ( we should not replace the shared variables, theano.function
                                # will take care of shared variables here ..)
                                if idx_curr_inp >= op.n_seqs:
                                    # we are dealing with a initial state of some output
                                    # check if we are dealing with a 1 past tap output
                                    one_step = False
                                    if not op.outs_taps.has_key(idx_curr_inp-op.n_seqs):
                                        one_step = True
                                    else:
                                        if op.outs_taps[idx_curr_inp - op.n_seqs] == [-1]:
                                            one_step = True
                                    if one_step:
                                        # a single -1 tap: the node input is the initial
                                        # state itself, there is no extra time dimension
                                        node_input = node.inputs[idx_node_inp]
                                    else:
                                        # several taps: pick the corresponding time slice
                                        # out of the initial-state tensor
                                        tap = op.outs_taps[idx_curr_inp-op.n_seqs][slice]
                                        min_tap = min(op.outs_taps[idx_curr_inp-op.n_seqs])
                                        node_input = node.inputs[idx_node_inp][tap-min_tap]
                                else:
                                    # we are dealing with a slice of a sequence
                                    tap = op.seqs_taps[idx_curr_inp][slice]
                                    min_tap = min(op.seqs_taps[idx_curr_inp])
                                    node_input = node.inputs[idx_node_inp][tap-min_tap]
                                # add to our replace_pairs list
                                replace_pairs[inp] = node_input
                        else:
                            # if we got here this means we are dealing with non_sequences,
                            # which do not have slices !
                            # check to see if we are dealing with a shared variable
                            if inp not in op.givens:
                                # NOTE(review): the index is incremented *before* being
                                # used here, while the sequence branch increments it in a
                                # different spot -- verify the two stay aligned with the
                                # layout of node.inputs
                                idx_node_inp += 1
                                replace_pairs[inp] = node.inputs[idx_node_inp]

                    def my_replace( node, replace_pairs):
                        # Turns out that using env replace (while safe) is
                        # a real pain because of many condition that have to
                        # be met which I can not met while doing the
                        # replacement, so I did my little hack that does
                        # something like a replacement
                        # ASSUMPTIONS:
                        # we do not do anything crazy like replacing x
                        # with something in terms of x !
                        #
                        # we do not have envs or anything, just a simple
                        # computational graph that has not been compiled
                        # yet
                        if node:
                            # recursively rewrite the ancestors of `node`,
                            # splicing in the scan-node inputs in place of the
                            # inner-function inputs
                            for i,inp in enumerate(node.inputs):
                                if inp in replace_pairs:
                                    node.inputs[i] = replace_pairs[inp]
                                else:
                                    inp.owner = my_replace(inp.owner, replace_pairs)
                            return node
                        else:
                            return node

                    my_outs = op.outputs
                    for i, out in enumerate(my_outs):
                        my_outs[i].owner = my_replace(out.owner, replace_pairs)
                    for idx in xrange(len(my_outs)):
                        t = my_outs[idx]
                        # scan outputs have a leading time dimension; pad a
                        # length-1 leading dim back on.  NOTE(review):
                        # Rebroadcast((0,False)) forces that dim to be
                        # non-broadcastable, presumably to match the type of the
                        # original scan output -- confirm
                        nwout = tensor.Rebroadcast((0,False))(tensor.shape_padleft(t))
                        print 'replacing', node.outputs[idx], nwout
                        env.replace(node.outputs[idx],nwout)
                    # we are done ...
# is 30 soon enough !? I want to do it as early as possible .. such that
# the new graph gets optimized
#optdb.register('scanOp_remove_from_graph', ScanRemoveFromGraph() , 30, 'fast_run')
...@@ -3,74 +3,100 @@ from nose.plugins.skip import SkipTest ...@@ -3,74 +3,100 @@ from nose.plugins.skip import SkipTest
import unittest import unittest
import theano import theano
import numpy import numpy
import random import random
import numpy.random import numpy.random
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
def verify_grad(op, pt, n_tests=2, rng=None, eps = None, tol = None,
mode = None, cast_to_output_type = False):
pt = [numpy.array(p) for p in pt]
_type_tol = dict( float32=1e-2, float64=1e-4)
if tol is None:
tol = max(_type_tol[str(p.dtype)] for p in pt)
if rng is None:
rng = numpy.random
utt.seed_rng()
class multiple_outputs_numeric_grad:
    """One-sided finite-difference gradient of a scalar-valued function
    of several inputs.

    Unlike ``theano.tensor.numeric_grad`` this accepts a mask that marks
    which positional arguments are ndarrays that can be perturbed (others,
    e.g. random states, are left untouched).  After construction,
    ``self.gx`` holds one gradient array per perturbable input, and
    ``max_err`` compares them against analytic gradients.
    """

    # finite-difference step size per input dtype
    type_eps = {'float64': 1e-7,
                'float32': 3e-3}

    def __init__(self, f, pt, ndarray_mask=None, eps=None):
        """Compute the gradient of `f` at `pt`.

        This computes the gradient by one-sided finite differences with a
        fixed step size (`eps`).  It is assumed that ``f(*pt)`` returns a
        scalar.

        :param f: callable taking ``len(pt)`` positional arguments and
            returning a scalar.
        :param pt: point (list/tuple of arrays, or a single array) at
            which to evaluate the gradient.
        :param ndarray_mask: list of bools, one per entry of `pt`; True
            marks an ndarray input to differentiate with respect to.
            None means "all entries are ndarrays".
        :param eps: the stepsize for the finite differencing. None means
            input dtype-dependent. See `type_eps`.
        """
        def prod(values):
            # product of an iterable of ints (number of elements of a shape)
            rval = 1
            for v in values:
                rval *= v
            return rval

        if not isinstance(pt, (list, tuple)):
            pt = [pt]
        # was `if not ndarray_mask:` -- that silently discarded an explicit
        # empty or all-False mask; only substitute the default when the
        # caller passed nothing
        if ndarray_mask is None:
            ndarray_mask = [True for x in pt]

        # pick the loosest step size required by any input's dtype
        dtype_eps = multiple_outputs_numeric_grad.type_eps['float64']
        for i, p in enumerate(pt):
            if ndarray_mask[i]:
                pt[i] = numpy.array(p)
                _eps = multiple_outputs_numeric_grad.type_eps[str(pt[i].dtype)]
                if _eps > dtype_eps:
                    dtype_eps = _eps

        # clean (unperturbed) output
        f_x = f(*pt)
        gx = []
        # perturb each element of each maskable input in turn
        for i in xrange(len(pt)):
            if ndarray_mask[i]:
                _eps = eps if eps else dtype_eps
                if pt[i].ndim:
                    _g = []
                    # several dimensions: bump one flattened element at a time
                    for pos in xrange(prod(pt[i].shape)):
                        t = pt[i].copy()
                        t = t.flatten()
                        t[pos] += _eps
                        t = t.reshape(pt[i].shape)
                        f_eps = f(*(pt[:i] + [t] + pt[i + 1:]))
                        _g.append(numpy.asarray((f_eps - f_x) / _eps))
                    gx.append(numpy.asarray(_g).reshape(pt[i].shape))
                else:
                    # 0-d input: a single perturbation suffices
                    t = numpy.array(pt[i] + _eps)
                    f_eps = f(*(pt[:i] + [t] + pt[i + 1:]))
                    gx.append(numpy.asarray((f_eps - f_x) / _eps))
        self.gx = gx

    @staticmethod
    def abs_rel_err(a, b, eps=1.0e-10):
        """Return a small number when a and b are close, relative to how big they are"""
        return abs(a - b) / (abs(a) + abs(b) + eps)

    def max_err(self, g_pt):
        """Return the biggest relative error between g_pt and self.gx,
        together with the index of the offending input.

        :raises ValueError: if `g_pt` has the wrong length or an entry
            has the wrong shape.
        """
        if len(g_pt) != len(self.gx):
            raise ValueError('argument has wrong number of elements', len(g_pt))
        errs = []
        for i, (a, b) in enumerate(zip(g_pt, self.gx)):
            if a.shape != b.shape:
                raise ValueError('argument element %i has wrong shape %s' % (i, str((a.shape,
                    b.shape))))
            errs.append(numpy.max(multiple_outputs_numeric_grad.abs_rel_err(a, b)))
        if numpy.all(numpy.isfinite(errs)):
            return numpy.max(errs), numpy.argmax(errs)
        else:
            # non-finite error somewhere: report an infinitely bad match
            return float('inf'), 0
return f
for test_num in xrange(n_tests):
tensor_pt=[theano.tensor.value(p.copy(),name='input %i'%i)
for i,p in enumerate(pt)]
# op outputs
o_outputs = op(*tensor_pt)
if not (type(o_outputs) in (list,tuple)):
o_outputs = [ o_outputs ]
o_fn = function(tensor_pt, o_outputs)
o_fn_outs = o_fn(*[p.copy() for p in pt])
if not type(o_fn_outs) in (list,tuple):
o_fn_outs = [o_fn_outs]
random_projection = rng.rand(*o_fn_outs[0].shape)
if cast_to_output_type:
random_projection = numpy.array(random_projection,
dtype = o_fn_outs[0].dtype)
t_r = theano.tensor.as_tensor_variable(random_projection)
cost = theano.tensor.sum( t_r * o_outputs[0])
for i, o in enumerate(o_fn_outs[1:] ):
random_projection = rng.rand(*o.shape)
if cast_to_output_type:
random_projection = numpy.array(random_projection,
dtype=o_outputs[i].dtype)
t_r = theano.tensor.as_tensor_variable(random_projection)
cost += theano.tensor.sum( t_r * o_outputs[i])
cost_fn = function(tensor_pt, cost)
num_grad = theano.tensor.numeric_grad(cost_fn,[p.copy() for p in pt],eps)
g_cost = theano.tensor.as_tensor_variable(1.0,name='g_cost')
if cast_to_output_type:
g_cost = cast(g_cost, o_output.dtype)
symbolic_grad = theano.tensor.grad(cost, tensor_pt, g_cost)
grad_fn = function(tensor_pt,symbolic_grad)
analytic_grad = grad_fn(*[p.copy() for p in pt])
if not isinstance(analytic_grad, (list,tuple)):
analytic_grad = [analytic_grad]
max_err, max_err_pos = num_grad.max_err(analytic_grad)
if max_err > tol:
raise Exception(theano.tensor.verify_grad.E_grad,
(max_err, tol, max_err_pos))
#TODO: Test this function, and if it works,
# move it to a more general location.
# Also - add a reference to this technique in the
# verify_grad method so that other ops with multiple outputs can be tested.
def scan_project_sum(*args, **kwargs):
    """Run ``theano.scan(*args, **kwargs)`` and reduce all of its outputs
    to a single scalar cost.

    Each output is multiplied elementwise by a fixed random projection and
    summed; this gives a scalar whose gradient exercises every output of
    scan.  Returns ``(cost, updates)`` where `updates` are scan's updates.
    """
    # fixed seed so the projection is reproducible across calls
    rng = theano.tensor.shared_randomstreams.RandomStreams(123)
    scan_outputs, updates = theano.scan(*args, **kwargs)
    if type(scan_outputs) not in [list, tuple]:
        scan_outputs = [scan_outputs]
    # we should ignore the random-state updates so that
    # the uniform numbers are the same every evaluation and on every call
    rng.add_default_updates = False
    # random projection factors, bounded away from 0 so no output is masked
    factors = [rng.uniform(size=s.shape, low=0.1, high=0.9) for s in scan_outputs]
    return (sum([(s * f).sum() for s, f in zip(scan_outputs, factors)]), updates)
def asarrayX(value):
    """Cast `value` to an ndarray of theano's configured floatX dtype."""
    return theano._asarray(value, dtype=theano.config.floatX)
class T_Scan(unittest.TestCase): class T_Scan(unittest.TestCase):
def setUp(self): def setUp(self):
...@@ -372,7 +404,7 @@ class T_Scan(unittest.TestCase): ...@@ -372,7 +404,7 @@ class T_Scan(unittest.TestCase):
outputs, updates = theano.scan(f_rnn_shared, outputs, updates = theano.scan(f_rnn_shared,
[u0,dict(input = u1, taps = [0,1]),dict( input = u2, taps= [-1,0,+1])], [u0,dict(input = u1, taps = [0,1]),dict( input = u2, taps= [-1,0,+1])],
[dict( initial = x0, inplace =u2), dict(initial = x1, inplace = u1)], [dict( initial = x0, inplace =u2), dict(initial = x1, inplace = u1)],
[], n_steps = None, truncate_gradient = 01, go_backwards = False, mode=mode ) [], n_steps = None, truncate_gradient = -1, go_backwards = False, mode=mode )
f9 = theano.function([mu0,mu1,mu2,x0,x1], outputs , updates = updates, mode = mode) f9 = theano.function([mu0,mu1,mu2,x0,x1], outputs , updates = updates, mode = mode)
# compute output in numpy # compute output in numpy
...@@ -514,12 +546,10 @@ class T_Scan(unittest.TestCase): ...@@ -514,12 +546,10 @@ class T_Scan(unittest.TestCase):
v_vsample = numpy.array(rng.binomial(1,0.5, size=(3,20), ), dtype = 'float32') v_vsample = numpy.array(rng.binomial(1,0.5, size=(3,20), ), dtype = 'float32')
v_bvis = numpy.array(rng.rand(20) -.5, dtype='float32') v_bvis = numpy.array(rng.rand(20) -.5, dtype='float32')
v_bhid = numpy.array(rng.rand(30) -.5, dtype='float32') v_bhid = numpy.array(rng.rand(30) -.5, dtype='float32')
W = theano.shared(v_W) W = theano.shared(v_W)
bhid = theano.shared(v_bhid) bhid = theano.shared(v_bhid)
bvis = theano.shared(v_bvis) bvis = theano.shared(v_bvis)
vsample = theano.tensor.matrix(dtype='float32') vsample = theano.tensor.matrix(dtype='float32')
trng = theano.tensor.shared_randomstreams.RandomStreams(utt.fetch_seed()) trng = theano.tensor.shared_randomstreams.RandomStreams(utt.fetch_seed())
def f(vsample_tm1): def f(vsample_tm1):
...@@ -635,22 +665,264 @@ class T_Scan(unittest.TestCase): ...@@ -635,22 +665,264 @@ class T_Scan(unittest.TestCase):
f = theano.function([v,s], result, updates = updates) f = theano.function([v,s], result, updates = updates)
rng = numpy.random.RandomState(utt.fetch_seed()) rng = numpy.random.RandomState(utt.fetch_seed())
v_v = rng.uniform( size = (5,), low = -5., high = 5.) v_v = rng.uniform( size = (5,), low = -5., high = 5.)
print f(v_v,0.)
assert abs(numpy.sum(v_v) - f(v_v, 0.)) < 1e-3 assert abs(numpy.sum(v_v) - f(v_v, 0.)) < 1e-3
def test_grad_one_output(self):
def f_rnn(u_t,x_tm1,W_in, W):
return u_t*W_in+x_tm1*W
u = theano.tensor.vector('u')
x0 = theano.tensor.scalar('x0')
W_in = theano.tensor.scalar('W_in')
W = theano.tensor.scalar('W')
cost, updates = scan_project_sum(f_rnn, u, x0, [W_in,W], n_steps = None,
truncate_gradient = -1, go_backwards = False)
gu,gx0,gW_in,gW = theano.tensor.grad(cost, [u,x0,W_in, W])
grad_fn = theano.function([u,x0,W_in, W], [gu,gx0,gW_in, gW],
updates = updates, no_default_updates = True)
cost_fn = theano.function([u,x0,W_in, W], cost, updates = updates,
no_default_updates = True)
# get random initial values
rng = numpy.random.RandomState(utt.fetch_seed())
v_u = numpy.array(rng.uniform( size = (300,), low = -.5, high = .5),dtype=theano.config.floatX)
v_x0 = numpy.array(rng.uniform(), dtype= theano.config.floatX)
W = numpy.array(rng.uniform(), dtype= theano.config.floatX)
W_in = numpy.array(rng.uniform(), dtype= theano.config.floatX)
num_grad = multiple_outputs_numeric_grad(cost_fn, [v_u, v_x0, W_in, W])
analytic_grad = grad_fn(v_u, v_x0, W_in, W)
max_err, max_err_pos = num_grad.max_err(analytic_grad)
if max_err > 1e-2:
raise Exception(theano.tensor.verify_grad.E_grad,
(max_err, 1e-2, max_err_pos))
    def test_grad_multiple_outs(self):
        """Gradient check for scan with two outputs (a vector state and a
        scalar readout), mixing shared-variable and explicit parameters."""
        rng = numpy.random.RandomState(utt.fetch_seed())
        # small random parameter / input values keep the finite-difference
        # comparison numerically well behaved
        vW_in2 = asarrayX(rng.uniform(size = (2,), low = -.1,high = .1))
        vW = asarrayX(rng.uniform(size = (2,2), low = -.1,high = .1))
        vWout = asarrayX(rng.uniform(size = (2,), low = -.1,high = .1))
        vW_in1 = asarrayX(rng.uniform(size = (2,2), low = -.1,high = .1))
        v_u1 = asarrayX(rng.uniform(size = (13,2), low = -.1, high = .1))
        v_u2 = asarrayX(rng.uniform(size = (13,), low = -.1,high = .1))
        v_x0 = asarrayX(rng.uniform(size = (2,), low = -.1,high = .1))
        v_y0 = asarrayX(rng.uniform())

        # W_in2 / W / W_out enter the inner function as shared variables;
        # W_in1 is passed explicitly as a non-sequence
        W_in2 = theano.shared(vW_in2, name='win2')
        W = theano.shared(vW, name='w')
        W_out = theano.shared(vWout, name = 'wout')
        W_in1 = theano.tensor.matrix('win')
        u1 = theano.tensor.matrix('u1')
        u2 = theano.tensor.vector('u2')
        x0 = theano.tensor.vector('x0')
        y0 = theano.tensor.scalar('y0')

        def f_rnn_cmpl(u1_t, u2_t, x_tm1, y_tm1, W_in1):
            # outputs: new state x_t and scalar readout y_t (y_tm1 unused)
            return [theano.dot(u1_t,W_in1) + u2_t* W_in2 + \
                    theano.dot(x_tm1, W), theano.dot(x_tm1, W_out)]

        cost, updates = scan_project_sum(f_rnn_cmpl,[u1,u2],[x0,y0],W_in1, n_steps = None,
                truncate_gradient = -1, go_backwards = False)
        vparams = [v_u1, v_u2, v_x0, v_y0,vW_in1]
        params = [u1,u2,x0,y0,W_in1 ]
        gparams = theano.tensor.grad(cost, params)
        grad_fn = theano.function([u1,u2,x0,y0,W_in1], gparams,
                updates = updates, no_default_updates = True)
        cost_fn = theano.function([u1,u2,x0,y0,W_in1], cost,
                updates = updates, no_default_updates = True)
        # compare finite-difference gradients with the analytic ones
        num_grad = multiple_outputs_numeric_grad(cost_fn,[v_u1,v_u2,v_x0,v_y0,vW_in1])
        analytic_grad = grad_fn(v_u1,v_u2, v_x0,v_y0, vW_in1)
        max_err, max_err_pos = num_grad.max_err(analytic_grad)
        if max_err > 1e-2:
            raise Exception(theano.tensor.verify_grad.E_grad,
                    (max_err, 1e-2, max_err_pos))
    def test_grad_multiple_outs_taps(self):
        """Gradient check for scan when a sequence uses past/present/future
        taps ([-1,0,1]) and an output uses several past taps ([-1,-3])."""
        l = 60
        rng = numpy.random.RandomState(utt.fetch_seed())
        vW_in2 = asarrayX(rng.uniform(size = (2,), low = -.2,high = .2))
        vW = asarrayX(rng.uniform(size = (2,2), low = -.2,high = .2))
        vWout = asarrayX(rng.uniform(size = (2,), low = -.2,high = .2))
        vW_in1 = asarrayX(rng.uniform(size = (2,2), low = -.2,high = .2))
        v_u1 = asarrayX(rng.uniform(size = (l,2), low = -.2, high = .2))
        # u2 has two extra rows because it is indexed with taps [-1,0,1]
        v_u2 = asarrayX(rng.uniform(size = (l+2,2), low = -.2,high = .2))
        v_x0 = asarrayX(rng.uniform(size = (2,), low = -.2,high = .2))
        # y uses taps [-1,-3]; presumably 4 initial values cover the
        # furthest tap -- TODO confirm against scan's initial-state contract
        v_y0 = asarrayX(rng.uniform(size = (4,)))

        W_in2 = theano.shared(vW_in2, name='win2')
        W = theano.shared(vW, name='w')
        W_out = theano.shared(vWout, name = 'wout')
        W_in1 = theano.tensor.matrix('win')
        u1 = theano.tensor.matrix('u1')
        u2 = theano.tensor.matrix('u2')
        x0 = theano.tensor.vector('x0')
        y0 = theano.tensor.vector('y0')

        def f_rnn_cmpl(u1_t, u2_tm1, u2_t, u2_tp1, x_tm1, y_tm1, y_tm3, W_in1):
            # inner step receives one slice per tap, in tap order
            return [theano.dot(u1_t,W_in1) + (u2_t+u2_tm1*u2_tp1)* W_in2 + \
                    theano.dot(x_tm1, W), (y_tm1+y_tm3)*theano.dot(x_tm1, W_out)]

        cost, updates = scan_project_sum(f_rnn_cmpl,[u1,
            dict(input=u2,taps=[-1,0,1])],[x0,dict(initial=y0,
                taps=[-1,-3])],W_in1, n_steps = None,
            truncate_gradient = -1, go_backwards = False)
        vparams = [v_u1, v_u2, v_x0, v_y0,vW_in1]
        params = [u1,u2,x0,y0,W_in1 ]
        gparams = theano.tensor.grad(cost, params)
        grad_fn = theano.function([u1,u2,x0,y0,W_in1], gparams,
                updates = updates, no_default_updates = True)
        cost_fn = theano.function([u1,u2,x0,y0,W_in1], cost,
                updates = updates, no_default_updates = True)
        # compare finite-difference gradients with the analytic ones
        num_grad = multiple_outputs_numeric_grad(cost_fn,[v_u1,v_u2,v_x0,v_y0,vW_in1])
        analytic_grad = grad_fn(v_u1,v_u2, v_x0,v_y0, vW_in1)
        max_err, max_err_pos = num_grad.max_err(analytic_grad)
        if max_err > 1e-2:
            raise Exception(theano.tensor.verify_grad.E_grad,
                    (max_err, 1e-2, max_err_pos))
    def test_grad_multiple_outs_taps_backwards(self):
        """Same setup as test_grad_multiple_outs_taps (multiple taps on a
        sequence and an output) but iterating with go_backwards = True."""
        l = 20
        rng = numpy.random.RandomState(utt.fetch_seed())
        vW_in2 = asarrayX(rng.uniform(size = (2,), low = -.2,high = .2))
        vW = asarrayX(rng.uniform(size = (2,2), low = -.2,high = .2))
        vWout = asarrayX(rng.uniform(size = (2,), low = -.2,high = .2))
        vW_in1 = asarrayX(rng.uniform(size = (2,2), low = -.2,high = .2))
        v_u1 = asarrayX(rng.uniform(size = (l,2), low = -.2, high = .2))
        # u2 has two extra rows because it is indexed with taps [-1,0,1]
        v_u2 = asarrayX(rng.uniform(size = (l+2,2), low = -.2,high = .2))
        v_x0 = asarrayX(rng.uniform(size = (2,), low = -.2,high = .2))
        # y uses taps [-1,-3]; presumably 4 initial values cover the
        # furthest tap -- TODO confirm against scan's initial-state contract
        v_y0 = asarrayX(rng.uniform(size = (4,)))

        W_in2 = theano.shared(vW_in2, name='win2')
        W = theano.shared(vW, name='w')
        W_out = theano.shared(vWout, name = 'wout')
        W_in1 = theano.tensor.matrix('win')
        u1 = theano.tensor.matrix('u1')
        u2 = theano.tensor.matrix('u2')
        x0 = theano.tensor.vector('x0')
        y0 = theano.tensor.vector('y0')

        def f_rnn_cmpl(u1_t, u2_tm1, u2_t, u2_tp1, x_tm1, y_tm1, y_tm3, W_in1):
            # inner step receives one slice per tap, in tap order
            return [theano.dot(u1_t,W_in1) + (u2_t+u2_tm1*u2_tp1)* W_in2 + \
                    theano.dot(x_tm1, W), (y_tm1+y_tm3)*theano.dot(x_tm1, W_out)]

        cost, updates = scan_project_sum(f_rnn_cmpl,[u1,
            dict(input=u2,taps=[-1,0,1])],[x0,dict(initial=y0,
                taps=[-1,-3])],W_in1, n_steps = None,
            truncate_gradient = -1, go_backwards = True)
        vparams = [v_u1, v_u2, v_x0, v_y0,vW_in1]
        params = [u1,u2,x0,y0,W_in1 ]
        gparams = theano.tensor.grad(cost, params)
        grad_fn = theano.function([u1,u2,x0,y0,W_in1], gparams,
                updates = updates, no_default_updates = True)
        cost_fn = theano.function([u1,u2,x0,y0,W_in1], cost,
                updates = updates, no_default_updates = True)
        # compare finite-difference gradients with the analytic ones
        num_grad = multiple_outputs_numeric_grad(cost_fn,[v_u1,v_u2,v_x0,v_y0,vW_in1])
        analytic_grad = grad_fn(v_u1,v_u2, v_x0,v_y0, vW_in1)
        max_err, max_err_pos = num_grad.max_err(analytic_grad)
        if max_err > 1e-2:
            raise Exception(theano.tensor.verify_grad.E_grad,
                    (max_err, 1e-2, max_err_pos))
    def test_grad_multiple_outs_some_uncomputable(self):
        """Gradient check when the inner function draws random numbers: the
        random state has no gradient, so only the ndarray inputs are
        compared, and the RNG is re-seeded before every function call so
        finite differences see identical noise."""
        rng = numpy.random.RandomState(utt.fetch_seed())
        vW_in = asarrayX(rng.uniform(size = (2,2), low = -.1,high = .1))
        v_u = asarrayX(rng.uniform(size = (80,2), low = -.1, high = .1))
        v_x0 = asarrayX(rng.uniform(size = (2,), low = -.1,high = .1))
        W_in = theano.tensor.matrix('win')
        u = theano.tensor.matrix('u1')
        x0 = theano.tensor.vector('x0')
        # trng = theano.tensor.shared_randomstreams.RandomStreams(utt.fetch_seed())
        def f_rnn_cmpl(u_t, x_tm1, W_in):
            # fixed seed so every compiled function draws the same noise
            trng1 = theano.tensor.shared_randomstreams.RandomStreams(123)
            x_t = theano.dot(u_t, W_in) + x_tm1 + trng1.uniform(low=-.1, high=.1)
            return x_t
        cost, updates = scan_project_sum(f_rnn_cmpl,u,x0,W_in, n_steps = None,
                truncate_gradient = -1, go_backwards = False)
        vparams = [v_u, v_x0,vW_in]
        params = [u,x0,W_in ]
        gparams = theano.tensor.grad(cost, params)
        grad_fn = theano.function([u,x0,W_in], gparams,
                updates = updates, no_default_updates = True)
        cost_fn = theano.function([u,x0,W_in], cost,
                updates = updates, no_default_updates = True)
        def reset_rng_cost_fn(*args):
            # re-seed every RandomState container of the compiled function so
            # repeated calls (one per finite-difference probe) are identical
            for idx,arg in enumerate(cost_fn.maker.expanded_inputs):
                if arg.value and type(arg.value.data) == type(numpy.random.RandomState(123)):
                    cost_fn.maker.expanded_inputs[idx].value.data = numpy.random.RandomState(123)
            return cost_fn(*args)
        def reset_rng_grad_fn(*args):
            # same re-seeding trick for the gradient function
            for idx,arg in enumerate(grad_fn.maker.expanded_inputs):
                if arg.value and type(arg.value.data)==type(numpy.random.RandomState(123)):
                    grad_fn.maker.expanded_inputs[idx].value.data = numpy.random.RandomState(123)
            return grad_fn(*args)
        # compare finite-difference gradients with the analytic ones
        num_grad = multiple_outputs_numeric_grad(reset_rng_cost_fn,\
                [v_u,v_x0,vW_in] )
        analytic_grad = reset_rng_grad_fn(v_u, v_x0, vW_in)
        max_err, max_err_pos = num_grad.max_err(analytic_grad)
        if max_err > 1e-2:
            raise Exception(theano.tensor.verify_grad.E_grad,
                    (max_err, 1e-2, max_err_pos))
    def test_grad_multiple_outs_some_truncate(self):
        """Exercise truncated BPTT: run scan's gradient with
        truncate_gradient = 40 on an 80-step loop and check the shape of
        the result (truncation makes a numeric comparison meaningless)."""
        rng = numpy.random.RandomState(utt.fetch_seed())
        vW_in = asarrayX(rng.uniform(size = (2,2), low = -.1,high = .1))
        v_u = asarrayX(rng.uniform(size = (80,2), low = -.1, high = .1))
        v_x0 = asarrayX(rng.uniform(size = (2,), low = -.1,high = .1))
        W_in = theano.tensor.matrix('win')
        u = theano.tensor.matrix('u1')
        x0 = theano.tensor.vector('x0')
        # trng = theano.tensor.shared_randomstreams.RandomStreams(utt.fetch_seed())
        def f_rnn_cmpl(u_t, x_tm1, W_in):
            # fixed seed so every compiled function draws the same noise
            trng1 = theano.tensor.shared_randomstreams.RandomStreams(123)
            x_t = theano.dot(u_t, W_in) + x_tm1 + trng1.uniform(low=-.1, high=.1)
            return x_t
        cost, updates = scan_project_sum(f_rnn_cmpl,u,x0,W_in, n_steps = None,
                truncate_gradient = 40, go_backwards = False)
        vparams = [v_u, v_x0,vW_in]
        params = [u,x0,W_in ]
        gparams = theano.tensor.grad(cost, params)
        grad_fn = theano.function([u,x0,W_in], gparams,
                updates = updates, no_default_updates = True)
        cost_fn = theano.function([u,x0,W_in], cost,
                updates = updates, no_default_updates = True)
        def reset_rng_cost_fn(*args):
            # re-seed every RandomState container of the compiled function so
            # repeated calls see identical noise
            for idx,arg in enumerate(cost_fn.maker.expanded_inputs):
                if arg.value and type(arg.value.data) == type(numpy.random.RandomState(123)):
                    cost_fn.maker.expanded_inputs[idx].value.data = numpy.random.RandomState(123)
            return cost_fn(*args)
        def reset_rng_grad_fn(*args):
            # same re-seeding trick for the gradient function
            for idx,arg in enumerate(grad_fn.maker.expanded_inputs):
                if arg.value and type(arg.value.data)==type(numpy.random.RandomState(123)):
                    grad_fn.maker.expanded_inputs[idx].value.data = numpy.random.RandomState(123)
            return grad_fn(*args)
        # the numeric gradient is computed but deliberately not compared:
        # with truncation the analytic gradient differs from the full one
        num_grad = multiple_outputs_numeric_grad(reset_rng_cost_fn,\
                [v_u,v_x0,vW_in] )
        analytic_grad = reset_rng_grad_fn(v_u, v_x0, vW_in)
        # NOTE(review): this asserts that with truncate_gradient=40 the
        # gradient w.r.t. u has only 40 entries -- confirm this is the
        # intended contract of scan's truncated gradient
        assert len(analytic_grad[0]) == 40
'''
 TO TEST:
    - test gradient (one output)
    - test gradient (multiple outputs)
    - test gradient (go_backwards)
    - test gradient (multiple outputs / some uncomputable)
    - test gradient (truncate_gradient)
    - test gradient (taps past/future)
    - optimization !?
'''
# Run the test suite when this file is executed directly.
if __name__ == '__main__':
    unittest.main()
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论