Commit 98213004 authored by James Bergstra

merge w conflict in nvcc_compiler due to rpath thing

"""ProfileStats object for runtime and memory profiling.
"""
#
# TODO: measure memory usage like ProfileMode did
# TODO: put the optimization tips into a tips section??
# TODO: add tip to use specify_shape (is specify_shape even in library doc?)
# TODO: ensure field width for string fields makes columns line up
# TODO: what to do about 'diff summary'? (ask Fred?)
#
__authors__ = "James Bergstra"
__reviewer__ = "Razvan Pascanu"
__copyright__ = "(c) 2011, Universite de Montreal"
__license__ = "3-clause BSD License"
__contact__ = "theano-dev <theano-dev@googlegroups.com>"
__docformat__ = "restructuredtext en"
import atexit
import sys
import theano
config = theano.config
_atexit_print_list = []
_atexit_print_file = sys.stderr
def _atexit_print_fn():
"""Print ProfileStat objects in _atexit_print_list to _atexit_print_file
"""
for ps in _atexit_print_list:
if ps.fct_callcount or ps.compile_time > 0:
ps.summary(file=_atexit_print_file)
else:
print 'Skipping empty Profile'
atexit.register(_atexit_print_fn)
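# Usage sketch (hypothetical numbers, for illustration only): any ProfileStats
# created with atexit_print=True is appended to _atexit_print_list, so its
# summary is printed to stderr when the interpreter exits:
#
#     ps = ProfileStats(message='example')
#     ps.fct_callcount = 1       # pretend one Function.__call__ was recorded
#     ps.fct_call_time = 0.5
#     # at exit, _atexit_print_fn() calls ps.summary(file=sys.stderr)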
class ProfileStats(object):
"""
Object to store runtime and memory profiling information for all of
Theano's operations: compilation, optimization, execution.
"""
#
# Note on implementation:
# Class variables are used here so that each one can be
# documented and initialized together.
    # Dictionary variables are declared as None here and replaced by fresh
    # dicts in __init__.
#
compile_time = 0.0
# Total time spent in body of orig_function,
# dominated by graph optimization and compilation of C
#
fct_call_time = 0.0
# The total time spent in Function.__call__
#
fct_callcount = 0
# Number of calls to Function.__call__
#
vm_call_time = 0.0
# Total time spent in Function.fn.__call__
#
apply_time = None
# dict from node -> float runtime
#
apply_callcount = None
# dict from node -> number of executions
#
apply_cimpl = None
# dict from node -> bool (1 if c, 0 if py)
#
message = None
# pretty string to print in summary, to identify this output
#
outputs_size = None
# node -> size of allocated output
#
optimizer_time = 0.0
# time spent optimizing graph (FunctionMaker.__init__)
linker_time = 0.0
# time spent linking graph (FunctionMaker.create)
def __init__(self, atexit_print=True, **kwargs):
"""
atexit_print - bool. True means that this object will be printed to
stderr (using .summary()) at the end of the program.
**kwargs - misc initializers. These should (but need not) match the
names of the class vars declared in this class.
"""
self.apply_callcount = {}
        self.outputs_size = {}
self.apply_time = {}
self.apply_cimpl = {}
self.__dict__.update(kwargs)
#print >> sys.stderr, "self.message", self.message
if atexit_print:
global _atexit_print_list
_atexit_print_list.append(self)
def op_time(self):
"""dict op -> total time on thunks"""
# timing is stored by node, we compute timing by Op on demand
rval = {}
for node, t in self.apply_time.items():
rval.setdefault(node.op, 0)
rval[node.op] += t
return rval
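    # For intuition (hypothetical numbers): if self.apply_time is
    # {dot_node_1: 0.5, dot_node_2: 0.25, add_node: 0.1} and both dot nodes
    # share one Op instance, op_time() returns {dot: 0.75, add: 0.1}.
    # op_callcount() and op_nodes() below aggregate the same way.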
def op_callcount(self):
"""dict op -> total number of thunk calls"""
# timing is stored by node, we compute timing by Op on demand
rval = {}
for node, count in self.apply_callcount.items():
rval.setdefault(node.op, 0)
rval[node.op] += count
return rval
def op_nodes(self):
"""dict op -> total number of nodes"""
# timing is stored by node, we compute timing by Op on demand
rval = {}
for node, count in self.apply_callcount.items():
rval.setdefault(node.op, 0)
rval[node.op] += 1
return rval
    def op_impl(self):
        """dict op -> 'C ' or 'Py' depending on the implementation"""
        # implementation flag is stored by node; we compute it by Op on demand
rval = {}
for node in self.apply_callcount:
if self.apply_cimpl[node]:
rval[node.op] = 'C '
else:
rval[node.op] = 'Py'
return rval
    def op_flops(self):
        """dict op -> total number of flops (not ported from ProfileMode yet)"""
        rval = {}
        return rval  # TODO: continue here -- port the draft below
        # Draft from ProfileMode.  It only works for Ops that expose a `flops`
        # attribute, and the old code printed a HACK WARNING because the logic
        # does not always work; you need to know the internals of Theano to
        # use it correctly.
        #op_time = self.op_time()
        #op_call = self.op_callcount()
        #for a, t in op_time.items():
        #    if hasattr(a, 'flops'):
        #        rval[a] = a.flops * op_call[a] / t / 1e6  # MFlops/s
        #return rval
def summary_ops(self, file=sys.stderr, N=None):
if self.apply_time:
local_time = sum(self.apply_time.values())
else:
local_time = 0
if local_time == 0:
            print >> file, ('ProfileStats.summary_ops: total time 0'
                    ' (did you forget to enable counters?)')
return
op_time = self.op_time()
op_call = self.op_callcount()
op_apply = self.op_nodes()
op_flops = self.op_flops()
op_impl = self.op_impl()
        if N is None:
            N = len(op_time)
otimes = [(t*100/local_time,
t,
op,
op_impl.get(op, ' '),
op_call.get(op, 0),
op_apply.get(op,0))
for op, t in op_time.items()]
otimes.sort()
otimes.reverse()
tot=0
print >> file, 'Ops'
print >> file, '---'
        print >> file, '<% time> <cumulative %> <self seconds> <cumulative seconds> <time per call> <impl> <nb_call> <nb apply> <Op name>'
for f,t,a,impl,nb_call,nb_apply in otimes[:N]:
if nb_call == 0:
assert t == 0
continue
tot+=t
ftot=tot*100/local_time
if op_flops:
print >>file, ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %s %7.1f %5d %2d %s' % (
f, ftot, t, tot, t/nb_call, impl, op_flops.get(a,-1), nb_call, nb_apply, a)
else:
print >>file, ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %s %5d %2d %s' % (
f, ftot, t, tot, t/nb_call, impl, nb_call, nb_apply, a)
print >>file, ' ... (remaining %i Ops account for %6.2f%%(%.2fs) of the runtime)'\
%(max(0, len(otimes)-N),
sum(f for f, t, a, ci, nb_call, nb_op in otimes[N:]),
sum(t for f, t, a, ci, nb_call, nb_op in otimes[N:]))
print >> file, ''
def summary_nodes(self, file=sys.stderr, N=None):
if self.apply_time:
local_time = sum(self.apply_time.values())
else:
local_time = 0
if local_time == 0:
            print >> file, ('ProfileStats.summary_nodes: total time 0'
                    ' (did you forget to enable counters?)')
return
print >> file, 'Thunks'
print >> file, '------'
        print >> file, '<% time> <cumulative %> <apply time> <cumulative seconds> <time per call> <nb_call> <Apply Op name>'
atimes = [(
t*100/local_time,
t,
a,
self.apply_callcount[a])
for a, t in self.apply_time.items()]
atimes.sort()
atimes.reverse()
tot=0
        for (f, t, a, nb_call) in atimes[:N]:
            if nb_call == 0:
                continue
            tot += t
            ftot = tot*100 / local_time
print >> file, ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %i %s'%(
f, ftot, t, tot, t/nb_call,nb_call, str(a))
print >> file, ' ... (remaining %i Apply instances account for %.2f%%(%.2fs) of the runtime)'\
%(max(0, len(atimes)-N),
sum(f for f, t, a, nb_call in atimes[N:]),
sum(t for f, t, a, nb_call in atimes[N:]))
print >> file, ''
def summary_function(self, file):
print >> file, 'Function profiling'
print >> file, '=================='
print >> file, ' Message: %s'%self.message
print >> file, ' Time in %i calls to Function.__call__: %es' % (
self.fct_callcount, self.fct_call_time)
if self.fct_call_time>0:
print >> file, ' Time in Function.fn.__call__: %es (%.3f%%)' %(
self.vm_call_time, 100*self.vm_call_time / self.fct_call_time)
local_time = sum(self.apply_time.values())
if local_time > 0:
print >> file, ' Time in thunks: %es (%.3f%%)' %(
local_time, 100*local_time / self.fct_call_time)
print >> file, ''
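    # Example of summary_function output (numbers are illustrative):
    #
    #   Function profiling
    #   ==================
    #     Message: None
    #     Time in 10 calls to Function.__call__: 1.000000e-02s
    #     Time in Function.fn.__call__: 9.000000e-03s (90.000%)
    #     Time in thunks: 8.500000e-03s (85.000%)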
def summary(self, file=sys.stderr, n_ops_to_print=20, n_applies_to_print=20):
self.summary_function(file)
local_time = sum(self.apply_time.values())
if local_time > 0:
self.summary_ops(file, n_ops_to_print)
self.summary_nodes(file, n_applies_to_print)
else:
print >> file, " No node time accumulated (hint: try config profiling.time_thunks=1)"
if 0: # old code still to be ported from ProfileMode
def long_print(self, file=sys.stderr, fct_name=None, message=None,
n_apply_to_print=15, n_ops_to_print=20, print_apply=False):
"""
Print a readable summary of the stats.
param: n_apply_to_print the number of apply to print. Default 15.
param: n_ops_to_print the number of ops to print. Default 20.
"""
local_time = sum(self.apply_time.values())
print ''
print 'ProfileMode.long_print()'
print 'name = %s'%fct_name
print 'msg = %s'%message
print '---------------------------'
print ''
print 'Total time spent running thunks: %.3fs'% local_time
sop_time={}
sop_call={}
sop_op = {}
sop_c={} #map each op class to Bool. True iff all applies were done in c.
for a,t in op_time.items():
typ = type(a)
sop_time.setdefault(typ,0)
sop_time[typ]+=t
sop_op.setdefault(typ,0)
sop_op[typ]+=1
sop_c.setdefault(typ,True)
sop_c[typ]=sop_c[typ] and op_cimpl.get(a, False)
sop_call[typ]=sop_call.get(typ,0)+op_call[a]
            print '\nSingle Op-wise summary: <% of local_time spent on this kind of Op> <cumulative %> <self seconds> <cumulative seconds> <time per call> <nb_call> <nb_op> <Op name>'
sotimes = [(t*100/local_time, t, a, sop_c[a], sop_call[a], sop_op[a]) for a, t in sop_time.items()]
sotimes.sort()
sotimes.reverse()
tot=0
for f,t,a,ci, nb_call, nb_op in sotimes[:n_ops_to_print]:
if nb_call == 0:
assert t == 0
continue
tot+=t
ftot=tot*100/local_time
if ci:
msg = '*'
else:
msg = ' '
print ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %s %5d %2d %s' % (f, ftot, t, tot, t/nb_call, msg, nb_call, nb_op, a)
print ' ... (remaining %i Ops account for %.2f%%(%.2fs) of the runtime)'\
%(max(0, len(sotimes)-n_ops_to_print),
sum(f for f, t, a, ci, nb_call, nb_op in sotimes[n_ops_to_print:]),
sum(t for f, t, a, ci, nb_call, nb_op in sotimes[n_ops_to_print:]))
total_time = time.time() - import_time
total_fct_time = sum(fct_call_time.values())
total_fct_call = sum(fct_call.values())
other_time = total_time - total_fct_time - compile_time
print
print 'Theano fct summary: <% total fct time> <total time> <time per call> <nb call> <fct name>'
for key in fct_call.keys():
if fct_call[key]>0:
print ' %4.1f%% %.3fs %.2es %d %s'%(fct_call_time[key]/total_fct_time*100 ,fct_call_time[key],
fct_call_time[key]/fct_call[key], fct_call[key], key.name)
else:
print ' NOT CALLED',key.name
if total_fct_time>0:
time_pr_in_fct=local_time/total_fct_time*100
time_per_call=total_fct_time/total_fct_call
else:
time_pr_in_fct=0
time_per_call=0
print
print 'Time since import %.3fs'%(total_time)
print 'Compile time: %.3fs %.1f%%'%(compile_time, compile_time/total_time*100)
print 'Theano fct call %.3fs %.1f%%'%(total_fct_time,total_fct_time/total_time*100)
print ' Theano Op time (included in fct call, Time spent running thunks) %.3fs %.1f%%(of total) %.1f%%(of fct call)'% (local_time,local_time/total_time*100, time_pr_in_fct)
print 'Other time since import %.3fs %.1f%%'%(other_time,other_time/total_time*100)
print '%i Theano fct call, %.3fs per call'%(total_fct_call, time_per_call)
print
print "List of apply that don't have float64 as input but have float64 in outputs. Usefull to know if we forgot some cast when using floatX=float32 or gpu code."
print '<Apply> <Apply position> <fct name> <inputs type> <outputs type>'
for fct in fct_call.keys():
for idx, node in enumerate(fct.maker.env.toposort()):
if any(hasattr(i,'dtype') and i.dtype=='float64' for i in node.outputs) and not any(hasattr(i,'dtype') and i.dtype=='float64' for i in node.inputs):
print str(node), idx, fct.name, str([getattr(i,'dtype',None) for i in node.inputs]),str([getattr(i,'dtype',None) for i in node.outputs])
if any([x[2].__name__.startswith("Gpu") for x in sotimes]):
cpu=[]
gpu=[]
trans=[]
for so in sotimes:
if so[2].__name__ in ["HostFromGpu", "GpuFromHost"]:
trans.append(so)
elif so[2].__name__.startswith("Gpu"):
gpu.append(so)
else:
cpu.append(so)
sum_cpu=sum(so[1] for so in cpu)
sum_gpu=sum(so[1] for so in gpu)
sum_trans=sum(so[1] for so in trans)
print
print "Spent %.3fs(%.3f%%) in cpu Op, %.3fs(%.3f%%) in gpu Op and %.3fs(%.3f%%) transfert Op"%(
sum_cpu, sum_cpu/local_time*100, sum_gpu, sum_gpu/local_time*100, sum_trans, sum_trans/local_time*100)
print "Theano function input that are float64"
print "<fct name> <input name> <input type> <str input>"
for fct in fct_call.keys():
for i in fct.input_storage:
if hasattr(i.type, 'dtype') and i.type.dtype=='float64':
print fct.name, i.name, i.type, i
if outputs_size:
fct_memory={}#env->dict(node->(outputs size))
var_mem = {}
for node,val in outputs_size.items():
fct_memory.setdefault(node.env,{})
fct_memory[node.env][node]=val
for out,v in zip(node.outputs,val):
var_mem[out]=v
print
print "Profile of Theano functions memory:"
for env,nodes_mem in fct_memory.iteritems():
print "Theano fct:", [fct for fct in fct_call.keys() if fct.maker.env is env][0].name
size_sum=sum([sum(val) for key,val in nodes_mem.iteritems()])
print " Max without gc, inplace and view (KB)",size_sum/1024
node_memory_size = 0
node_memory_saved_by_view = 0
node_memory_saved_by_inplace = 0
running_memory_size = 0
running_max_memory_size = 0
post_thunk_old_storage = []
items = nodes_mem.items()
items.sort(key=lambda a: a[1])
items.reverse()
order = env.toposort()
computed, last_user = gc_helper(order)
for node in order:
post_thunk_old_storage.append([ input_idx
for input_idx,input in enumerate(node.inputs)
if (input in computed) and (input not in env.outputs) and node == last_user[input]])
for node,val in items[:n_apply_to_print]:
dmap = getattr(node.op,'destroy_map',None)
vmap = getattr(node.op,'view_map',None)
for idx,v in enumerate(val):
if dmap and idx in dmap:#TODO check the op returned a view
node_memory_saved_by_inplace += v
elif vmap and idx in vmap:#TODO check the op returned a view
node_memory_saved_by_view += v
else:
node_memory_size += v
running_memory_size += v
if running_memory_size > running_max_memory_size:
running_max_memory_size = running_memory_size
old_storage = post_thunk_old_storage[order.index(node)]
for old_s in old_storage:
running_memory_size -= var_mem[node.inputs[old_s]]
pass
pass
print " Max FAST_RUN_NO_GC (KB)", node_memory_size/1024
print " Max FAST_RUN (KB)", running_max_memory_size/1024
print " Memory saved by view (KB)", node_memory_saved_by_view/1024
print " Memory saved by inplace (KB)", node_memory_saved_by_inplace/1024
print " Memory saved by GC (KB)", (node_memory_size-running_max_memory_size)/1024
n_apply_to_print+=10#TODO remove this line
print " <Sum apply outputs (bytes)> <Apply outputs memory size(bytes)> <created/inplace/view> <Apply node>"
print " <created/inplace/view> is taked from the op declaration, not the op exeuction. Use DebugMode to have warning about inplace/view declaration being respected."
for key,val in items[:n_apply_to_print]:
                    code = ['c']*len(key.outputs)
for out,inp in getattr(key.op,'destroy_map',{}).iteritems():
code[out] = "i"
for out,inp in getattr(key.op,'view_map',{}).iteritems():
code[out] = "v"
print ' %9dB %s %s %s' % (sum(val), str(val), ' '.join(code), key)
                print '   ... (remaining %i Apply nodes account for %d bytes, %.2f%% of the total)'\
                    %(max(0, len(nodes_mem)-n_apply_to_print),
                      sum(sum(val) for key, val in items[n_apply_to_print:]),
                      100*sum(sum(val) for key, val in items[n_apply_to_print:])/size_sum)
print
print "Here are tips to potentially make your code run faster (if you think of new ones, suggest them on the mailing list). Test them first as they are not guaranteed to always provide a speedup."
from theano import tensor as T
from theano.tensor.raw_random import RandomFunction
import theano
import theano.scalar as scal
scalar_op_amdlibm_no_speed_up = [scal.LT, scal.GT, scal.LE, scal.GE, scal.EQ, scal.NEQ, scal.InRange, scal.Switch, scal.OR, scal.XOR, scal.AND, scal.Invert, scal.Maximum, scal.Minimum, scal.Add, scal.Mul, scal.Sub, scal.TrueDiv, scal.IntDiv, scal.Clip, scal.First, scal.Second, scal.Identity, scal.Cast, scal.Sgn, scal.Neg, scal.Inv, scal.Sqr ]
scalar_op_amdlibm_speed_up = [scal.Mod, scal.Pow, scal.Ceil, scal.Floor, scal.RoundHalfToEven, scal.RoundHalfAwayFromZero, scal.Log, scal.Log2, scal.Log10, scal.Log1p, scal.Exp, scal.Sqrt, scal.Abs, scal.Cos, scal.Sin, scal.Tan, scal.Tanh, scal.Cosh, scal.Sinh, T.nnet.sigm.ScalarSigmoid, T.nnet.sigm.ScalarSoftplus ]#Abs, Mod in float{32,64} only
def get_scalar_ops(s):
if isinstance(s, theano.scalar.Composite):
l = []
for node in s.env.toposort():
l+=get_scalar_ops(node.op)
return l
else: return [s]
def list_scalar_op(op):
if isinstance(op.scalar_op, theano.scalar.Composite):
return get_scalar_ops(op.scalar_op)
else: return [op.scalar_op]
def amdlibm_speed_up(op):
if not isinstance(op, T.Elemwise):
return False
else:
l = list_scalar_op(op)
for s_op in l:
if s_op.__class__ in scalar_op_amdlibm_speed_up:
return True
                        elif s_op.__class__ not in scalar_op_amdlibm_no_speed_up:
                            print "We don't know if amdlibm will accelerate this scalar op.", s_op
                    return False
def exp_float32_op(op):
if not isinstance(op, T.Elemwise):
return False
else:
l = list_scalar_op(op)
return any([s_op.__class__ in [scal.Exp] for s_op in l])
#tip 1
if config.floatX=='float64':
print " - Try the Theano flag floatX=float32"
#tip 2
            if not config.lib.amdlibm and any([amdlibm_speed_up(a.op) for a in apply_time]):
                print "  - Try installing amdlibm and set the Theano flag lib.amdlibm=True. This speeds up only some Elemwise operations."
#tip 3
            if not config.lib.amdlibm and any([exp_float32_op(a.op) and a.inputs[0].dtype == 'float32' for a in apply_time]):
                print "  - With the default gcc libm, exp in float32 is slower than in float64! Try the Theano flag floatX=float64, or install amdlibm and set the Theano flag lib.amdlibm=True"
#tip 4
for a, t in apply_time.iteritems():
node = a
if isinstance(node.op, T.Dot) and all([ len(i.type.broadcastable)==2 for i in node.inputs]):
print " - You have a dot operation that was not optimized to dot22 that is faster. Make sure the inputs are float32 or 64 and are the same for both input. Currently they are:",[i.type for i in node.inputs]
#tip 5
for a, t in apply_time.iteritems():
node = a
if isinstance(node.op, RandomFunction):
print " - Replace the default random number generator by 'from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams' as this is is faster. It is still experimental, but seam to work correctly."
if config.device.startswith("gpu"):
print " - MRG_RandomStreams is the only random number supported on the GPU."
break
def print_summary(self,
n_apply_to_print=config.ProfileMode.n_apply_to_print,
n_ops_to_print=config.ProfileMode.n_ops_to_print):
""" Print 3 summary that show where the time is spend. The first show an Apply-wise summary, the second show an Op-wise summary, the third show an type-Op-wise summary.
The Apply-wise summary print the timing information for the worst offending Apply nodes. This corresponds to individual Op applications within your graph which take the longest to execute (so if you use dot twice, you will see two entries there).
The Op-wise summary print the execution time of all Apply nodes executing the same Op are grouped together and the total execution time per Op is shown (so if you use dot twice, you will see only one entry there corresponding to the sum of the time spent in each of them). If two Op have different hash value, they will be separate.
The type-Op-wise summary group the result by type of op. So event if two Op have different hash value, they will be merged.
Their is an hack with the Op-wise summary. Go see it if you want to know more.
:param n_apply_to_print: the number of apply to print. Default 15, or n_ops_to_print flag.
:param n_ops_to_print: the number of ops to print. Default 20, or n_apply_to_print flag.
"""
fct_call_time = self.mode.fct_call_time
fct_call = self.mode.fct_call
apply_time = self.apply_time
op_cimpl = self.op_cimpl
message = self.message
outputs_size = self.outputs_size
self.print_summary_("print_summary",
None,
None,
None,
apply_time,
op_cimpl,
message,
outputs_size,
n_apply_to_print,
n_ops_to_print)
def print_diff_summary(self, other, n_apply_to_print=15, n_ops_to_print=20):
""" As print_summary, but print the difference on two different profile mode.
TODO: Also we don't print the Apply-wise summary as it don't work for now.
TODO: make comparaison with gpu code.
:param other: the other instance of ProfileMode that we want to be compared to.
:param n_apply_to_print: the number of apply to print. Default 15.
:param n_ops_to_print: the number of ops to print. Default 20.
"""
def diff_dict(a_time,b_time_):
r = {}
b_time = copy.copy(b_time_)
for a,ta in a_time.items():
r.setdefault(a,0)
tb = b_time.pop(a,0)
r[a]+=ta-tb
#they are missing in a
for a,t in b_time.items():
r.setdefault(a,0)
r[a]+=t
return r
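        # A tiny worked example of diff_dict (hypothetical values):
        #     diff_dict({'a': 3.0, 'b': 1.0}, {'a': 1.0, 'c': 2.0})
        #     -> {'a': 2.0, 'b': 1.0, 'c': 2.0}
        # Keys present in either dict appear in the result; note that, as
        # written, keys present only in b_time_ keep a positive sign.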
compile_time = self.compile_time-other.compile_time
fct_call_time = diff_dict(self.fct_call_time,other.fct_call_time)
fct_call = diff_dict(self.fct_call,other.fct_call)
apply_time = diff_dict(self.apply_time, other.apply_time)
op_cimpl = self.op_cimpl and other.op_cimpl
message = self.message
outputs_size = diff_dict(self.outputs_size,other.outputs_size)
self.print_summary_("print_diff_summary", compile_time, fct_call_time, fct_call,
apply_time, op_cimpl, message, outputs_size,
n_apply_to_print=n_apply_to_print,
n_ops_to_print=n_ops_to_print, print_apply=False)
"""
Test compilation modes
"""
from nose.plugins.skip import SkipTest
import unittest
import theano
from theano import tensor as T  # T is used below but was not imported
import numpy
import random
import numpy.random
from theano.tests import unittest_tools as utt
class T_bunch_of_modes(unittest.TestCase):
def test1(self):
# this is a quick test after the LazyLinker branch merge
# to check that all the current modes can still be used.
linker_classes_involved = []
for modename in theano.config.__class__.__dict__['mode'].all:
x = T.matrix()
y = T.vector()
f = theano.function([x,y], x+y, mode=modename)
# test that it runs something
f([[1,2],[3,4]], [5, 6])
linker_classes_involved.append(f.maker.mode.linker.__class__)
print 'MODE:', modename, f.maker.mode.linker, 'stop'
# regression check:
# there should be
# - VM_Linker
# - OpWiseCLinker (FAST_RUN)
# - WrapLinker (PROFILE_MODE)
# - PerformLinker (FAST_COMPILE)
# - DebugMode's Linker (DEBUG_MODE)
assert 5 == len(set(linker_classes_involved))
if __name__ == '__main__':
unittest.main()
@@ -146,7 +146,7 @@ from link import \
     Container, Linker, LocalLinker, PerformLinker, WrapLinker, WrapLinkerMany
 from op import \
-    Op
+    Op, PureOp
 from opt import (Optimizer, optimizer, SeqOptimizer,
         MergeOptimizer, MergeOptMerge,
...
@@ -13,7 +13,7 @@ AddConfigVar('nvcc.compiler_bindir',
         "If defined, nvcc compiler driver will seek g++ and gcc in this directory",
         StrParam(""))
-AddConfigVar('cuda.nvccflags',
+AddConfigVar('nvcc.flags',
         "Extra compiler flags for nvcc",
         StrParam(""))
@@ -183,11 +183,9 @@ def nvcc_module_compile_str(
     if sys.platform != 'darwin':
         # the 64bit CUDA libs are in the same files as are named by the function above
         rpaths.append(os.path.join(config.cuda.root, 'lib64'))
     for rpath in rpaths:
         cmd.extend(['-Xlinker', ','.join(['-rpath', rpath])])
-    nvccflags = [flag for flag in config.cuda.nvccflags.split(' ') if flag]
-    cmd.extend(nvccflags)
+    cmd.extend([flag for flag in config.nvcc.flags.split(' ') if flag])
     cmd.extend('-I%s' % idir for idir in include_dirs)
     cmd.extend(['-o', lib_filename])
     cmd.append(os.path.split(cppfilename)[-1])
...
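(The renamed config variable above is how users pass extra nvcc options. A
minimal sketch, assuming the usual THEANO_FLAGS mechanism of this era; the
architecture value is illustrative:)

    # from the shell, before running a GPU script:
    #     THEANO_FLAGS='nvcc.flags=-arch=sm_13' python my_script.py
    # or from Python, before any GPU module is compiled:
    import theano
    theano.config.nvcc.flags = '-arch=sm_13'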
@@ -133,6 +133,79 @@ def sp_ones_like(x):
     data, indices, indptr, shape = csm_properties(x) #TODO: don't restrict to CSM formats
     return CSM(format=x.format)(tensor.ones_like(data), indices, indptr, shape)
+class _sparse_py_operators:
+    T = property(lambda self: transpose(self), doc="Return aliased transpose of self (read-only)")
+    def __neg__(self): return neg(self)
+    def __add__(left, right): return add(left, right)
+    def __radd__(right, left): return add(left, right)
+    def __sub__(left, right): return sub(left, right)
+    def __rsub__(right, left): return sub(left, right)
+    def __mul__(left, right): return mul(left, right)
+    def __rmul__(left, right): return mul(left, right)
+
+    #extra pseudo-operator symbols
+    def __dot__(left, right): return structured_dot(left, right)
+    def __rdot__(right, left): return structured_dot(left, right)
+
+    #N.B. THIS IS COMMENTED OUT ON PURPOSE!!!
+    # Discussion with Fred & James (at least, and maybe others before)
+    # we decided that casting from a sparse to dense should be explicit
+    # because it's usually something you want to be pretty careful about,
+    # and not to do by accident.
+    #def _as_TensorVariable(self):
+    #    return dense_from_sparse(self)
+
+    shape = property(lambda self: tensor.shape(dense_from_sparse(self)))  # don't worry!
+    # ... the plan is that the ShapeFeature in tensor.opt will do shape propagation
+    # ... and remove the dense_from_sparse from the graph.  This will *NOT* actually expand
+    # ... your sparse matrix just to get the shape.
+    ndim = property(lambda self: self.type.ndim)
+    dtype = property(lambda self: self.type.dtype)
+
+class SparseVariable(gof.Variable, _sparse_py_operators):
+    dtype = property(lambda self: self.type.dtype)
+    format = property(lambda self: self.type.format)
+    def __str__(self):
+        return '%s{%s,%s}'%(
+                self.__class__.__name__,
+                self.format,
+                self.dtype)
+    def __repr__(self):
+        return str(self)
+
+class SparseConstantSignature(tuple):
+    def __eq__(self, other):
+        (a, b), (x, y) = self, other
+        return a == x \
+                and (b.dtype == y.dtype)\
+                and (type(b) == type(y))\
+                and (b.shape == y.shape)\
+                and (abs(b-y).sum() < 1e-6 * b.nnz)
+    def __hash__(self):
+        (a, b) = self
+        return hash(type(self)) ^ hash(a) ^ hash(type(b))
+
+class SparseConstant(gof.Constant, _sparse_py_operators):
+    dtype = property(lambda self: self.type.dtype)
+    format = property(lambda self: self.type.format)
+    def signature(self):
+        assert self.data is not None
+        return SparseConstantSignature((self.type, self.data))
+    def __str__(self):
+        return '%s{%s,%s,shape=%s,nnz=%s}'%(
+                self.__class__.__name__,
+                self.format,
+                self.dtype,
+                self.data.shape,
+                self.data.nnz)
+    def __repr__(self):
+        return str(self)
+
+class SparseValue(gof.Value, _sparse_py_operators):
+    dtype = property(lambda self: self.type.dtype)
+    format = property(lambda self: self.type.format)
+
 class SparseType(gof.Type):
     """
@@ -149,6 +222,9 @@ class SparseType(gof.Type):
     dtype_set = set(['int', 'int8', 'int16', 'int32', 'int64', 'float32', 'float64', 'complex64', 'complex128'])
     ndim = 2
+    Variable = SparseVariable
+    Constant = SparseConstant
+
     def __init__(self, format, dtype):
         """
         Fundamental way to create a sparse node.
@@ -248,65 +324,6 @@ csr_dmatrix = SparseType(format='csr', dtype='float64')
 csc_fmatrix = SparseType(format='csc', dtype='float32')
 csr_fmatrix = SparseType(format='csr', dtype='float32')
-class _sparse_py_operators:
-    T = property(lambda self: transpose(self), doc="Return aliased transpose of self (read-only)")
-    def __neg__(self): return neg(self)
-    def __add__(left, right): return add(left, right)
-    def __radd__(right, left): return add(left, right)
-    def __sub__(left, right): return sub(left, right)
-    def __rsub__(right, left): return sub(left, right)
-    def __mul__(left, right): return mul(left, right)
-    def __rmul__(left, right): return mul(left, right)
-
-    #extra pseudo-operator symbols
-    def __dot__(left, right): return structured_dot(left, right)
-    def __rdot__(right, left): return structured_dot(left, right)
-
-    #N.B. THIS IS COMMENTED OUT ON PURPOSE!!!
-    # Discussion with Fred & James (at least, and maybe others before)
-    # we decided that casting from a sparse to dense should be explicit
-    # because it's usually something you want to be pretty careful about,
-    # and not to do by accident.
-    #def _as_TensorVariable(self):
-    #    return dense_from_sparse(self)
-
-    shape = property(lambda self: tensor.shape(dense_from_sparse(self)))  # don't worry!
-    # ... the plan is that the ShapeFeature in tensor.opt will do shape propagation
-    # ... and remove the dense_from_sparse from the graph.  This will *NOT* actually expand
-    # ... your sparse matrix just to get the shape.
-    ndim = property(lambda self: self.type.ndim)
-    dtype = property(lambda self: self.type.dtype)
-
-class SparseVariable(gof.Variable, _sparse_py_operators):
-    dtype = property(lambda self: self.type.dtype)
-    format = property(lambda self: self.type.format)
-
-class SparseConstantSignature(tuple):
-    def __eq__(self, other):
-        (a, b), (x, y) = self, other
-        return a == x \
-                and (b.dtype == y.dtype)\
-                and (type(b) == type(y))\
-                and (b.shape == y.shape)\
-                and (abs(b-y).sum() < 1e-6 * b.nnz)
-    def __hash__(self):
-        (a, b) = self
-        return hash(type(self)) ^ hash(a) ^ hash(type(b))
-
-class SparseConstant(gof.Constant, _sparse_py_operators):
-    dtype = property(lambda self: self.type.dtype)
-    format = property(lambda self: self.type.format)
-    def signature(self):
-        assert self.data is not None
-        return SparseConstantSignature((self.type, self.data))
-
-class SparseValue(gof.Value, _sparse_py_operators):
-    dtype = property(lambda self: self.type.dtype)
-    format = property(lambda self: self.type.format)
-
 # CONSTRUCTION
 class CSMProperties(gof.Op):
     """Extract all of .data .indices and .indptr"""
...
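(With the Variable/Constant class attributes registered above, instantiating
a SparseType yields a SparseVariable carrying the _sparse_py_operators mixin.
A minimal sketch of what this enables; the operator support and import path
are taken from the diff, the rest is illustrative:)

    from theano import sparse
    x = sparse.SparseType(format='csc', dtype='float64')()  # a SparseVariable
    y = -x + x          # __neg__/__add__ come from _sparse_py_operators
    print x.format, x.ndim, x.dtype    # csc 2 float64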
@@ -937,6 +937,9 @@ def _gemm_from_node2(node):
     lst = _factor_canonicalized(lst)
     rval = _gemm_from_factored_list(lst)
     #print "RVAL", rval
+    # THIS GOT COMMENTED OUT AT SOME POINT - ASK P.Lamblin maybe why?
+    #if rval:
+    #    assert rval[0].type == node.outputs[0].type, (rval[0].type, node.outputs[0].type)
     if rval and (rval[0].type == node.outputs[0].type):
         return rval
...
@@ -3057,30 +3057,33 @@ def constant_folding(node):
     for input in node.inputs:
         if not isinstance(input, Constant):
             return False
-    try:
-        storage = [[None] for output in node.outputs]
-        node.op.perform(node, [x.data for x in node.inputs], storage)
-    except MethodNotDefined:
-        tmp_inputs = [x.type() for x in node.inputs]
-        f = compile.function(
-                inputs=tmp_inputs,
-                outputs=node.op.make_node(*tmp_inputs).outputs,
-                mode=compile.Mode(linker='c|py', optimizer=None))
-        xvals = f(*[x.data for x in node.inputs])
-        storage = [[xv] for xv in xvals]
-    msg = []
-    assert len(storage) == len(node.outputs)
-    for s, output in zip(storage, node.outputs):
+    #condition: all inputs are constant
+
+    storage_map = dict([(i, [i.data]) for i in node.inputs])
+    compute_map = dict([(i, [True]) for i in node.inputs])
+    for o in node.outputs:
+        storage_map[o] = [None]
+        compute_map[o] = [False]
+
+    thunk = node.op.make_thunk(node, storage_map, compute_map,
+            no_recycling=[])
+
+    required = thunk()
+    assert not required  # a node whose inputs are all provided should always
+                         # return successfully
+
+    rval = []
+    for output in node.outputs:
+        assert compute_map[output][0], (output, storage_map[output][0])
         try:
             constant = output.type.Constant
-        except:
+        except AttributeError:
             constant = Constant
-        msg += [constant(output.type, s[0])]
-    return msg
+        rval.append(constant(output.type, storage_map[output][0]))
+    return rval
 register_canonicalize(constant_folding, 'fast_compile')
-register_stabilize(constant_folding) # because
+register_stabilize(constant_folding)
 register_specialize(constant_folding)
 def _is_1(expr):
...
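(For intuition, the optimization above returns Constant outputs for any node
whose inputs are all Constants. A minimal sketch of the visible effect,
assuming the tensor API of this era; values are illustrative:)

    import numpy, theano
    from theano import tensor
    x = tensor.constant(numpy.asarray([1.0, 2.0]))
    f = theano.function([], x * 2 + 1)
    # with constant_folding registered, the compiled graph is just a
    # precomputed constant, so no multiply/add runs at call time
    print f()    # [ 3.  5.]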
@@ -49,11 +49,14 @@ class T_random_function(unittest.TestCase):
         rng_R = random_state_type()
         # use make_node to override some of the self.args
-        post_r2, out2 = rf2(rng_R, (4,), -2, 2)
-        post_r2_4, out2_4 = rf2(rng_R, (4,), -4.0, 2)
-        post_r2_4_4, out2_4_4 = rf2(rng_R, (4,), -4.0, 4.0)
-        post_r4, out4 = rf4(rng_R, (4,), -4, 4)
-        # configure out4 to be computed inplace
+        post_r2, out2 = rf2(rng_R, (4,), -2, 2)              # NOT INPLACE
+        post_r4, out4 = rf4(rng_R, (4,), -4, 4)              # INPLACE
+        post_r2_4, out2_4 = rf2(rng_R, (4,), -4.0, 2)        # NOT INPLACE
+        post_r2_4_4, out2_4_4 = rf2(rng_R, (4,), -4.0, 4.0)  # NOT INPLACE
+
+        # The update expression means that the random state rng_R will
+        # be maintained by post_r4
         f = compile.function(
                 [compile.In(rng_R,
                     value=numpy.random.RandomState(utt.fetch_seed()),
@@ -65,9 +68,25 @@ class T_random_function(unittest.TestCase):
         f2, f4, f2_4, f2_4_4 = f()
         f2b, f4b, f2_4b, f2_4_4b = f()
-        assert numpy.allclose(f2*2, f4)
-        assert numpy.allclose(f2_4_4, f4)
-        assert not numpy.allclose(f4, f4b)
+        print f2
+        print f4
+        print f2_4
+        print f2_4_4
+
+        #print f2b
+        #print f4b
+        #print f2_4b
+        #print f2_4_4b
+
+        # setting bounds is same as multiplying by 2
+        assert numpy.allclose(f2*2, f4), (f2, f4)
+
+        # retrieving from non-inplace generator
+        # is same as inplace one for first call
+        assert numpy.allclose(f2_4_4, f4), (f2_4_4, f4)
+
+        # f4 changes from call to call, that the update has worked
+        assert not numpy.allclose(f4, f4b), (f4, f4b)
     def test_inplace_optimization(self):
         """Test that FAST_RUN includes the random_make_inplace optimization"""
...
@@ -13,19 +13,32 @@ from theano.tests import unittest_tools as utt
     should ensure that it will remain operational
     '''
-class T_diverse(unittest.TestCase):
+class T_scipy(unittest.TestCase):
     def setUp(self):
         utt.seed_rng()
+        self.orig_floatX = theano.config.floatX
+
+    def tearDown(self):
+        theano.config.floatX = self.orig_floatX
-    def scipy_paper_example1(self):
+    def test_scipy_paper_example1(self):
         a = theano.tensor.vector('a')   # declare variable
         b = a + a**10                   # build expression
         f = theano.function([a], b)    # compile function
         assert numpy.all(f([0,1,2]) == numpy.array([0,2,1026]))
-    def scipy_papaer_example2(self):
+    def test_scipy_paper_example2(self):
         ''' This just sees if things compile well and if they run '''
+        # PREAMBLE
+        T = theano.tensor
+        shared = theano.shared
+        function = theano.function
+        rng = numpy.random
+        theano.config.floatX = 'float64'
+
+        #
+        # ACTUAL SCRIPT FROM PAPER
+        #
         x = T.matrix()
         y = T.vector()
         w = shared(rng.randn(100))
@@ -52,6 +65,7 @@ class T_diverse(unittest.TestCase):
         for i in range(training_steps):
             pred, err = train(D[0], D[1])
 if __name__ == '__main__':
     unittest.main()