提交 941ea07c authored 作者: lamblin's avatar lamblin

Merge pull request #511 from nouiz/pep8

Pep8
...@@ -28,6 +28,14 @@ New Features ...@@ -28,6 +28,14 @@ New Features
anything below an intermediate variable that has a name. Defaults to False. anything below an intermediate variable that has a name. Defaults to False.
* debugprint does not print anymore the "|" symbol in a column after the last input. * debugprint does not print anymore the "|" symbol in a column after the last input.
Sparse Sandbox Addition (Not reviewed/documented/tested, but used by some people)
* They are all in the theano.sparse.sandbox.sp2 module
* Op class: Cast, Poisson, Multinomial, EliminateZeros, Sum, Binomial
* Op class: SamplingDot, SamplingDotCsr(inserted automatically)
* Op function: structured_sigmoid, structured_exp, structured_pow, structured_minimum,
* Op class: StructuredAddSV, StrucutedAddSVCSR(inserted automatically)
* opt: local_sampling_dot_csr, local_structured_add_s_v
Internal changes Internal changes
* Define new exceptions MissingInputError and UnusedInputError, and use them * Define new exceptions MissingInputError and UnusedInputError, and use them
in theano.function, instead of TypeError and ValueError. (Pascal L.) in theano.function, instead of TypeError and ValueError. (Pascal L.)
......
...@@ -317,7 +317,48 @@ bindings to work only on Python files. ...@@ -317,7 +317,48 @@ bindings to work only on Python files.
Emacs Emacs
~~~~~ ~~~~~
WRITEME There is an **excellent** system to configure emacs for python:
`emacs-for-python
<https://github.com/gabrielelanaro/emacs-for-python>`_. It gathers many
emacs configs into one and modifies them to behave together nicely. You
can use it to check for pep8 compliance and for python syntax errors.
To install it on Linux, you can do it like this:
.. code-block:: bash
cd
git clone https://github.com/gabrielelanaro/emacs-for-python.git .emacs.d/emacs-for-python
Then in your ``~/.emacs`` file, add this:
.. code-block:: bash
;; Mandatory
(load-file "~/.emacs.d/emacs-for-python/epy-init.el")
(add-to-list 'load-path "~/.emacs.d/emacs-for-python/") ;; tell where to load the various files
;; Each of them enable different part of the system
;; only the 2 first are needed for pep8, syntax check.
(require 'epy-setup) ;; It will setup other loads, it is required!
(require 'epy-python) ;; If you want the python facilities [optional]
(require 'epy-completion) ;; If you want the autocompletion settings [optional]
(require 'epy-editing) ;; For configurations related to editing [optional]
;; define f10 to previous error
;; define f11 to next error
(require 'epy-bindings) ;; For my suggested keybindings [optional]
;; some shortcut that don't collide with gnome-terminal
;; otherwise, "epy-bindings" define f10 and f11 for them.
(global-set-key [f2] 'flymake-goto-prev-error)
(global-set-key [f3] 'flymake-goto-next-error)
;; next two lines are the checks to do. You can add more if you wish
(epy-setup-checker "pyflakes %f") ;; for python syntax check
(epy-setup-checker "pep8 %f") ;; for pep8 check
Unit tests Unit tests
......
...@@ -22,7 +22,7 @@ import time ...@@ -22,7 +22,7 @@ import time
import numpy import numpy
import theano import theano
from theano.configparser import AddConfigVar, StrParam, BoolParam from theano.configparser import AddConfigVar, BoolParam
import_time = time.time() import_time = time.time()
config = theano.config config = theano.config
...@@ -46,8 +46,10 @@ def _atexit_print_fn(): ...@@ -46,8 +46,10 @@ def _atexit_print_fn():
if len(_atexit_print_list) > 1: if len(_atexit_print_list) > 1:
# Make a global profile # Make a global profile
cum = copy.copy(_atexit_print_list[0]) cum = copy.copy(_atexit_print_list[0])
cum.message = "Sum of all Theano functions" cum.message = "Sum of all printed profile at exit"
for ps in _atexit_print_list[1:]: for ps in _atexit_print_list[1:]:
# for ps in [ps for ps in _atexit_print_list[1:]
# if not isinstance(ps, ScanProfileStats)]:
for attr in ["compile_time", "fct_call_time", "fct_callcount", for attr in ["compile_time", "fct_call_time", "fct_callcount",
"vm_call_time", "optimizer_time", "linker_time"]: "vm_call_time", "optimizer_time", "linker_time"]:
setattr(cum, attr, getattr(cum, attr) + getattr(ps, attr)) setattr(cum, attr, getattr(cum, attr) + getattr(ps, attr))
...@@ -62,6 +64,7 @@ def _atexit_print_fn(): ...@@ -62,6 +64,7 @@ def _atexit_print_fn():
atexit.register(_atexit_print_fn) atexit.register(_atexit_print_fn)
class ProfileStats(object): class ProfileStats(object):
""" """
Object to store runtime and memory profiling information for all of Object to store runtime and memory profiling information for all of
...@@ -119,6 +122,7 @@ class ProfileStats(object): ...@@ -119,6 +122,7 @@ class ProfileStats(object):
# time spent linking graph (FunctionMaker.create) # time spent linking graph (FunctionMaker.create)
line_width = 140 line_width = 140
# param is called flag_time_thunks because most other attributes with time # param is called flag_time_thunks because most other attributes with time
# in the name are times *of* something, rather than configuration flags. # in the name are times *of* something, rather than configuration flags.
def __init__(self, atexit_print=True, flag_time_thunks=None, **kwargs): def __init__(self, atexit_print=True, flag_time_thunks=None, **kwargs):
...@@ -185,20 +189,27 @@ class ProfileStats(object): ...@@ -185,20 +189,27 @@ class ProfileStats(object):
"""dict op -> total number of flops""" """dict op -> total number of flops"""
# timing is stored by node, we compute timing by Op on demand # timing is stored by node, we compute timing by Op on demand
rval = {} rval = {}
return rval #TODO: continue here return rval # TODO: continue here
for node, count in self.apply_callcount.items(): for node, count in self.apply_callcount.items():
rval.setdefault(node.op, 0) rval.setdefault(node.op, 0)
rval[node.op] += 1 rval[node.op] += 1
return rval return rval
for a,t in op_time.items(): for a, t in self.op_time.items():
if hasattr(a,'flops'): if hasattr(a, 'flops'):
op_flops[a]=a.flops*op_call[a]/t/1e6 op_flops[a] = a.flops * op_call[a] / t / 1e6
flops_msg='' flops_msg = ''
if op_flops: if op_flops:
flops_msg=' <MFlops/s>' flops_msg = ' <MFlops/s>'
print '\nHACK WARNING: we print the flops for some OP, but the logic don\' always work. You need to know the internal of Theano to make it work correctly. Otherwise don\'t use!' print ('\nHACK WARNING: we print the flops for some OP, but the'
print '\nOp-wise summary: <%% of local_time spent on this kind of Op> <cumulative %%> <self seconds> <cumulative seconds> <time per call> %s <nb_call> <nb apply> <Op name>'%(flops_msg) ' logic don\' always work. You need to know the internal'
' of Theano to make it work correctly.'
' Otherwise don\'t use!')
print ('\nOp-wise summary:'
' <%% of local_time spent on this kind of Op>'
' <cumulative %%> <self seconds> <cumulative seconds>'
' <time per call> %s <nb_call> <nb apply> <Op name>' % (
flops_msg))
def summary_ops(self, file=sys.stderr, N=None): def summary_ops(self, file=sys.stderr, N=None):
if self.apply_time: if self.apply_time:
...@@ -216,19 +227,21 @@ class ProfileStats(object): ...@@ -216,19 +227,21 @@ class ProfileStats(object):
op_impl = self.op_impl() op_impl = self.op_impl()
if N is None: if N is None:
N = len(self.op_flops) N = len(self.op_flops)
otimes = [(t*100/local_time, otimes = [(t * 100 / local_time,
t, t,
op, op,
op_impl.get(op, ' '), op_impl.get(op, ' '),
op_call.get(op, 0), op_call.get(op, 0),
op_apply.get(op,0)) op_apply.get(op, 0))
for op, t in op_time.items()] for op, t in op_time.items()]
otimes.sort() otimes.sort()
otimes.reverse() otimes.reverse()
tot=0 tot = 0
print >> file, 'Ops' print >> file, 'Ops'
print >> file, '---' print >> file, '---'
#print >> file, '<% time> <cumulative %%> <apply time> <cumulative seconds> <time per call> <nb_call> <Op name>' #print >> file, '<% time> <cumulative %%> <apply time>,'
#print >>file, '<cumulative seconds> <time per call> <nb_call>'
#print >>file, '<Op name>'
hs = [] hs = []
# formatting string # formatting string
es = [] es = []
...@@ -263,13 +276,14 @@ class ProfileStats(object): ...@@ -263,13 +276,14 @@ class ProfileStats(object):
print >> file, header_str print >> file, header_str
for f,t,a,impl,nb_call,nb_apply in otimes[:N]: for f, t, a, impl, nb_call, nb_apply in otimes[:N]:
if nb_call == 0: if nb_call == 0:
assert t == 0 assert t == 0
continue continue
tot+=t tot += t
ftot=tot*100/local_time ftot = tot * 100 / local_time
print >> file, format_str%(f,ftot,t,t/nb_call, impl, nb_call, print >> file, format_str % (f, ftot, t, t / nb_call,
impl, nb_call,
nb_apply, str(a)[:maxlen]) nb_apply, str(a)[:maxlen])
# While this carries over less information, it is arranged such # While this carries over less information, it is arranged such
# that it is way more readable than the previous output of the # that it is way more readable than the previous output of the
...@@ -281,7 +295,7 @@ class ProfileStats(object): ...@@ -281,7 +295,7 @@ class ProfileStats(object):
# print >>file, ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %s %5d %2d %s' % ( # print >>file, ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %s %5d %2d %s' % (
# f, ftot, t, tot, t/nb_call, impl, nb_call, nb_apply, a) # f, ftot, t, tot, t/nb_call, impl, nb_call, nb_apply, a)
print >>file, ' ... (remaining %i Ops account for %6.2f%%(%.2fs) of the runtime)'\ print >>file, ' ... (remaining %i Ops account for %6.2f%%(%.2fs) of the runtime)'\
%(max(0, len(otimes)-N), % (max(0, len(otimes) - N),
sum(f for f, t, a, ci, nb_call, nb_op in otimes[N:]), sum(f for f, t, a, ci, nb_call, nb_op in otimes[N:]),
sum(t for f, t, a, ci, nb_call, nb_op in otimes[N:])) sum(t for f, t, a, ci, nb_call, nb_op in otimes[N:]))
print >> file, '' print >> file, ''
...@@ -333,7 +347,7 @@ class ProfileStats(object): ...@@ -333,7 +347,7 @@ class ProfileStats(object):
print >> file, header_str print >> file, header_str
atimes = [( atimes = [(
t*100/local_time, t * 100 / local_time,
t, t,
a, a,
a.env.toposort().index(a), a.env.toposort().index(a),
...@@ -341,13 +355,13 @@ class ProfileStats(object): ...@@ -341,13 +355,13 @@ class ProfileStats(object):
for a, t in self.apply_time.items()] for a, t in self.apply_time.items()]
atimes.sort() atimes.sort()
atimes.reverse() atimes.reverse()
tot=0 tot = 0
for (f, t, a, nd_id, nb_call) in atimes[:N]: for (f, t, a, nd_id, nb_call) in atimes[:N]:
tot+=t tot += t
ftot=tot*100/local_time ftot = tot * 100 / local_time
if nb_call==0: if nb_call == 0:
continue continue
print >> file, format_str %(f,ftot, t, t/nb_call, nb_call, print >> file, format_str %(f, ftot, t, t / nb_call, nb_call,
nd_id, nd_id,
str(a)[:maxlen]) str(a)[:maxlen])
# Same as before, this I've sacrificed some information making # Same as before, this I've sacrificed some information making
...@@ -355,7 +369,7 @@ class ProfileStats(object): ...@@ -355,7 +369,7 @@ class ProfileStats(object):
#print >> file, ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %i %s'%( #print >> file, ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %i %s'%(
# f, ftot, t, tot, t/nb_call,nb_call, str(a)) # f, ftot, t, tot, t/nb_call,nb_call, str(a))
print >> file, ' ... (remaining %i Apply instances account for %.2f%%(%.2fs) of the runtime)'\ print >> file, ' ... (remaining %i Apply instances account for %.2f%%(%.2fs) of the runtime)'\
%(max(0, len(atimes)-N), % (max(0, len(atimes) - N),
sum(f for f, t, a, nd_id, nb_call in atimes[N:]), sum(f for f, t, a, nd_id, nb_call in atimes[N:]),
sum(t for f, t, a, nd_id, nb_call in atimes[N:])) sum(t for f, t, a, nd_id, nb_call in atimes[N:]))
print >> file, '' print >> file, ''
...@@ -363,31 +377,34 @@ class ProfileStats(object): ...@@ -363,31 +377,34 @@ class ProfileStats(object):
def summary_function(self, file): def summary_function(self, file):
print >> file, 'Function profiling' print >> file, 'Function profiling'
print >> file, '==================' print >> file, '=================='
print >> file, ' Message: %s'%self.message print >> file, ' Message: %s' % self.message
print >> file, ' Time in %i calls to Function.__call__: %es' % ( print >> file, ' Time in %i calls to Function.__call__: %es' % (
self.fct_callcount, self.fct_call_time) self.fct_callcount, self.fct_call_time)
if self.fct_call_time>0: if self.fct_call_time > 0:
print >> file, ' Time in Function.fn.__call__: %es (%.3f%%)' %( print >> file, ' Time in Function.fn.__call__: %es (%.3f%%)' % (
self.vm_call_time, 100*self.vm_call_time / self.fct_call_time) self.vm_call_time,
100 * self.vm_call_time / self.fct_call_time)
local_time = sum(self.apply_time.values()) local_time = sum(self.apply_time.values())
if local_time > 0: if local_time > 0:
print >> file, ' Time in thunks: %es (%.3f%%)' %( print >> file, ' Time in thunks: %es (%.3f%%)' % (
local_time, 100*local_time / self.fct_call_time) local_time, 100*local_time / self.fct_call_time)
print >> file, ' Total compile time: %es' % self.compile_time print >> file, ' Total compile time: %es' % self.compile_time
print >> file, ' Theano Optimizer time: %es' % self.optimizer_time print >> file, ' Theano Optimizer time: %es' % self.optimizer_time
print >> file, ' Theano Linker time (includes C, CUDA code generation/compiling): %es' % self.linker_time print >> file, (' Theano Linker time (includes C,'
' CUDA code generation/compiling): %es' %
self.linker_time)
print >> file, '' print >> file, ''
def summary(self, file=sys.stderr, n_ops_to_print=20,
def summary(self, file=sys.stderr, n_ops_to_print=20, n_applies_to_print=20): n_applies_to_print=20):
self.summary_function(file) self.summary_function(file)
local_time = sum(self.apply_time.values()) local_time = sum(self.apply_time.values())
if local_time > 0: if local_time > 0:
self.summary_ops(file, n_ops_to_print) self.summary_ops(file, n_ops_to_print)
self.summary_nodes(file, n_applies_to_print) self.summary_nodes(file, n_applies_to_print)
else: else:
print >> file, " No node time accumulated (hint: try config profiling.time_thunks=1)" print >> file, (" No node time accumulated "
"(hint: try config profiling.time_thunks=1)")
if 0: # old code still to be ported from ProfileMode if 0: # old code still to be ported from ProfileMode
...@@ -404,47 +421,50 @@ if 0: # old code still to be ported from ProfileMode ...@@ -404,47 +421,50 @@ if 0: # old code still to be ported from ProfileMode
print '' print ''
print 'ProfileMode.long_print()' print 'ProfileMode.long_print()'
print 'name = %s'%fct_name print 'name = %s' % fct_name
print 'msg = %s'%message print 'msg = %s' % message
print '---------------------------' print '---------------------------'
print '' print ''
print 'Total time spent running thunks: %.3fs'% local_time print 'Total time spent running thunks: %.3fs' % local_time
sop_time={} sop_time = {}
sop_call={} sop_call = {}
sop_op = {} sop_op = {}
sop_c={} #map each op class to Bool. True iff all applies were done in c. #map each op class to Bool. True iff all applies were done in c.
for a,t in op_time.items(): sop_c = {}
for a, t in op_time.items():
typ = type(a) typ = type(a)
sop_time.setdefault(typ,0) sop_time.setdefault(typ, 0)
sop_time[typ]+=t sop_time[typ] += t
sop_op.setdefault(typ,0) sop_op.setdefault(typ, 0)
sop_op[typ]+=1 sop_op[typ] += 1
sop_c.setdefault(typ,True) sop_c.setdefault(typ, True)
sop_c[typ]=sop_c[typ] and op_cimpl.get(a, False) sop_c[typ] = sop_c[typ] and op_cimpl.get(a, False)
sop_call[typ]=sop_call.get(typ,0)+op_call[a] sop_call[typ] = sop_call.get(typ, 0) + op_call[a]
print '\nSingle Op-wise summary: <% of local_time spent on this kind of Op> <cumulative %%> <self seconds> <cumulative seconds> <time per call> <nb_call> <nb_op> <nb_op> <Op name>' print '\nSingle Op-wise summary: <% of local_time spent on this kind of Op> <cumulative %%> <self seconds> <cumulative seconds> <time per call> <nb_call> <nb_op> <nb_op> <Op name>'
sotimes = [(t*100/local_time, t, a, sop_c[a], sop_call[a], sop_op[a]) for a, t in sop_time.items()] sotimes = [(t * 100 / local_time, t, a, sop_c[a],
sop_call[a], sop_op[a]) for a, t in sop_time.items()]
sotimes.sort() sotimes.sort()
sotimes.reverse() sotimes.reverse()
tot=0 tot = 0
for f,t,a,ci, nb_call, nb_op in sotimes[:n_ops_to_print]: for f, t, a, ci, nb_call, nb_op in sotimes[:n_ops_to_print]:
if nb_call == 0: if nb_call == 0:
assert t == 0 assert t == 0
continue continue
tot+=t tot += t
ftot=tot*100/local_time ftot = tot * 100 / local_time
if ci: if ci:
msg = '*' msg = '*'
else: else:
msg = ' ' msg = ' '
print ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %s %5d %2d %s' % (f, ftot, t, tot, t/nb_call, msg, nb_call, nb_op, a) print ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %s %5d %2d %s' % (f, ftot, t, tot, t/nb_call, msg, nb_call, nb_op, a)
print ' ... (remaining %i Ops account for %.2f%%(%.2fs) of the runtime)'\ print ' ... (remaining %i Ops account for %.2f%%(%.2fs) of the runtime)'\
%(max(0, len(sotimes)-n_ops_to_print), % (max(0, len(sotimes) - n_ops_to_print),
sum(f for f, t, a, ci, nb_call, nb_op in sotimes[n_ops_to_print:]), sum(f for f, t, a, ci, nb_call, nb_op in
sum(t for f, t, a, ci, nb_call, nb_op in sotimes[n_ops_to_print:])) sotimes[n_ops_to_print:]),
sum(t for f, t, a, ci, nb_call, nb_op in
sotimes[n_ops_to_print:]))
total_time = time.time() - import_time total_time = time.time() - import_time
total_fct_time = sum(fct_call_time.values()) total_fct_time = sum(fct_call_time.values())
...@@ -453,24 +473,32 @@ if 0: # old code still to be ported from ProfileMode ...@@ -453,24 +473,32 @@ if 0: # old code still to be ported from ProfileMode
print print
print 'Theano fct summary: <% total fct time> <total time> <time per call> <nb call> <fct name>' print 'Theano fct summary: <% total fct time> <total time> <time per call> <nb call> <fct name>'
for key in fct_call.keys(): for key in fct_call.keys():
if fct_call[key]>0: if fct_call[key] > 0:
print ' %4.1f%% %.3fs %.2es %d %s'%(fct_call_time[key]/total_fct_time*100 ,fct_call_time[key], print ' %4.1f%% %.3fs %.2es %d %s'%(
fct_call_time[key]/fct_call[key], fct_call[key], key.name) fct_call_time[key] / total_fct_time * 100,
fct_call_time[key],
fct_call_time[key] / fct_call[key],
fct_call[key], key.name)
else: else:
print ' NOT CALLED',key.name print ' NOT CALLED',key.name
if total_fct_time>0: if total_fct_time > 0:
time_pr_in_fct=local_time/total_fct_time*100 time_pr_in_fct = local_time / total_fct_time * 100
time_per_call=total_fct_time/total_fct_call time_per_call = total_fct_time / total_fct_call
else: else:
time_pr_in_fct=0 time_pr_in_fct = 0
time_per_call=0 time_per_call = 0
print print
print 'Time since import %.3fs'%(total_time) print 'Time since import %.3fs' % (total_time)
print 'Compile time: %.3fs %.1f%%'%(compile_time, compile_time/total_time*100) print 'Compile time: %.3fs %.1f%%' % (compile_time,
print 'Theano fct call %.3fs %.1f%%'%(total_fct_time,total_fct_time/total_time*100) compile_time / total_time * 100)
print ' Theano Op time (included in fct call, Time spent running thunks) %.3fs %.1f%%(of total) %.1f%%(of fct call)'% (local_time,local_time/total_time*100, time_pr_in_fct) print 'Theano fct call %.3fs %.1f%%' % (total_fct_time,
total_fct_time / total_time *
100)
print ' Theano Op time (included in fct call, Time spent running thunks) %.3fs %.1f%%(of total) %.1f%%(of fct call)' % (local_time,
local_time / total_time * 100,
time_pr_in_fct)
print 'Other time since import %.3fs %.1f%%'%(other_time,other_time/total_time*100) print 'Other time since import %.3fs %.1f%%'%(other_time,other_time/total_time*100)
print '%i Theano fct call, %.3fs per call'%(total_fct_call, time_per_call) print '%i Theano fct call, %.3fs per call'%(total_fct_call, time_per_call)
...@@ -479,13 +507,13 @@ if 0: # old code still to be ported from ProfileMode ...@@ -479,13 +507,13 @@ if 0: # old code still to be ported from ProfileMode
print '<Apply> <Apply position> <fct name> <inputs type> <outputs type>' print '<Apply> <Apply position> <fct name> <inputs type> <outputs type>'
for fct in fct_call.keys(): for fct in fct_call.keys():
for idx, node in enumerate(fct.maker.env.toposort()): for idx, node in enumerate(fct.maker.env.toposort()):
if any(hasattr(i,'dtype') and i.dtype=='float64' for i in node.outputs) and not any(hasattr(i,'dtype') and i.dtype=='float64' for i in node.inputs): if any(hasattr(i, 'dtype') and i.dtype == 'float64' for i in node.outputs) and not any(hasattr(i, 'dtype') and i.dtype == 'float64' for i in node.inputs):
print str(node), idx, fct.name, str([getattr(i,'dtype',None) for i in node.inputs]),str([getattr(i,'dtype',None) for i in node.outputs]) print str(node), idx, fct.name, str([getattr(i,'dtype',None) for i in node.inputs]),str([getattr(i,'dtype',None) for i in node.outputs])
if any([x[2].__name__.startswith("Gpu") for x in sotimes]): if any([x[2].__name__.startswith("Gpu") for x in sotimes]):
cpu=[] cpu = []
gpu=[] gpu = []
trans=[] trans = []
for so in sotimes: for so in sotimes:
if so[2].__name__ in ["HostFromGpu", "GpuFromHost"]: if so[2].__name__ in ["HostFromGpu", "GpuFromHost"]:
trans.append(so) trans.append(so)
...@@ -493,9 +521,9 @@ if 0: # old code still to be ported from ProfileMode ...@@ -493,9 +521,9 @@ if 0: # old code still to be ported from ProfileMode
gpu.append(so) gpu.append(so)
else: else:
cpu.append(so) cpu.append(so)
sum_cpu=sum(so[1] for so in cpu) sum_cpu = sum(so[1] for so in cpu)
sum_gpu=sum(so[1] for so in gpu) sum_gpu = sum(so[1] for so in gpu)
sum_trans=sum(so[1] for so in trans) sum_trans = sum(so[1] for so in trans)
print print
print "Spent %.3fs(%.3f%%) in cpu Op, %.3fs(%.3f%%) in gpu Op and %.3fs(%.3f%%) transfert Op"%( print "Spent %.3fs(%.3f%%) in cpu Op, %.3fs(%.3f%%) in gpu Op and %.3fs(%.3f%%) transfert Op"%(
...@@ -505,7 +533,7 @@ if 0: # old code still to be ported from ProfileMode ...@@ -505,7 +533,7 @@ if 0: # old code still to be ported from ProfileMode
print "<fct name> <input name> <input type> <str input>" print "<fct name> <input name> <input type> <str input>"
for fct in fct_call.keys(): for fct in fct_call.keys():
for i in fct.input_storage: for i in fct.input_storage:
if hasattr(i.type, 'dtype') and i.type.dtype=='float64': if hasattr(i.type, 'dtype') and i.type.dtype == 'float64':
print fct.name, i.name, i.type, i print fct.name, i.name, i.type, i
if outputs_size: if outputs_size:
......
...@@ -4,17 +4,21 @@ Defines Linkers that deal with C implementations. ...@@ -4,17 +4,21 @@ Defines Linkers that deal with C implementations.
# Python imports # Python imports
from copy import copy from copy import copy
import re #for set_compiledir import re # for set_compiledir
import os, sys, StringIO import os
import StringIO
import sys
from itertools import izip from itertools import izip
if sys.version_info[:2] >= (2,5): if sys.version_info[:2] >= (2, 5):
import hashlib import hashlib
def hash_from_code(msg): def hash_from_code(msg):
return hashlib.md5(msg).hexdigest() return hashlib.md5(msg).hexdigest()
else: else:
import md5 import md5
def hash_from_code(msg): def hash_from_code(msg):
return md5.new(msg).hexdigest() return md5.new(msg).hexdigest()
...@@ -46,9 +50,8 @@ from compilelock import get_lock, release_lock ...@@ -46,9 +50,8 @@ from compilelock import get_lock, release_lock
import cmodule import cmodule
import logging import logging
_logger=logging.getLogger("theano.gof.cc") _logger = logging.getLogger("theano.gof.cc")
_logger.setLevel(logging.WARN) _logger.setLevel(logging.WARN)
from theano.gof.callcache import CallCache from theano.gof.callcache import CallCache
...@@ -63,37 +66,47 @@ def get_module_cache(init_args=None): ...@@ -63,37 +66,47 @@ def get_module_cache(init_args=None):
""" """
return cmodule.get_module_cache(config.compiledir, init_args=init_args) return cmodule.get_module_cache(config.compiledir, init_args=init_args)
_persistent_module_cache = None _persistent_module_cache = None
def get_persistent_module_cache(): def get_persistent_module_cache():
global _persistent_module_cache global _persistent_module_cache
if _persistent_module_cache is None: if _persistent_module_cache is None:
_persistent_module_cache = CallCache(os.path.join(config.compiledir, 'persistent_cache')) _persistent_module_cache = CallCache(os.path.join(config.compiledir,
'persistent_cache'))
return _persistent_module_cache return _persistent_module_cache
class CodeBlock: class CodeBlock:
"""WRITEME """WRITEME
Represents a computation unit composed of declare, behavior, and cleanup. Represents a computation unit composed of declare, behavior, and cleanup.
@ivar declare: C code that declares variables for use by the computation @ivar declare: C code that declares variables for use by the computation
@ivar behavior: C code that performs the computation @ivar behavior: C code that performs the computation
@ivar cleanup: C code that cleans up things allocated or incref-ed in behavior @ivar cleanup: C code that cleans up things allocated or incref-ed
in behavior
""" """
def __init__(self, declare, behavior, cleanup, sub): def __init__(self, declare, behavior, cleanup, sub):
""" """
Initialize a L{CodeBlock} with templatized declare, behavior and cleanup. Initialize a L{CodeBlock} with templatized declare, behavior
The sub parameter will be used in the other arguments' templates. sub and cleanup. The sub parameter will be used in the other
should contain a key called 'id' that maps to an identifier for this block. arguments' templates. sub should contain a key called 'id'
The identifier will be used to determine the failure code and a label that maps to an identifier for this block.
to jump to. It should also contain a key called 'failure_var' that contains The identifier will be used to determine the failure code and
the name of the variable that contains the error code. a label to jump to. It should also contain a key called
'failure_var' that contains the name of the variable that
contains the error code.
""" """
self.declare = declare self.declare = declare
self.behavior = behavior self.behavior = behavior
# the dummy is because gcc throws an error when a label's right next to a closing # the dummy is because gcc throws an error when a label's
# brace (maybe there's an ignore flag for that...) # right next to a closing brace (maybe there's an ignore flag
# we need the label even if cleanup is empty because the behavior block jumps there # for that...)
# on failure # we need the label even if cleanup is empty because the
self.cleanup = ("__label_%(id)i:\n"%sub + cleanup + "\ndouble __DUMMY_%(id)i;\n"%sub) #% sub # behavior block jumps there on failure
self.cleanup = ("__label_%(id)i:\n" % sub + cleanup +
"\ndouble __DUMMY_%(id)i;\n" % sub) # % sub
def failure_code(sub): def failure_code(sub):
...@@ -102,10 +115,10 @@ def failure_code(sub): ...@@ -102,10 +115,10 @@ def failure_code(sub):
def code_gen(blocks): def code_gen(blocks):
"""WRITEME """WRITEME From a list of L{CodeBlock} instances, returns a string
From a list of L{CodeBlock} instances, returns a string that executes them that executes them all in sequence. eg for C{(decl1, task1,
all in sequence. eg for C{(decl1, task1, cleanup1)} and C{(decl2, task2, cleanup2)} cleanup1)} and C{(decl2, task2, cleanup2)} the returned string
the returned string will be of the form:: will be of the form::
decl1 decl1
decl2 decl2
...@@ -181,10 +194,11 @@ def struct_gen(args, struct_builders, blocks, sub): ...@@ -181,10 +194,11 @@ def struct_gen(args, struct_builders, blocks, sub):
args_names = ", ".join(args) args_names = ", ".join(args)
args_decl = ", ".join(["PyObject* %s" % arg for arg in args]) args_decl = ", ".join(["PyObject* %s" % arg for arg in args])
# The following code stores the exception data in __ERROR, which is a special # The following code stores the exception data in __ERROR, which
# field of the struct. __ERROR is a list of length 3 that holds the type, the # is a special field of the struct. __ERROR is a list of length 3
# value and the traceback. After storing the error, we return the failure code # that holds the type, the value and the traceback. After storing
# so we know which code block failed. # the error, we return the failure code so we know which code
# block failed.
do_return = """ do_return = """
if (%(failure_var)s) { if (%(failure_var)s) {
// When there is a failure, this code puts the exception // When there is a failure, this code puts the exception
...@@ -213,8 +227,8 @@ def struct_gen(args, struct_builders, blocks, sub): ...@@ -213,8 +227,8 @@ def struct_gen(args, struct_builders, blocks, sub):
sub = dict(sub) sub = dict(sub)
sub.update(locals()) sub.update(locals())
# TODO: add some error checking to make sure storage_<x> are 1-element lists # TODO: add some error checking to make sure storage_<x> are
# and __ERROR is a 3-elements list. # 1-element lists and __ERROR is a 3-elements list.
struct_code = """ struct_code = """
struct %(name)s { struct %(name)s {
PyObject* __ERROR; PyObject* __ERROR;
...@@ -260,6 +274,7 @@ def get_nothing(r, name, sub): ...@@ -260,6 +274,7 @@ def get_nothing(r, name, sub):
"""WRITEME""" """WRITEME"""
return "" return ""
def get_c_declare(r, name, sub): def get_c_declare(r, name, sub):
"""WRITEME""" """WRITEME"""
pre = """ pre = """
...@@ -267,6 +282,7 @@ def get_c_declare(r, name, sub): ...@@ -267,6 +282,7 @@ def get_c_declare(r, name, sub):
""" % locals() """ % locals()
return pre + r.type.c_declare(name, sub) return pre + r.type.c_declare(name, sub)
def get_c_init(r, name, sub): def get_c_init(r, name, sub):
"""WRITEME""" """WRITEME"""
pre = "" """ pre = "" """
...@@ -275,6 +291,7 @@ def get_c_init(r, name, sub): ...@@ -275,6 +291,7 @@ def get_c_init(r, name, sub):
""" % locals() """ % locals()
return pre + r.type.c_init(name, sub) return pre + r.type.c_init(name, sub)
def get_c_extract(r, name, sub): def get_c_extract(r, name, sub):
"""WRITEME""" """WRITEME"""
pre = """ pre = """
...@@ -283,6 +300,7 @@ def get_c_extract(r, name, sub): ...@@ -283,6 +300,7 @@ def get_c_extract(r, name, sub):
""" % locals() """ % locals()
return pre + r.type.c_extract(name, sub) return pre + r.type.c_extract(name, sub)
def get_c_cleanup(r, name, sub): def get_c_cleanup(r, name, sub):
"""WRITEME""" """WRITEME"""
post = """ post = """
...@@ -290,6 +308,7 @@ def get_c_cleanup(r, name, sub): ...@@ -290,6 +308,7 @@ def get_c_cleanup(r, name, sub):
""" % locals() """ % locals()
return r.type.c_cleanup(name, sub) + post return r.type.c_cleanup(name, sub) + post
def get_c_sync(r, name, sub): def get_c_sync(r, name, sub):
"""WRITEME""" """WRITEME"""
return """ return """
...@@ -300,11 +319,13 @@ def get_c_sync(r, name, sub): ...@@ -300,11 +319,13 @@ def get_c_sync(r, name, sub):
PyList_SET_ITEM(storage_%(name)s, 0, py_%(name)s); PyList_SET_ITEM(storage_%(name)s, 0, py_%(name)s);
{Py_XDECREF(old);} {Py_XDECREF(old);}
} }
""" % dict(sync = r.type.c_sync(name, sub), name = name, **sub) """ % dict(sync=r.type.c_sync(name, sub), name=name, **sub)
def apply_policy(policy, r, name, sub): def apply_policy(policy, r, name, sub):
"""WRITEME """WRITEME
@param policy: list of functions that map a L{Variable} to a string, or a single such function @param policy: list of functions that map a L{Variable} to a string,
or a single such function
@type r: L{Variable} @type r: L{Variable}
@return: C{policy[0](r) + policy[1](r) + ...} @return: C{policy[0](r) + policy[1](r) + ...}
""" """
...@@ -316,18 +337,22 @@ def apply_policy(policy, r, name, sub): ...@@ -316,18 +337,22 @@ def apply_policy(policy, r, name, sub):
return policy(r, name, sub) return policy(r, name, sub)
def struct_variable_codeblocks(variable, policies, id, symbol_table, sub): def struct_variable_codeblocks(variable, policies, id, symbol_table, sub):
"""WRITEME """WRITEME
variable -> a Variable variable -> a Variable
policies -> a pair of tuples ((declare_policy, behavior_policy, cleanup_policy), -- at construction policies -> a pair of tuples ((declare_policy, behavior_policy,
(declare_policy, behavior_policy, cleanup_policy)) -- at execution cleanup_policy), -- at construction
the first list will produce an element of the 'struct_builders' argument in struct_gen (declare_policy, behavior_policy,
the second list will produce an element of the 'blocks' argument in struct_gen cleanup_policy)) -- at execution
the first list will produce an element of the
'struct_builders' argument in struct_gen the second
list will produce an element of the 'blocks' argument
in struct_gen
id -> the id assigned to this variable's task in the computation id -> the id assigned to this variable's task in the computation
symbol_table -> a dict that maps variables to variable names. It is not read symbol_table -> a dict that maps variables to variable names. It
by this function but a variable name for the variable is computed and added is not read by this function but a variable name for the
to the table. variable is computed and added to the table.
sub -> dictionary for use by L{CodeBlock}. sub -> dictionary for use by L{CodeBlock}.
""" """
...@@ -339,17 +364,20 @@ def struct_variable_codeblocks(variable, policies, id, symbol_table, sub): ...@@ -339,17 +364,20 @@ def struct_variable_codeblocks(variable, policies, id, symbol_table, sub):
sub['fail'] = failure_code(sub) sub['fail'] = failure_code(sub)
sub['py_ptr'] = "py_%s" % name sub['py_ptr'] = "py_%s" % name
sub['stor_ptr'] = "storage_%s" % name sub['stor_ptr'] = "storage_%s" % name
# struct_declare, struct_behavior, struct_cleanup, sub)
struct_builder = CodeBlock(*[apply_policy(policy, variable, name, sub) struct_builder = CodeBlock(*[apply_policy(policy, variable, name, sub)
for policy in policies[0]]+[sub]) # struct_declare, struct_behavior, struct_cleanup, sub) for policy in policies[0]] + [sub])
sub['id'] = id + 1 sub['id'] = id + 1
sub['fail'] = failure_code(sub) sub['fail'] = failure_code(sub)
sub['py_ptr'] = "py_%s" % name sub['py_ptr'] = "py_%s" % name
sub['stor_ptr'] = "storage_%s" % name sub['stor_ptr'] = "storage_%s" % name
# run_declare, run_behavior, run_cleanup, sub)
block = CodeBlock(*[apply_policy(policy, variable, name, sub) block = CodeBlock(*[apply_policy(policy, variable, name, sub)
for policy in policies[1]]+[sub]) # run_declare, run_behavior, run_cleanup, sub) for policy in policies[1]] + [sub])
return struct_builder, block return struct_builder, block
class CLinker(link.Linker): class CLinker(link.Linker):
"""WRITEME """WRITEME
...@@ -365,11 +393,12 @@ class CLinker(link.Linker): ...@@ -365,11 +393,12 @@ class CLinker(link.Linker):
def __init__(self): def __init__(self):
self.env = None self.env = None
def accept(self, env, no_recycling = []): def accept(self, env, no_recycling=[]):
"""WRITEME""" """WRITEME"""
if self.env is not None and self.env is not env: if self.env is not None and self.env is not env:
return type(self)().accept(env, no_recycling) return type(self)().accept(env, no_recycling)
#raise Exception("Cannot accept from a Linker that is already tied to another Env.") #raise Exception("Cannot accept from a Linker that is already"
# " tied to another Env.")
self.env = env self.env = env
self.fetch_variables() self.fetch_variables()
self.no_recycling = no_recycling self.no_recycling = no_recycling
...@@ -377,15 +406,21 @@ class CLinker(link.Linker): ...@@ -377,15 +406,21 @@ class CLinker(link.Linker):
def fetch_variables(self): def fetch_variables(self):
"""WRITEME """WRITEME
Fills the inputs, outputs, variables, orphans, temps and node_order fields. Fills the inputs, outputs, variables, orphans,
temps and node_order fields.
""" """
env = self.env env = self.env
self.inputs = env.inputs self.inputs = env.inputs
self.outputs = env.outputs self.outputs = env.outputs
self.variables = graph.variables(self.inputs, self.outputs) # list(env.variables) # list(env.variables)
self.variables = graph.variables(self.inputs, self.outputs)
# The orphans field is listified to ensure a consistent order. # The orphans field is listified to ensure a consistent order.
self.orphans = list(r for r in self.variables if isinstance(r, graph.Value) and r not in self.inputs) #list(env.orphans.difference(self.outputs)) #list(env.orphans.difference(self.outputs))
self.temps = list(set(self.variables).difference(self.inputs).difference(self.outputs).difference(self.orphans)) self.orphans = list(r for r in self.variables
if isinstance(r, graph.Value) and
r not in self.inputs)
self.temps = list(set(self.variables).difference(
self.inputs).difference(self.outputs).difference(self.orphans))
self.consts = [] self.consts = []
self.node_order = env.toposort() self.node_order = env.toposort()
...@@ -408,8 +443,6 @@ class CLinker(link.Linker): ...@@ -408,8 +443,6 @@ class CLinker(link.Linker):
no_recycling = self.no_recycling no_recycling = self.no_recycling
env = self.env
self.consts = [] self.consts = []
c_support_code_apply = [] c_support_code_apply = []
...@@ -429,62 +462,82 @@ class CLinker(link.Linker): ...@@ -429,62 +462,82 @@ class CLinker(link.Linker):
failure_var = "__failure" failure_var = "__failure"
id = 1 id = 1
sub = dict(failure_var = failure_var) sub = dict(failure_var=failure_var)
for variable in self.variables: for variable in self.variables:
# it might be possible to inline constant variables as C literals # it might be possible to inline constant variables as C literals
## if getattr(variable, 'constant', False): ## if getattr(variable, 'constant', False):
# policy = [[what to declare in the struct, what to do at construction, what to do at destruction], # policy = [[what to declare in the struct,
# [what to declare in each run, what to do at the beginning of each run, what to do at the end of each run]] # what to do at construction,
# what to do at destruction],
# [what to declare in each run,
# what to do at the beginning of each run,
# what to do at the end of each run]]
if variable in self.inputs: if variable in self.inputs:
# we need to extract the new inputs at each run # we need to extract the new inputs at each run
# they do not need to be relayed to Python, so we don't sync # they do not need to be relayed to Python, so we don't sync
# if isinstance(variable, Constant): # if isinstance(variable, Constant):
# raise TypeError("Inputs to CLinker cannot be Constant.", variable) # raise TypeError("Inputs to CLinker cannot be Constant.",
# variable)
policy = [[get_nothing, get_nothing, get_nothing], policy = [[get_nothing, get_nothing, get_nothing],
[get_c_declare, get_c_extract, get_c_cleanup]] [get_c_declare, get_c_extract, get_c_cleanup]]
elif variable in self.orphans: elif variable in self.orphans:
if not isinstance(variable, graph.Value): if not isinstance(variable, graph.Value):
raise TypeError("All orphans to CLinker must be Value instances.", variable) raise TypeError("All orphans to CLinker must be Value"
" instances.", variable)
if isinstance(variable, graph.Constant): if isinstance(variable, graph.Constant):
try: try:
symbol[variable] = "(" + variable.type.c_literal(variable.data) + ")" symbol[variable] = ("(" + variable.type.c_literal(
variable.data) + ")")
self.consts.append(variable) self.consts.append(variable)
self.orphans.remove(variable) self.orphans.remove(variable)
continue continue
except (utils.MethodNotDefined, NotImplementedError): except (utils.MethodNotDefined, NotImplementedError):
pass pass
# orphans are not inputs so we'll just get fetch them when we initialize the struct and assume they stay the same # orphans are not inputs so we'll just get fetch them
# when we initialize the struct and assume they stay
# the same
policy = [[get_c_declare, get_c_extract, get_c_cleanup], policy = [[get_c_declare, get_c_extract, get_c_cleanup],
[get_nothing, get_nothing, get_nothing]] [get_nothing, get_nothing, get_nothing]]
elif variable in self.temps: elif variable in self.temps:
# temps don't need to be extracted from Python, so we call c_init rather than c_extract # temps don't need to be extracted from Python, so we
# they do not need to be relayed to Python, so we don't sync # call c_init rather than c_extract they do not need
# to be relayed to Python, so we don't sync
if variable.type.c_is_simple() or variable in no_recycling: if variable.type.c_is_simple() or variable in no_recycling:
policy = [[get_nothing, get_nothing, get_nothing], policy = [[get_nothing, get_nothing, get_nothing],
[get_c_declare, get_c_init, get_c_cleanup]] [get_c_declare, get_c_init, get_c_cleanup]]
else: else:
# it is useful for complex temps to reuse storage at each run, so we only clean up in the destructor # it is useful for complex temps to reuse storage
# at each run, so we only clean up in the
# destructor
policy = [[get_c_declare, get_c_init, get_c_cleanup], policy = [[get_c_declare, get_c_init, get_c_cleanup],
[get_nothing, get_nothing, get_nothing]] [get_nothing, get_nothing, get_nothing]]
elif variable in self.outputs: elif variable in self.outputs:
# outputs don't need to be extracted from Python, so we call c_init rather than c_extract # outputs don't need to be extracted from Python, so
# we call c_init rather than c_extract
if variable.type.c_is_simple() or variable in no_recycling: if variable.type.c_is_simple() or variable in no_recycling:
policy = [[get_nothing, get_nothing, get_nothing], policy = [[get_nothing, get_nothing, get_nothing],
[get_c_declare, get_c_init, (get_c_sync, get_c_cleanup)]] [get_c_declare, get_c_init,
(get_c_sync, get_c_cleanup)]]
else: else:
# it is useful for complex outputs to reuse storage at each run, so we only clean up in the destructor # it is useful for complex outputs to reuse
# storage at each run, so we only clean up in the
# destructor
policy = [[get_c_declare, get_c_init, get_c_cleanup], policy = [[get_c_declare, get_c_init, get_c_cleanup],
[get_nothing, get_nothing, get_c_sync]] [get_nothing, get_nothing, get_c_sync]]
else: else:
raise Exception("what the fuck") raise Exception("what the fuck")
builder, block = struct_variable_codeblocks(variable, policy, id, symbol, sub) builder, block = struct_variable_codeblocks(variable, policy,
id, symbol, sub)
# each Variable generates two CodeBlocks, one to declare/initialize/destroy struct variables # each Variable generates two CodeBlocks, one to
# and the other to declare/extract/cleanup each time the function is run. # declare/initialize/destroy struct variables and the
# Typically, only one of the two actually does anything (see all the possible combinations above) # other to declare/extract/cleanup each time the function
# is run.
# Typically, only one of the two actually does anything
# (see all the possible combinations above)
init_tasks.append((variable, 'init', id)) init_tasks.append((variable, 'init', id))
init_blocks.append(builder) init_blocks.append(builder)
...@@ -496,19 +549,23 @@ class CLinker(link.Linker): ...@@ -496,19 +549,23 @@ class CLinker(link.Linker):
for node_num, node in enumerate(self.node_order): for node_num, node in enumerate(self.node_order):
# We populate sub with a mapping from the variable names specified by the op's c_var_names # We populate sub with a mapping from the variable names
# method to the actual variable names that we will use. # specified by the op's c_var_names method to the actual
# variable names that we will use.
## ivnames, ovnames = op.c_var_names() ## ivnames, ovnames = op.c_var_names()
sub = dict(failure_var = failure_var) sub = dict(failure_var=failure_var)
## for variable, vname in zip(op.inputs + op.outputs, ivnames + ovnames): ## for variable, vname in zip(op.inputs + op.outputs,
## ivnames + ovnames):
## sub[vname] = symbol[variable] ## sub[vname] = symbol[variable]
name = "node_%i" % node_num name = "node_%i" % node_num
isyms, osyms = [symbol[r] for r in node.inputs], [symbol[r] for r in node.outputs] isyms = [symbol[r] for r in node.inputs]
osyms = [symbol[r] for r in node.outputs]
# c_validate_update is deprecated # c_validate_update is deprecated
if hasattr(node.op, 'c_validate_update'): if hasattr(node.op, 'c_validate_update'):
raise Exception("c_validate_update is deprecated, move contents to c_code", node.op) raise Exception("c_validate_update is deprecated,"
" move contents to c_code", node.op)
# Make the CodeBlock for c_code # Make the CodeBlock for c_code
sub['id'] = id sub['id'] = id
...@@ -517,20 +574,23 @@ class CLinker(link.Linker): ...@@ -517,20 +574,23 @@ class CLinker(link.Linker):
op = node.op op = node.op
# type-specific support code # type-specific support code
try: try:
c_support_code_apply.append(op.c_support_code_apply(node, name)) c_support_code_apply.append(op.c_support_code_apply(node,
name))
except utils.MethodNotDefined: except utils.MethodNotDefined:
pass pass
else: else:
# The following will be executed if the "try" block succeeds # The following will be executed if the "try" block succeeds
assert isinstance(c_support_code_apply[-1], basestring), ( assert isinstance(c_support_code_apply[-1], basestring), (
str(node.op)+" didn't returned a string for c_support_code_apply") str(node.op) +
" didn't returned a string for c_support_code_apply")
# emit c_code # emit c_code
try: try:
behavior = op.c_code(node, name, isyms, osyms, sub) behavior = op.c_code(node, name, isyms, osyms, sub)
except utils.MethodNotDefined: except utils.MethodNotDefined:
raise NotImplementedError("%s cannot produce C code" % op) raise NotImplementedError("%s cannot produce C code" % op)
assert isinstance(behavior, basestring), str(node.op)+" didn't returned a string for c_code" assert isinstance(behavior, basestring), (
str(node.op) + " didn't returned a string for c_code")
try: try:
cleanup = op.c_code_cleanup(node, name, isyms, osyms, sub) cleanup = op.c_code_cleanup(node, name, isyms, osyms, sub)
...@@ -543,18 +603,24 @@ class CLinker(link.Linker): ...@@ -543,18 +603,24 @@ class CLinker(link.Linker):
tasks.append((node, 'code', id)) tasks.append((node, 'code', id))
id += 1 id += 1
# List of arg names for use in struct_gen. Note the call to uniq: duplicate inputs # List of arg names for use in struct_gen. Note the call to
# must only be passed once because they are mapped to the same name. # uniq: duplicate inputs must only be passed once because they
# Duplicates are defined by (a is b), rather than (a==b) since Constant instances can # are mapped to the same name. Duplicates are defined by (a
# is b), rather than (a==b) since Constant instances can
# compare equal to equivalent Constant instances. # compare equal to equivalent Constant instances.
args = [] args = []
args += ["storage_%s" % symbol[variable] for variable in utils.uniq(self.inputs + self.outputs + self.orphans)] args += ["storage_%s" % symbol[variable] for variable
in utils.uniq(self.inputs + self.outputs + self.orphans)]
struct_code = struct_gen(args, init_blocks, blocks, dict(failure_var = failure_var, name = "<<<<NAME>>>>")) struct_code = struct_gen(args, init_blocks, blocks,
dict(failure_var=failure_var,
name="<<<<NAME>>>>"))
# TODO: still needed? We do not use weave anymore. # TODO: still needed? We do not use weave anymore.
# The hash calculated on the code identifies it so weave can cache properly. # The hash calculated on the code identifies it so weave can
# (the hash has to be used outside of the support code because weave does not consider changes in the support code) # cache properly. (the hash has to be used outside of the
# support code because weave does not consider changes in the
# support code)
hash = hash_from_code(struct_code) hash = hash_from_code(struct_code)
struct_name = '__struct_compiled_op_%s' % hash struct_name = '__struct_compiled_op_%s' % hash
...@@ -570,7 +636,7 @@ class CLinker(link.Linker): ...@@ -570,7 +636,7 @@ class CLinker(link.Linker):
self.init_tasks = init_tasks self.init_tasks = init_tasks
self.blocks = blocks self.blocks = blocks
self.tasks = tasks self.tasks = tasks
all = self.inputs + self.outputs + self.orphans all_info = self.inputs + self.outputs + self.orphans
self.c_support_code_apply = c_support_code_apply self.c_support_code_apply = c_support_code_apply
if (self.init_tasks, self.tasks) != self.get_init_tasks(): if (self.init_tasks, self.tasks) != self.get_init_tasks():
...@@ -582,7 +648,8 @@ class CLinker(link.Linker): ...@@ -582,7 +648,8 @@ class CLinker(link.Linker):
# List of indices that should be ignored when passing the arguments # List of indices that should be ignored when passing the arguments
# (basically, everything that the previous call to uniq eliminated) # (basically, everything that the previous call to uniq eliminated)
self.dupidx = [i for i, x in enumerate(all) if all.count(x) > 1 and all.index(x) != i] self.dupidx = [i for i, x in enumerate(all_info)
if all_info.count(x) > 1 and all_info.index(x) != i]
return self.struct_code return self.struct_code
def support_code(self): def support_code(self):
...@@ -595,9 +662,12 @@ class CLinker(link.Linker): ...@@ -595,9 +662,12 @@ class CLinker(link.Linker):
""" """
ret = [] ret = []
# generic support code # generic support code
for x in [y.type for y in self.variables] + [y.op for y in self.node_order]: for x in [y.type for y in self.variables] + [
try: ret.append(x.c_support_code()) y.op for y in self.node_order]:
except utils.MethodNotDefined: pass try:
ret.append(x.c_support_code())
except utils.MethodNotDefined:
pass
return ret return ret
def compile_args(self): def compile_args(self):
...@@ -608,33 +678,43 @@ class CLinker(link.Linker): ...@@ -608,33 +678,43 @@ class CLinker(link.Linker):
This might contain duplicates. This might contain duplicates.
""" """
ret = ["-O3"] ret = ["-O3"]
# this is the param the -ffast-math activate. I put the explicitly as FillMissing must disable some of them. Putting -ffast-math would make it disable all other parameter at the same time. # this is the param the -ffast-math activate. I put the explicitly as
# FillMissing must disable some of them. Putting -ffast-math would
# make it disable all other parameter at the same time.
ret += ["-fno-math-errno", ret += ["-fno-math-errno",
#"-funsafe-math-optimizations", #"-funsafe-math-optimizations",
#"-fno-signaling-nans", #"-fno-signaling-nans",
#"-fcx-limited-range", #"-fcx-limited-range",
#"-fno-rounding-math", #"-fno-rounding-math",
#"-ffinite-math-only", #"-ffinite-math-only",
"-Wno-unused-label",#the current code generate label event if they are not used. Could use gcc attribute for those label only
"-Wno-unused-variable",#idem as the precedent #the current code generate label event if they are not used.
"-Wno-write-strings",#generated by our code generator... #Could use gcc attribute for those label only
"-Wno-unused-label",
"-Wno-unused-variable", # idem as the precedent
"-Wno-write-strings", # generated by our code generator...
] ]
for x in [y.type for y in self.variables] + [y.op for y in self.node_order]: for x in [y.type for y in self.variables] + [
try: ret += x.c_compile_args() y.op for y in self.node_order]:
except utils.MethodNotDefined: pass try:
ret += x.c_compile_args()
except utils.MethodNotDefined:
pass
c_compiler = self.c_compiler() c_compiler = self.c_compiler()
ret += c_compiler.compile_args() ret += c_compiler.compile_args()
ret=list(set(ret))#to remove duplicate ret = list(set(ret)) # to remove duplicate
for x in [y.type for y in self.variables] + [y.op for y in self.node_order]: for x in [y.type for y in self.variables] + [
y.op for y in self.node_order]:
try: try:
for i in x.c_no_compile_args(): for i in x.c_no_compile_args():
try: try:
ret.remove(i) ret.remove(i)
except ValueError: except ValueError:
pass# in case the value is not there pass # in case the value is not there
except utils.MethodNotDefined: pass except utils.MethodNotDefined:
pass
return ret return ret
def headers(self): def headers(self):
...@@ -645,14 +725,18 @@ class CLinker(link.Linker): ...@@ -645,14 +725,18 @@ class CLinker(link.Linker):
The return value will not contain duplicates. The return value will not contain duplicates.
""" """
ret = [] ret = []
for x in [y.type for y in self.variables] + [y.op for y in self.node_order]: for x in [y.type for y in self.variables] + [
try: ret += x.c_headers() y.op for y in self.node_order]:
except utils.MethodNotDefined: pass try:
ret += x.c_headers()
except utils.MethodNotDefined:
pass
return list(set(ret)) return list(set(ret))
def c_compiler(self): def c_compiler(self):
c_compiler = None c_compiler = None
for x in [y.type for y in self.variables] + [y.op for y in self.node_order]: for x in [y.type for y in self.variables] + [
y.op for y in self.node_order]:
if hasattr(x, 'c_compiler'): if hasattr(x, 'c_compiler'):
x_compiler = x.c_compiler() x_compiler = x.c_compiler()
else: else:
...@@ -662,11 +746,13 @@ class CLinker(link.Linker): ...@@ -662,11 +746,13 @@ class CLinker(link.Linker):
c_compiler = x_compiler c_compiler = x_compiler
else: else:
if x_compiler and (x_compiler != c_compiler): if x_compiler and (x_compiler != c_compiler):
raise Exception('Nodes have requested specific different compilers', raise Exception('Nodes have requested specific'
' different compilers',
(c_compiler, x_compiler)) (c_compiler, x_compiler))
if (c_compiler is None): if (c_compiler is None):
return cmodule.GCC_compiler return cmodule.GCC_compiler
else: return c_compiler else:
return c_compiler
def header_dirs(self): def header_dirs(self):
"""WRITEME """WRITEME
...@@ -676,9 +762,12 @@ class CLinker(link.Linker): ...@@ -676,9 +762,12 @@ class CLinker(link.Linker):
The return value will not contain duplicates. The return value will not contain duplicates.
""" """
ret = [] ret = []
for x in [y.type for y in self.variables] + [y.op for y in self.node_order]: for x in [y.type for y in self.variables] + [
try: ret += x.c_header_dirs() y.op for y in self.node_order]:
except utils.MethodNotDefined: pass try:
ret += x.c_header_dirs()
except utils.MethodNotDefined:
pass
return list(set(ret)) return list(set(ret))
def libraries(self): def libraries(self):
...@@ -689,9 +778,12 @@ class CLinker(link.Linker): ...@@ -689,9 +778,12 @@ class CLinker(link.Linker):
The return value will not contain duplicates. The return value will not contain duplicates.
""" """
ret = [] ret = []
for x in [y.type for y in self.variables] + [y.op for y in self.node_order]: for x in [y.type for y in self.variables] + [
try: ret += x.c_libraries() y.op for y in self.node_order]:
except utils.MethodNotDefined: pass try:
ret += x.c_libraries()
except utils.MethodNotDefined:
pass
return list(set(ret)) return list(set(ret))
def lib_dirs(self): def lib_dirs(self):
...@@ -702,12 +794,16 @@ class CLinker(link.Linker): ...@@ -702,12 +794,16 @@ class CLinker(link.Linker):
The return value will not contain duplicates. The return value will not contain duplicates.
""" """
ret = [] ret = []
for x in [y.type for y in self.variables] + [y.op for y in self.node_order]: for x in [y.type for y in self.variables] + [
try: ret += x.c_lib_dirs() y.op for y in self.node_order]:
except utils.MethodNotDefined: pass try:
ret += x.c_lib_dirs()
except utils.MethodNotDefined:
pass
return list(set(ret)) return list(set(ret))
def __compile__(self, input_storage = None, output_storage = None, keep_lock=False): def __compile__(self, input_storage=None,
output_storage=None, keep_lock=False):
"""WRITEME """WRITEME
Compiles this linker's env. Compiles this linker's env.
...@@ -737,33 +833,37 @@ class CLinker(link.Linker): ...@@ -737,33 +833,37 @@ class CLinker(link.Linker):
input_storage, input_storage,
output_storage, output_storage,
keep_lock=keep_lock) keep_lock=keep_lock)
return thunk, \ return (thunk,
[link.Container(input, storage) for input, storage in izip(self.env.inputs, input_storage)], \ [link.Container(input, storage) for input, storage in
[link.Container(output, storage, True) for output, storage in izip(self.env.outputs, output_storage)], \ izip(self.env.inputs, input_storage)],
error_storage [link.Container(output, storage, True) for output, storage in
izip(self.env.outputs, output_storage)],
error_storage)
def get_init_tasks(self): def get_init_tasks(self):
init_tasks = [] init_tasks = []
tasks = [] tasks = []
id=1 id = 1
for v in self.variables: for v in self.variables:
if v in self.consts: if v in self.consts:
continue continue
if v in self.orphans and isinstance(v, graph.Constant): if v in self.orphans and isinstance(v, graph.Constant):
try: try:
v.type.c_literal(v.data) #constant will be inlined, no need to get # constant will be inlined, no need to get
v.type.c_literal(v.data)
continue continue
except (utils.MethodNotDefined, NotImplementedError): except (utils.MethodNotDefined, NotImplementedError):
pass pass
init_tasks.append((v, 'init', id)) init_tasks.append((v, 'init', id))
tasks.append((v, 'get', id+1)) tasks.append((v, 'get', id + 1))
id += 2 id += 2
for node in self.node_order: for node in self.node_order:
tasks.append((node, 'code', id)) tasks.append((node, 'code', id))
id += 1 id += 1
return init_tasks, tasks return init_tasks, tasks
def make_thunk(self, input_storage = None, output_storage = None, keep_lock=False): def make_thunk(self, input_storage=None, output_storage=None,
keep_lock=False):
"""WRITEME """WRITEME
Compiles this linker's env and returns a function to perform the Compiles this linker's env and returns a function to perform the
computations, as well as lists of storage cells for both the computations, as well as lists of storage cells for both the
...@@ -787,16 +887,18 @@ class CLinker(link.Linker): ...@@ -787,16 +887,18 @@ class CLinker(link.Linker):
first_output = ostor[0].data first_output = ostor[0].data
""" """
init_tasks, tasks = self.get_init_tasks() init_tasks, tasks = self.get_init_tasks()
cthunk, in_storage, out_storage, error_storage = self.__compile__(input_storage, output_storage, cthunk, in_storage, out_storage, error_storage = self.__compile__(
input_storage, output_storage,
keep_lock=keep_lock) keep_lock=keep_lock)
res = _CThunk(cthunk, init_tasks, tasks, error_storage), in_storage, out_storage
return res res = _CThunk(cthunk, init_tasks, tasks, error_storage)
return res, in_storage, out_storage
def cmodule_key(self): def cmodule_key(self):
"""Return a complete hashable signature of the module we compiled. """Return a complete hashable signature of the module we compiled.
This function must have the property that no two programs that compute different things This function must have the property that no two programs that
yield the same key. compute different things yield the same key.
The key returned by this function is of the form (version, signature) The key returned by this function is of the form (version, signature)
The signature has the following form: The signature has the following form:
...@@ -817,8 +919,9 @@ class CLinker(link.Linker): ...@@ -817,8 +919,9 @@ class CLinker(link.Linker):
It is followed by elements for every node in the It is followed by elements for every node in the
topological ordering of `self.env`. topological ordering of `self.env`.
If the Op of any Apply in the Env does not have c_code_cache_ok()==True, then this If the Op of any Apply in the Env does not have
function raises a KeyError exception. c_code_cache_ok()==True, then this function raises a KeyError
exception.
Input Signature Input Signature
--------------- ---------------
...@@ -828,10 +931,13 @@ class CLinker(link.Linker): ...@@ -828,10 +931,13 @@ class CLinker(link.Linker):
type of the node input, and the nature of that input in the type of the node input, and the nature of that input in the
graph. graph.
The nature of a typical variable is encoded by integer pairs ``((a,b),c)``: The nature of a typical variable is encoded by integer pairs
``a`` is the topological position of the input's owner (-1 for graph inputs), ``((a,b),c)``:
``a`` is the topological position of the input's owner
(-1 for graph inputs),
``b`` is the index of the variable in the owner's output list. ``b`` is the index of the variable in the owner's output list.
``c`` is a flag indicating whether the variable is in the no_recycling set. ``c`` is a flag indicating whether the variable is in the
no_recycling set.
If a variable is also a graph output, then its position in the If a variable is also a graph output, then its position in the
outputs list is also bundled with this tuple (after the b). outputs list is also bundled with this tuple (after the b).
...@@ -865,6 +971,7 @@ class CLinker(link.Linker): ...@@ -865,6 +971,7 @@ class CLinker(link.Linker):
libraries=self.libraries(), libraries=self.libraries(),
header_dirs=self.header_dirs(), header_dirs=self.header_dirs(),
) )
@staticmethod @staticmethod
def cmodule_key_(env, no_recycling, compile_args=[], libraries=[], def cmodule_key_(env, no_recycling, compile_args=[], libraries=[],
header_dirs=[], insert_config_md5=True): header_dirs=[], insert_config_md5=True):
...@@ -876,7 +983,8 @@ class CLinker(link.Linker): ...@@ -876,7 +983,8 @@ class CLinker(link.Linker):
#set of variables that have been computed by nodes we have #set of variables that have been computed by nodes we have
# seen 'so far' in the loop below # seen 'so far' in the loop below
env_computed_set = set() env_computed_set = set()
env_inputs_dict = dict((i, (-1, pos)) for pos, i in enumerate(env.inputs)) env_inputs_dict = dict((i, (-1, pos)) for pos, i in
enumerate(env.inputs))
constant_ids = dict() constant_ids = dict()
op_pos = {} # Apply -> topological position op_pos = {} # Apply -> topological position
...@@ -912,6 +1020,7 @@ class CLinker(link.Linker): ...@@ -912,6 +1020,7 @@ class CLinker(link.Linker):
sig.append('md5: <omitted>') sig.append('md5: <omitted>')
error_on_play = [False] error_on_play = [False]
def in_sig(i, topological_pos, i_idx): def in_sig(i, topological_pos, i_idx):
# assert that every input to every node is one of' # assert that every input to every node is one of'
# - an env input # - an env input
...@@ -920,7 +1029,7 @@ class CLinker(link.Linker): ...@@ -920,7 +1029,7 @@ class CLinker(link.Linker):
# It is important that a variable (i) # It is important that a variable (i)
# yield a 'position' that reflects its role in code_gen() # yield a 'position' that reflects its role in code_gen()
if isinstance(i, graph.Constant): #orphans if isinstance(i, graph.Constant): # orphans
if id(i) not in constant_ids: if id(i) not in constant_ids:
isig = (i.signature(), topological_pos, i_idx) isig = (i.signature(), topological_pos, i_idx)
# If the Theano constant provides a strong hash # If the Theano constant provides a strong hash
...@@ -933,7 +1042,8 @@ class CLinker(link.Linker): ...@@ -933,7 +1042,8 @@ class CLinker(link.Linker):
isig = (isig[0].theano_hash(), topological_pos, i_idx) isig = (isig[0].theano_hash(), topological_pos, i_idx)
try: try:
hash(isig) hash(isig)
except Exception: #generic constants don't have a hashable signature except Exception:
#generic constants don't have a hashable signature
error_on_play[0] = True error_on_play[0] = True
return None return None
constant_ids[id(i)] = isig constant_ids[id(i)] = isig
...@@ -941,13 +1051,15 @@ class CLinker(link.Linker): ...@@ -941,13 +1051,15 @@ class CLinker(link.Linker):
isig = constant_ids[id(i)] isig = constant_ids[id(i)]
#print 'SIGNATURE', i.signature() #print 'SIGNATURE', i.signature()
#return i.signature() #return i.signature()
elif i in env_inputs_dict: #inputs elif i in env_inputs_dict: # inputs
isig = env_inputs_dict[i] isig = env_inputs_dict[i]
else: else:
if i.owner is None: if i.owner is None:
assert all( all(out is not None for out in o.outputs) for o in order) assert all(all(out is not None for out in o.outputs)
assert all( input.owner is None for input in env.inputs) for o in order)
raise Exception('what is this?', (i, type(i), i.clients, env)) assert all(input.owner is None for input in env.inputs)
raise Exception('what is this?', (i, type(i), i.clients,
env))
if i in env.outputs: if i in env.outputs:
isig = (op_pos[i.owner], # outputs isig = (op_pos[i.owner], # outputs
...@@ -973,7 +1085,7 @@ class CLinker(link.Linker): ...@@ -973,7 +1085,7 @@ class CLinker(link.Linker):
sig.append(( sig.append((
node.op, node.op,
tuple((i.type, in_sig(i, node_pos, ipos)) tuple((i.type, in_sig(i, node_pos, ipos))
for ipos,i in enumerate(node.inputs)), for ipos, i in enumerate(node.inputs)),
tuple(o in no_recycling for o in node.outputs))) tuple(o in no_recycling for o in node.outputs)))
if error_on_play[0]: if error_on_play[0]:
...@@ -1026,7 +1138,9 @@ class CLinker(link.Linker): ...@@ -1026,7 +1138,9 @@ class CLinker(link.Linker):
if compiler_name == 'NVCC_compiler' and config.lib.amdlibm: if compiler_name == 'NVCC_compiler' and config.lib.amdlibm:
# This lib does not work correctly with nvcc in device code. # This lib does not work correctly with nvcc in device code.
# and newer version of g++ as 4.5.1. # and newer version of g++ as 4.5.1.
# example of errors: "/usr/lib/gcc/x86_64-redhat-linux/4.5.1/include/mmintrin.h(49): error: identifier "__builtin_ia32_emms" is undefined" # example of errors: "/usr/lib/gcc/x86_64-redhat-linux/4.5.1/
# include/mmintrin.h(49): error: identifier
# "__builtin_ia32_emms" is undefined"
if '<amdlibm.h>' in mod.includes: if '<amdlibm.h>' in mod.includes:
mod.includes.remove('<amdlibm.h>') mod.includes.remove('<amdlibm.h>')
...@@ -1057,7 +1171,8 @@ class CLinker(link.Linker): ...@@ -1057,7 +1171,8 @@ class CLinker(link.Linker):
yield module yield module
def build_dynamic_module(self): def build_dynamic_module(self):
"""Return a cmodule.DynamicModule instance full of the code for our env. """Return a cmodule.DynamicModule instance full of the code
for our env.
""" """
self.code_gen() self.code_gen()
module_name = self.hash module_name = self.hash
...@@ -1065,13 +1180,16 @@ class CLinker(link.Linker): ...@@ -1065,13 +1180,16 @@ class CLinker(link.Linker):
mod = cmodule.DynamicModule(module_name) mod = cmodule.DynamicModule(module_name)
# The code of instantiate # The code of instantiate
code = self.instantiate_code(1+len(self.args)) #the 1 is for error_storage # the 1 is for error_storage
instantiate = cmodule.ExtFunction('instantiate', code, method=cmodule.METH_VARARGS) code = self.instantiate_code(1 + len(self.args))
instantiate = cmodule.ExtFunction('instantiate', code,
method=cmodule.METH_VARARGS)
#['error_storage'] + argnames, #['error_storage'] + argnames,
#local_dict = d, #local_dict = d,
#global_dict = {}) #global_dict = {})
# Static methods that can run and destroy the struct built by instantiate. # Static methods that can run and destroy the struct built by
# instantiate.
static = """ static = """
int %(struct_name)s_executor(%(struct_name)s* self) { int %(struct_name)s_executor(%(struct_name)s* self) {
return self->run(); return self->run();
...@@ -1086,7 +1204,7 @@ class CLinker(link.Linker): ...@@ -1086,7 +1204,7 @@ class CLinker(link.Linker):
//printf("done cleanup\\n"); //printf("done cleanup\\n");
//fflush(stdout); //fflush(stdout);
} }
""" % dict(struct_name = self.struct_name) """ % dict(struct_name=self.struct_name)
# We add all the support code, compile args, headers and libs we need. # We add all the support code, compile args, headers and libs we need.
for support_code in self.support_code() + self.c_support_code_apply: for support_code in self.support_code() + self.c_support_code_apply:
...@@ -1099,8 +1217,8 @@ class CLinker(link.Linker): ...@@ -1099,8 +1217,8 @@ class CLinker(link.Linker):
return mod return mod
def cthunk_factory(self, error_storage, in_storage, out_storage,
def cthunk_factory(self, error_storage, in_storage, out_storage, keep_lock=False): keep_lock=False):
"""WRITEME """WRITEME
error_storage -> list of length 3 error_storage -> list of length 3
in_storage -> list of lists of length 1, one per input in_storage -> list of lists of length 1, one per input
...@@ -1120,18 +1238,22 @@ class CLinker(link.Linker): ...@@ -1120,18 +1238,22 @@ class CLinker(link.Linker):
# If we can't get a key, then forget the cache mechanism. # If we can't get a key, then forget the cache mechanism.
module = self.compile_cmodule() module = self.compile_cmodule()
else: else:
module = get_module_cache().module_from_key(key=key, fn=self.compile_cmodule_by_step, keep_lock=keep_lock) module = get_module_cache().module_from_key(
key=key, fn=self.compile_cmodule_by_step, keep_lock=keep_lock)
vars = self.inputs + self.outputs + self.orphans vars = self.inputs + self.outputs + self.orphans
# List of indices that should be ignored when passing the arguments # List of indices that should be ignored when passing the arguments
# (basically, everything that the previous call to uniq eliminated) # (basically, everything that the previous call to uniq eliminated)
dupidx = [i for i, x in enumerate(vars) if vars.count(x) > 1 and vars.index(x) != i] dupidx = [i for i, x in enumerate(vars)
if vars.count(x) > 1 and vars.index(x) != i]
out_storage = [x for i, x in enumerate(out_storage) if (i+len(in_storage)) not in dupidx] out_storage = [x for i, x in enumerate(out_storage)
if (i + len(in_storage)) not in dupidx]
in_storage = [x for i, x in enumerate(in_storage) if i not in dupidx] in_storage = [x for i, x in enumerate(in_storage) if i not in dupidx]
orphd = [[orphan.data] for orphan in self.orphans] orphd = [[orphan.data] for orphan in self.orphans]
ret = module.instantiate(error_storage, *(in_storage + out_storage + orphd)) ret = module.instantiate(error_storage, *(in_storage + out_storage +
orphd))
return ret return ret
...@@ -1150,6 +1272,7 @@ class CLinker(link.Linker): ...@@ -1150,6 +1272,7 @@ class CLinker(link.Linker):
print >> code, " return thunk; }" print >> code, " return thunk; }"
return code.getvalue() return code.getvalue()
class _CThunk(object): class _CThunk(object):
""" """
A thunk with a C implementation A thunk with a C implementation
...@@ -1181,7 +1304,8 @@ class _CThunk(object): ...@@ -1181,7 +1304,8 @@ class _CThunk(object):
n = len(self.init_tasks) n = len(self.init_tasks)
# note that the failure code is distributed in two lists # note that the failure code is distributed in two lists
if failure_code < 2 * n: if failure_code < 2 * n:
return [self.init_tasks, self.tasks][failure_code % 2][failure_code/2] return [self.init_tasks, self.tasks][
failure_code % 2][failure_code / 2]
else: else:
return self.tasks[failure_code - n] return self.tasks[failure_code - n]
...@@ -1199,19 +1323,16 @@ class _CThunk(object): ...@@ -1199,19 +1323,16 @@ class _CThunk(object):
exc_value = exc_type(_exc_value, task, task.outputs) exc_value = exc_type(_exc_value, task, task.outputs)
else: else:
exc_value = exc_type(_exc_value, task) exc_value = exc_type(_exc_value, task)
exc_value.__thunk_trace__ = trace # this can be used to retrieve the location the Op was declared # this can be used to retrieve the location the Op was declared
exc_value.__thunk_trace__ = trace
except Exception: except Exception:
print >> sys.stderr, 'ERROR retrieving error_storage', self.error_storage print >> sys.stderr, 'ERROR retrieving error_storage',
print >> sys.stderr, self.error_storage
raise raise
raise exc_type, exc_value, exc_trace raise exc_type, exc_value, exc_trace
class OpWiseCLinker(link.LocalLinker): class OpWiseCLinker(link.LocalLinker):
"""WRITEME """WRITEME
Uses CLinker on the individual Ops that comprise an env and loops Uses CLinker on the individual Ops that comprise an env and loops
...@@ -1227,27 +1348,30 @@ class OpWiseCLinker(link.LocalLinker): ...@@ -1227,27 +1348,30 @@ class OpWiseCLinker(link.LocalLinker):
If a Variable is in no_recycling, CLinker will clear the output storage If a Variable is in no_recycling, CLinker will clear the output storage
associated to it prior to computation (to avoid reusing it). associated to it prior to computation (to avoid reusing it).
:note: This is in a sense the 'default' linker for Theano. The overhead of using the :note: This is in a sense the 'default' linker for Theano. The
OpWiseCLinker as compared with the CLinker is only noticeable for graphs of very small overhead of using the OpWiseCLinker as compared with the CLinker
tensors (such as 20 elements or less) is only noticeable for graphs of very small tensors (such as 20
elements or less)
""" """
__cache__ = {} __cache__ = {}
def __init__(self, def __init__(self,
fallback_on_perform = True, fallback_on_perform=True,
allow_gc = True, allow_gc=True,
nice_errors = True): nice_errors=True):
self.env = None self.env = None
self.fallback_on_perform = fallback_on_perform self.fallback_on_perform = fallback_on_perform
self.nice_errors = nice_errors self.nice_errors = nice_errors
self.allow_gc = allow_gc self.allow_gc = allow_gc
def accept(self, env, no_recycling = []): def accept(self, env, no_recycling=[]):
if self.env is not None and self.env is not env: if self.env is not None and self.env is not env:
return type(self)(self.fallback_on_perform).accept(env, no_recycling) return type(self)(self.fallback_on_perform).accept(env,
#raise Exception("Cannot accept from a Linker that is already tied to another Env.") no_recycling)
#raise Exception("Cannot accept from a Linker that is
#already tied to another Env.")
self.env = env self.env = env
self.no_recycling = no_recycling self.no_recycling = no_recycling
return self return self
......
...@@ -5,14 +5,12 @@ import logging ...@@ -5,14 +5,12 @@ import logging
import sys import sys
import time import time
import link import link
import traceback
from theano.gof.python25 import all from theano.gof.python25 import all
import theano import theano
config = theano.config config = theano.config
from theano.configparser import config, AddConfigVar, BoolParam from theano.configparser import config, AddConfigVar, BoolParam
from theano import config
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -33,13 +31,13 @@ class VM(object): ...@@ -33,13 +31,13 @@ class VM(object):
number of times thunks[i] was called in the course of computations number of times thunks[i] was called in the course of computations
performed by call_with_timers(). performed by call_with_timers().
call_times - list of floats, one for each thunk. call_times[i] is the amount call_times - list of floats, one for each thunk. call_times[i] is
of runtime spent on thunks[i] in the course of computations performed by the amount of runtime spent on thunks[i] in the course of
call_with_timers(). computations performed by call_with_timers().
need_update_inputs - bool. True indicates that Function.__call__ must need_update_inputs - bool. True indicates that Function.__call__
implement the feedback from output storage to input storage. False means must implement the feedback from output storage to input
it *must not* repeat that feedback. storage. False means it *must not* repeat that feedback.
""" """
def __init__(self, nodes, thunks, pre_call_clear): def __init__(self, nodes, thunks, pre_call_clear):
...@@ -58,8 +56,8 @@ class VM(object): ...@@ -58,8 +56,8 @@ class VM(object):
self.nodes = nodes self.nodes = nodes
self.thunks = thunks self.thunks = thunks
self.pre_call_clear = pre_call_clear self.pre_call_clear = pre_call_clear
self.call_counts = [0]*len(nodes) self.call_counts = [0] * len(nodes)
self.call_times = [0]*len(nodes) self.call_times = [0] * len(nodes)
self.time_thunks = False self.time_thunks = False
# This variable (self.need_update_inputs) is overshadowed by # This variable (self.need_update_inputs) is overshadowed by
...@@ -88,14 +86,15 @@ class VM(object): ...@@ -88,14 +86,15 @@ class VM(object):
def update_profile(self, profile): def update_profile(self, profile):
# accumulate into the profile object # accumulate into the profile object
for node, thunk, t, c in zip(self.nodes, self.thunks, self.call_times, self.call_counts): for node, thunk, t, c in zip(self.nodes, self.thunks,
profile.apply_time.setdefault(node,0.0) self.call_times, self.call_counts):
profile.apply_time.setdefault(node, 0.0)
profile.apply_time[node] += t profile.apply_time[node] += t
profile.apply_callcount.setdefault(node,0) profile.apply_callcount.setdefault(node, 0)
profile.apply_callcount[node] += c profile.apply_callcount[node] += c
profile.apply_cimpl[node] = hasattr(thunk,'cthunk') profile.apply_cimpl[node] = hasattr(thunk, 'cthunk')
# clear the timer info out of the buffers # clear the timer info out of the buffers
for i in xrange(len(self.call_times)): for i in xrange(len(self.call_times)):
...@@ -113,7 +112,8 @@ class Loop(VM): ...@@ -113,7 +112,8 @@ class Loop(VM):
for cont in self.pre_call_clear: for cont in self.pre_call_clear:
cont[0] = None cont[0] = None
try: try:
for i, (thunk, node) in enumerate(zip(self.thunks, self.nodes)): for i, (thunk, node) in enumerate(zip(self.thunks,
self.nodes)):
t0 = time.time() t0 = time.time()
thunk() thunk()
t1 = time.time() t1 = time.time()
...@@ -141,13 +141,16 @@ class LoopGC(VM): ...@@ -141,13 +141,16 @@ class LoopGC(VM):
self.post_thunk_clear = post_thunk_clear self.post_thunk_clear = post_thunk_clear
if not (len(nodes) == len(thunks) == len(post_thunk_clear)): if not (len(nodes) == len(thunks) == len(post_thunk_clear)):
raise ValueError() raise ValueError()
def __call__(self): def __call__(self):
if self.time_thunks: if self.time_thunks:
for cont in self.pre_call_clear: for cont in self.pre_call_clear:
cont[0] = None cont[0] = None
try: try:
i = 0 i = 0
for thunk, node, old_storage in zip(self.thunks, self.nodes, self.post_thunk_clear): for thunk, node, old_storage in zip(self.thunks,
self.nodes,
self.post_thunk_clear):
t0 = time.time() t0 = time.time()
thunk() thunk()
t1 = time.time() t1 = time.time()
...@@ -162,7 +165,8 @@ class LoopGC(VM): ...@@ -162,7 +165,8 @@ class LoopGC(VM):
for cont in self.pre_call_clear: for cont in self.pre_call_clear:
cont[0] = None cont[0] = None
try: try:
for thunk, node, old_storage in zip(self.thunks, self.nodes, self.post_thunk_clear): for thunk, node, old_storage in zip(self.thunks, self.nodes,
self.post_thunk_clear):
thunk() thunk()
for old_s in old_storage: for old_s in old_storage:
old_s[0] = None old_s[0] = None
...@@ -217,9 +221,9 @@ class Stack(VM): ...@@ -217,9 +221,9 @@ class Stack(VM):
if cl[0] is not 'output': if cl[0] is not 'output':
ls += cl[0].outputs ls += cl[0].outputs
dependencies[k] += ls dependencies[k] += ls
if config.profile: if config.profile:
self.memory_size_map = {"nt8": 1, "t16": 2, "t32": 4, "t64": 8, "128": 16} self.memory_size_map = {"nt8": 1, "t16": 2, "t32": 4,
"t64": 8, "128": 16}
atexit.register(self.atexit_print_all) atexit.register(self.atexit_print_all)
def run_thunk_of_node(self, node): def run_thunk_of_node(self, node):
...@@ -257,11 +261,13 @@ class Stack(VM): ...@@ -257,11 +261,13 @@ class Stack(VM):
last_apply_stack_len = -1 last_apply_stack_len = -1
ls = [] ls = []
while apply_stack: while apply_stack:
# Make sure something happened last time round. # Make sure something happened last time round. This is
# This is just a safety check to make sure the op is written correctly # just a safety check to make sure the op is written
# apply_stack should either decrease in length by one (a thunk successfully applied), or # correctly apply_stack should either decrease in length
# increase in length (added dependencies over and above the original). # by one (a thunk successfully applied), or increase in
# NB: this doesn't catch cycles (would be too expensive/slow), just stalls. # length (added dependencies over and above the original).
# NB: this doesn't catch cycles (would be too expensive/slow),
# just stalls.
apply_stack_len = len(apply_stack) apply_stack_len = len(apply_stack)
assert apply_stack_len != last_apply_stack_len assert apply_stack_len != last_apply_stack_len
last_apply_stack_len = apply_stack_len last_apply_stack_len = apply_stack_len
...@@ -289,8 +295,8 @@ class Stack(VM): ...@@ -289,8 +295,8 @@ class Stack(VM):
if not thunks[self.node_idx[current_apply]].lazy: if not thunks[self.node_idx[current_apply]].lazy:
# Check if all inputs are in place # Check if all inputs are in place
# If so compute thunk and remove it from the apply_stack # If so compute thunk and remove it from the apply_stack
# If not leave it in, and add to the apply_stack those that will # If not leave it in, and add to the apply_stack those
# produce you those inputs # that will produce you those inputs
if computed_ins and not computed_outs: if computed_ins and not computed_outs:
try: try:
...@@ -302,22 +308,26 @@ class Stack(VM): ...@@ -302,22 +308,26 @@ class Stack(VM):
# ?? What about inplace .. if the op is inplace # ?? What about inplace .. if the op is inplace
# you don't actually ask for more memory! # you don't actually ask for more memory!
size = [] size = []
for (idx,o) in enumerate( for (idx, o) in enumerate(
thunks[self.node_idx[current_apply]].outputs): thunks[self.node_idx[
if not hasattr(o[0],'size'): current_apply]].outputs):
if not hasattr(o[0], 'size'):
size.append(-1) size.append(-1)
continue continue
s=o[0].size s = o[0].size
dtype = str(o[0].dtype) dtype = str(o[0].dtype)
dtype2 = dtype[-3:] dtype2 = dtype[-3:]
s *= self.memory_size_map[dtype2] # KeyError here: couldn't determine the dtype memory size # KeyError here: couldn't determine
# the dtype memory size
s *= self.memory_size_map[dtype2]
size.append(s) size.append(s)
self.outputs_size[current_apply] = size self.outputs_size[current_apply] = size
except Exception: except Exception:
raise_with_op(current_apply) raise_with_op(current_apply)
for o in current_apply.outputs: for o in current_apply.outputs:
compute_map[o][0] = 1 compute_map[o][0] = 1
# Garbage Collection -> check if anybody else uses this input # Garbage Collection -> check if anybody else uses
# this input
if self.allow_gc: if self.allow_gc:
for i in current_apply.inputs: for i in current_apply.inputs:
if (dependencies[i] and i.owner if (dependencies[i] and i.owner
...@@ -332,8 +342,11 @@ class Stack(VM): ...@@ -332,8 +342,11 @@ class Stack(VM):
elif not computed_ins: elif not computed_ins:
apply_stack.append(current_apply) apply_stack.append(current_apply)
apply_stack.extend(inp.owner for inp in current_apply.inputs if inp.owner) apply_stack.extend(inp.owner for inp
apply_stack.extend(inp.owner for inp in current_apply.destroy_dependencies if inp.owner) in current_apply.inputs if inp.owner)
apply_stack.extend(inp.owner for inp
in current_apply.destroy_dependencies
if inp.owner)
elif not computed_outs: elif not computed_outs:
# Try and run it to see if it works # Try and run it to see if it works
...@@ -346,22 +359,26 @@ class Stack(VM): ...@@ -346,22 +359,26 @@ class Stack(VM):
if requires: if requires:
for r in requires: for r in requires:
# We are not done with this op .. # We are not done with this op .. so we added
# so we added back and see to get the inputs we are missing # back and see to get the inputs we are
# missing
apply_stack.append(current_apply) apply_stack.append(current_apply)
if current_apply.inputs[r].owner: if current_apply.inputs[r].owner:
apply_stack.append(current_apply.inputs[r].owner) apply_stack.append(current_apply.inputs[r].owner)
else: else:
if config.profile: if config.profile:
size = [] size = []
for (idx,o) in enumerate(thunks[self.node_idx[current_apply]].outputs): for (idx, o) in enumerate(thunks[
self.node_idx[current_apply]].outputs):
if not hasattr(o[0], 'size'): if not hasattr(o[0], 'size'):
size.append(-1) size.append(-1)
continue continue
s=o[0].size s=o[0].size
dtype = str(o[0].dtype) dtype = str(o[0].dtype)
dtype2 = dtype[-2:] dtype2 = dtype[-2:]
s *= self.memory_size_map[dtype2] # KeyError here: couldn't determine the dtype memory size # KeyError here: couldn't determine the
# dtype memory size
s *= self.memory_size_map[dtype2]
size.append(s) size.append(s)
self.outputs_size[current_apply] = size self.outputs_size[current_apply] = size
if self.allow_gc: if self.allow_gc:
...@@ -379,6 +396,7 @@ class Stack(VM): ...@@ -379,6 +396,7 @@ class Stack(VM):
try: try:
import lazylinker_c import lazylinker_c
class CVM(lazylinker_c.CLazyLinker, VM): class CVM(lazylinker_c.CLazyLinker, VM):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
lazylinker_c.CLazyLinker.__init__(self, *args, **kwargs) lazylinker_c.CLazyLinker.__init__(self, *args, **kwargs)
...@@ -394,9 +412,9 @@ class VM_Linker(link.LocalLinker): ...@@ -394,9 +412,9 @@ class VM_Linker(link.LocalLinker):
def __init__(self, allow_gc=True, use_cloop=False, callback=None): def __init__(self, allow_gc=True, use_cloop=False, callback=None):
""" """
allow_gc - force the virtual machine to clean up unnecessary references, allow_gc - force the virtual machine to clean up unnecessary
in order to allow garbage collection on intermediate values during references, in order to allow garbage collection on
computation of a function. intermediate values during computation of a function.
use_cloop - use the C-based virtual machine if possible use_cloop - use the C-based virtual machine if possible
...@@ -411,9 +429,10 @@ class VM_Linker(link.LocalLinker): ...@@ -411,9 +429,10 @@ class VM_Linker(link.LocalLinker):
self.callback = callback self.callback = callback
self.updated_vars = {} self.updated_vars = {}
def accept(self, env, no_recycling = []): def accept(self, env, no_recycling=[]):
""" """
:param env: a PerformLinker can have accepted one Env instance at a time. :param env: a PerformLinker can have accepted one Env instance
at a time.
:param no_recycling: WRITEME :param no_recycling: WRITEME
...@@ -464,9 +483,9 @@ class VM_Linker(link.LocalLinker): ...@@ -464,9 +483,9 @@ class VM_Linker(link.LocalLinker):
nodes_idx_inv = {} nodes_idx_inv = {}
vars_idx_inv = {} vars_idx_inv = {}
for (node,i) in nodes_idx.items(): for (node, i) in nodes_idx.items():
nodes_idx_inv[i] = node nodes_idx_inv[i] = node
for (var,i) in vars_idx.items(): for (var, i) in vars_idx.items():
vars_idx_inv[i] = var vars_idx_inv[i] = var
# put storage_map and compute_map into a int-based scheme # put storage_map and compute_map into a int-based scheme
...@@ -496,8 +515,8 @@ class VM_Linker(link.LocalLinker): ...@@ -496,8 +515,8 @@ class VM_Linker(link.LocalLinker):
base_input_output_list.extend(outputs_idx) base_input_output_list.extend(outputs_idx)
# build the var owner array # build the var owner array
var_owner = [None]*len(vars_idx) var_owner = [None] * len(vars_idx)
for (var,i) in vars_idx.items(): for (var, i) in vars_idx.items():
if var.owner: if var.owner:
var_owner[i] = nodes_idx[var.owner] var_owner[i] = nodes_idx[var.owner]
...@@ -511,7 +530,7 @@ class VM_Linker(link.LocalLinker): ...@@ -511,7 +530,7 @@ class VM_Linker(link.LocalLinker):
for i, node in enumerate(nodes): for i, node in enumerate(nodes):
node_output_size.append(0) node_output_size.append(0)
prereq_var_idxs = [] prereq_var_idxs = []
for prereq_node in ords.get(node,[]): for prereq_node in ords.get(node, []):
prereq_var_idxs.extend( prereq_var_idxs.extend(
[vars_idx[v] for v in prereq_node.outputs]) [vars_idx[v] for v in prereq_node.outputs])
prereq_var_idxs = list(set(prereq_var_idxs)) prereq_var_idxs = list(set(prereq_var_idxs))
...@@ -521,8 +540,8 @@ class VM_Linker(link.LocalLinker): ...@@ -521,8 +540,8 @@ class VM_Linker(link.LocalLinker):
update_storage = [] update_storage = []
for (ivar, ovar) in updated_vars.items(): for (ivar, ovar) in updated_vars.items():
if ivar != ovar: if ivar != ovar:
update_storage.append(vars_idx[ivar]) #dst update_storage.append(vars_idx[ivar]) # dst
update_storage.append(vars_idx[ovar]) #src update_storage.append(vars_idx[ovar]) # src
c0 = sys.getrefcount(node_n_inputs) c0 = sys.getrefcount(node_n_inputs)
vm = CVM( vm = CVM(
...@@ -530,8 +549,8 @@ class VM_Linker(link.LocalLinker): ...@@ -530,8 +549,8 @@ class VM_Linker(link.LocalLinker):
thunks, thunks,
pre_call_clear, pre_call_clear,
allow_gc=self.allow_gc, allow_gc=self.allow_gc,
call_counts=[0]*len(nodes), call_counts=[0] * len(nodes),
call_times=[0.0]*len(nodes), call_times=[0.0] * len(nodes),
compute_map_list=compute_map_list, compute_map_list=compute_map_list,
storage_map_list=storage_map_list, storage_map_list=storage_map_list,
base_input_output_list=base_input_output_list, base_input_output_list=base_input_output_list,
...@@ -569,7 +588,7 @@ class VM_Linker(link.LocalLinker): ...@@ -569,7 +588,7 @@ class VM_Linker(link.LocalLinker):
) )
return vm return vm
def make_all(self, profiler = None, input_storage = None, def make_all(self, profiler=None, input_storage=None,
output_storage = None, output_storage = None,
): ):
env = self.env env = self.env
...@@ -617,4 +636,3 @@ class VM_Linker(link.LocalLinker): ...@@ -617,4 +636,3 @@ class VM_Linker(link.LocalLinker):
for output, storage in zip(env.outputs, output_storage)], for output, storage in zip(env.outputs, output_storage)],
thunks, thunks,
order) order)
...@@ -346,7 +346,9 @@ def handle_shared_float32(tf): ...@@ -346,7 +346,9 @@ def handle_shared_float32(tf):
theano.compile.shared_constructor(float32_shared_constructor) theano.compile.shared_constructor(float32_shared_constructor)
else: else:
raise NotImplementedError('removing our handler') theano.compile.shared_constructor(float32_shared_constructor, True)
assert (float32_shared_constructor not in
theano.compile.shared.constructors)
# We can't test the driver during import here as this cause circular # We can't test the driver during import here as this cause circular
# import dependency. So we also test it in the file theano/__init__.py # import dependency. So we also test it in the file theano/__init__.py
......
from theano.sparse.basic import * # To facilitate later merge into sparse module from theano.sparse.basic import * # To facilitate later merge into sparse module
from theano.sparse.basic import _is_sparse, _is_sparse_variable, \ from theano.sparse.basic import (
_is_dense_variable, _is_sparse, _is_dense, _kmap_eq, _kmap_hash _is_sparse, _is_sparse_variable, _is_dense_variable,
_is_sparse, _is_dense, _kmap_eq, _kmap_hash)
class Cast(gof.op.Op): class Cast(gof.op.Op):
def __init__(self, out_type): def __init__(self, out_type):
self.out_type = out_type self.out_type = out_type
def __eq__(self, other): def __eq__(self, other):
return (type(self) == type(other)) and self.out_type == other.out_type return (type(self) == type(other)) and self.out_type == other.out_type
def __hash__(self): def __hash__(self):
return hash(type(self)) ^ hash(self.out_type) return hash(type(self)) ^ hash(self.out_type)
def make_node(self, x): def make_node(self, x):
x = as_sparse_variable(x) x = as_sparse_variable(x)
return gof.Apply(self, [x], return gof.Apply(self, [x],
[SparseType(dtype=self.out_type, format=x.format).make_variable()]) [SparseType(dtype=self.out_type, format=x.format).make_variable()])
def perform(self, node, (x, ), (out, )): def perform(self, node, (x, ), (out, )):
assert _is_sparse(x) assert _is_sparse(x)
out[0] = x out[0] = x
...@@ -20,31 +26,40 @@ class Cast(gof.op.Op): ...@@ -20,31 +26,40 @@ class Cast(gof.op.Op):
fcast = Cast('float32') fcast = Cast('float32')
dcast = Cast('float64') dcast = Cast('float64')
class Poisson(gof.op.Op): class Poisson(gof.op.Op):
def __eq__(self, other): def __eq__(self, other):
return (type(self) == type(other)) return (type(self) == type(other))
def __hash__(self): def __hash__(self):
return hash(type(self)) return hash(type(self))
def make_node(self, x): def make_node(self, x):
x = as_sparse_variable(x) x = as_sparse_variable(x)
return gof.Apply(self, [x], [x.type()]) return gof.Apply(self, [x], [x.type()])
def perform(self, node, (x, ), (out, )): def perform(self, node, (x, ), (out, )):
assert _is_sparse(x) assert _is_sparse(x)
out[0] = x.copy() out[0] = x.copy()
out[0].data = numpy.asarray(numpy.random.poisson(out[0].data), dtype=x.dtype) out[0].data = numpy.asarray(numpy.random.poisson(out[0].data),
dtype=x.dtype)
out[0].eliminate_zeros() out[0].eliminate_zeros()
poisson = Poisson() poisson = Poisson()
class Multinomial(gof.op.Op): class Multinomial(gof.op.Op):
def __eq__(self, other): def __eq__(self, other):
return (type(self) == type(other)) return (type(self) == type(other))
def __hash__(self): def __hash__(self):
return hash(type(self)) return hash(type(self))
def make_node(self, n, p): def make_node(self, n, p):
n = tensor.as_tensor_variable(n) n = tensor.as_tensor_variable(n)
p = as_sparse_variable(p) p = as_sparse_variable(p)
return gof.Apply(self, [n, p], [p.type()]) return gof.Apply(self, [n, p], [p.type()])
def perform(self, node, (n, p), (out, )): def perform(self, node, (n, p), (out, )):
assert _is_sparse(p) assert _is_sparse(p)
...@@ -53,54 +68,68 @@ class Multinomial(gof.op.Op): ...@@ -53,54 +68,68 @@ class Multinomial(gof.op.Op):
out[0] = p.copy() out[0] = p.copy()
for i in xrange(p.shape[0]): for i in xrange(p.shape[0]):
k, l = p.indptr[i], p.indptr[i+1] k, l = p.indptr[i], p.indptr[i + 1]
out[0].data[k:l] = numpy.random.multinomial(n[i], p.data[k:l]) out[0].data[k:l] = numpy.random.multinomial(n[i], p.data[k:l])
multinomial = Multinomial() multinomial = Multinomial()
class EliminateZeros(gof.op.Op): class EliminateZeros(gof.op.Op):
def __eq__(self, other): def __eq__(self, other):
return (type(self) == type(other)) return (type(self) == type(other))
def __hash__(self): def __hash__(self):
return hash(type(self)) return hash(type(self))
def make_node(self, x): def make_node(self, x):
x = as_sparse_variable(x) x = as_sparse_variable(x)
return gof.Apply(self, [x], [x.type()]) return gof.Apply(self, [x], [x.type()])
def perform(self, node, (x, ), (out, )): def perform(self, node, (x, ), (out, )):
assert _is_sparse(x) assert _is_sparse(x)
out[0] = x.copy() out[0] = x.copy()
out[0].eliminate_zeros() out[0].eliminate_zeros()
eliminate_zeros = EliminateZeros() eliminate_zeros = EliminateZeros()
class Sum(gof.op.Op): class Sum(gof.op.Op):
def __eq__(self, other): def __eq__(self, other):
return (type(self) == type(other)) return (type(self) == type(other))
def __hash__(self): def __hash__(self):
return hash(type(self)) return hash(type(self))
def make_node(self, x, a): def make_node(self, x, a):
x = as_sparse_variable(x) x = as_sparse_variable(x)
a = tensor.as_tensor_variable(a) a = tensor.as_tensor_variable(a)
return gof.Apply(self, [x, a], [tensor.TensorType(dtype = x.type.dtype, return gof.Apply(self, [x, a], [tensor.TensorType(dtype=x.type.dtype,
broadcastable = (False,)).make_variable()]) broadcastable=(False,)).make_variable()])
def perform(self, node, (x, a), (out, )): def perform(self, node, (x, a), (out, )):
assert _is_sparse(x) assert _is_sparse(x)
out[0] = numpy.asarray(x.sum(a), dtype=x.dtype).flatten() out[0] = numpy.asarray(x.sum(a), dtype=x.dtype).flatten()
sum = Sum() sum = Sum()
class Binomial(gof.op.Op): class Binomial(gof.op.Op):
def __init__(self, format, dtype): def __init__(self, format, dtype):
self.format = format self.format = format
self.dtype = dtype self.dtype = dtype
def __eq__(self, other): def __eq__(self, other):
return (type(self) == type(other)) and self.format == other.format and \ return ((type(self) == type(other)) and
self.dtype == other.dtype self.format == other.format and
self.dtype == other.dtype)
def __hash__(self): def __hash__(self):
return hash(type(self)) ^ hash(self.format) ^ hash(self.dtype) return hash(type(self)) ^ hash(self.format) ^ hash(self.dtype)
def make_node(self, n, p, shape): def make_node(self, n, p, shape):
n = tensor.as_tensor_variable(n) n = tensor.as_tensor_variable(n)
p = tensor.as_tensor_variable(p) p = tensor.as_tensor_variable(p)
shape = tensor.as_tensor_variable(shape) shape = tensor.as_tensor_variable(shape)
return gof.Apply(self, [n, p, shape], [SparseType(dtype = self.dtype, return gof.Apply(self, [n, p, shape], [SparseType(dtype=self.dtype,
format = self.format).make_variable()]) format=self.format).make_variable()])
def perform(self, node, (n, p, shape, ), (out, )): def perform(self, node, (n, p, shape, ), (out, )):
N = n * p * shape[0] * shape[1] N = n * p * shape[0] * shape[1]
data = numpy.ones(N, dtype=self.dtype) data = numpy.ones(N, dtype=self.dtype)
...@@ -116,6 +145,7 @@ csc_fbinomial = Binomial('csc', 'float32') ...@@ -116,6 +145,7 @@ csc_fbinomial = Binomial('csc', 'float32')
csr_dbinomial = Binomial('csr', 'float64') csr_dbinomial = Binomial('csr', 'float64')
csc_dbinomial = Binomial('csc', 'float64') csc_dbinomial = Binomial('csc', 'float64')
def structured_sigmoid(x): def structured_sigmoid(x):
""" """
Element-wise sigmoid function only to the non-zero elements. Element-wise sigmoid function only to the non-zero elements.
...@@ -179,8 +209,10 @@ class StructuredAddSV(gof.op.Op): ...@@ -179,8 +209,10 @@ class StructuredAddSV(gof.op.Op):
matrix.''' matrix.'''
def __eq__(self, other): def __eq__(self, other):
return (type(self) == type(other)) return (type(self) == type(other))
def __hash__(self): def __hash__(self):
return hash(type(self)) return hash(type(self))
def make_node(self, x, y): def make_node(self, x, y):
x = as_sparse_variable(x) x = as_sparse_variable(x)
y = tensor.as_tensor_variable(y) y = tensor.as_tensor_variable(y)
...@@ -191,12 +223,14 @@ class StructuredAddSV(gof.op.Op): ...@@ -191,12 +223,14 @@ class StructuredAddSV(gof.op.Op):
raise NotImplementedError() raise NotImplementedError()
return gof.Apply(self, return gof.Apply(self,
[x, y], [x, y],
[SparseType(dtype = x.type.dtype, [SparseType(dtype=x.type.dtype,
format = x.type.format).make_variable()]) format=x.type.format).make_variable()])
def perform(self, node, (x, y), (out, )): def perform(self, node, (x, y), (out, )):
assert _is_sparse(x) and not _is_sparse(y) assert _is_sparse(x) and not _is_sparse(y)
assert x.shape[1] == y.shape[0] assert x.shape[1] == y.shape[0]
out[0] = x.__class__(x + (x.toarray() != 0) * y) out[0] = x.__class__(x + (x.toarray() != 0) * y)
def grad(self, (x, y), (gz,)): def grad(self, (x, y), (gz,)):
assert _is_sparse_variable(x) and _is_sparse_variable(y) assert _is_sparse_variable(x) and _is_sparse_variable(y)
assert _is_sparse_variable(gz) assert _is_sparse_variable(gz)
...@@ -207,14 +241,18 @@ structured_add_s_v = StructuredAddSV() ...@@ -207,14 +241,18 @@ structured_add_s_v = StructuredAddSV()
class StrucutedAddSVCSR(gof.Op): class StrucutedAddSVCSR(gof.Op):
def __eq__(self, other): def __eq__(self, other):
return (type(self) == type(other)) return (type(self) == type(other))
def __hash__(self): def __hash__(self):
return hash(type(self)) return hash(type(self))
def make_node(self, a_data, a_indices, a_indptr, b): def make_node(self, a_data, a_indices, a_indptr, b):
assert b.type.ndim == 1 assert b.type.ndim == 1
return gof.Apply(self, [a_data, a_indices, a_indptr, b], return gof.Apply(self, [a_data, a_indices, a_indptr, b],
[tensor.tensor(b.dtype, (False,))]) [tensor.tensor(b.dtype, (False,))])
def c_code(self, node, name, (_data, _indices, _indptr, _b,), (_zout, ), sub):
def c_code(self, node, name, inputs, outputs, sub):
_data, _indices, _indptr, _b, = inputs
_zout, = outputs
if node.inputs[0].type.dtype in ('complex64', 'complex128'): if node.inputs[0].type.dtype in ('complex64', 'complex128'):
raise NotImplementedError('Complex types are not supported for a') raise NotImplementedError('Complex types are not supported for a')
if node.inputs[3].type.dtype in ('complex64', 'complex128'): if node.inputs[3].type.dtype in ('complex64', 'complex128'):
...@@ -272,16 +310,17 @@ class StrucutedAddSVCSR(gof.Op): ...@@ -272,16 +310,17 @@ class StrucutedAddSVCSR(gof.Op):
} }
} }
"""% dict(locals(), **sub) """ % dict(locals(), **sub)
structured_add_s_v_csr = StrucutedAddSVCSR() structured_add_s_v_csr = StrucutedAddSVCSR()
@gof.local_optimizer([structured_add_s_v]) @gof.local_optimizer([structured_add_s_v])
def local_structured_add_s_v(node): def local_structured_add_s_v(node):
if node.op == structured_add_s_v: if node.op == structured_add_s_v:
x, y = node.inputs x, y = node.inputs
x_is_sparse_variable = _is_sparse_variable(x) x_is_sparse_variable = _is_sparse_variable(x)
y_is_sparse_variable = _is_sparse_variable(y) #y_is_sparse_variable = _is_sparse_variable(y)
if x_is_sparse_variable: if x_is_sparse_variable:
svar = x svar = x
...@@ -310,14 +349,19 @@ register_specialize(local_structured_add_s_v) ...@@ -310,14 +349,19 @@ register_specialize(local_structured_add_s_v)
class SamplingDot(gof.op.Op): class SamplingDot(gof.op.Op):
""" """
Operand for calculating the dot product DOT(X, Y) = Z when you only want to calculate Operand for calculating the dot product DOT(X, Y) = Z when you
a subset of Z. It is equivalent to P o (X . Y) where o is the element-wise product, X and Y operands of only want to calculate a subset of Z. It is equivalent to P o (X
the dot product and P is a matrix that contains 1 when the corresponding element of Z should be calculated . Y) where o is the element-wise product, X and Y operands of the
and 0 when it shouldn't. Note that SamplingDot has a different interface than DOT because SamplingDot dot product and P is a matrix that contains 1 when the
requires X to be a MxK matrix while Y is a NxK matrix instead of the usual KxN matrix. corresponding element of Z should be calculated and 0 when it
shouldn't. Note that SamplingDot has a different interface than
It will work if the pattern is not binary value, but if the pattern doesn't have a high sparsity proportion DOT because SamplingDot requires X to be a MxK matrix while Y is a
it will be slower then a more optimized dot followed by a normal elemwise multiplication. NxK matrix instead of the usual KxN matrix.
It will work if the pattern is not binary value, but if the
pattern doesn't have a high sparsity proportion it will be slower
then a more optimized dot followed by a normal elemwise
multiplication.
""" """
def __eq__(self, other): def __eq__(self, other):
...@@ -364,6 +408,7 @@ class SamplingDot(gof.op.Op): ...@@ -364,6 +408,7 @@ class SamplingDot(gof.op.Op):
return rval return rval
sampling_dot = SamplingDot() sampling_dot = SamplingDot()
class SamplingDotCsr(gof.Op): class SamplingDotCsr(gof.Op):
""" """
Optimized SamplingDot when the pattern P is a CSR matrix. Optimized SamplingDot when the pattern P is a CSR matrix.
...@@ -391,7 +436,8 @@ class SamplingDotCsr(gof.Op): ...@@ -391,7 +436,8 @@ class SamplingDotCsr(gof.Op):
assert p_ncols.dtype == 'int32' assert p_ncols.dtype == 'int32'
dtype_out = scalar.upcast(x.type.dtype, y.type.dtype, p_data.type.dtype) dtype_out = scalar.upcast(x.type.dtype, y.type.dtype,
p_data.type.dtype)
dot_out = scalar.upcast(x.type.dtype, y.type.dtype) dot_out = scalar.upcast(x.type.dtype, y.type.dtype)
# We call blas ?dot function that take only param of the same type # We call blas ?dot function that take only param of the same type
...@@ -420,15 +466,20 @@ class SamplingDotCsr(gof.Op): ...@@ -420,15 +466,20 @@ class SamplingDotCsr(gof.Op):
def c_header_dirs(self): def c_header_dirs(self):
return blas.ldflags(libs=False, include_dir=True) return blas.ldflags(libs=False, include_dir=True)
def c_code(self, node, name, (x, y, p_data, p_ind, p_ptr, p_ncols), (z_data, z_ind, z_ptr), sub): def c_code(self, node, name, inputs, outputs, sub):
x, y, p_data, p_ind, p_ptr, p_ncols = inputs
z_data, z_ind, z_ptr = outputs
if node.inputs[0].type.dtype in ('complex64', 'complex128'): if node.inputs[0].type.dtype in ('complex64', 'complex128'):
raise NotImplementedError('Complex types are not supported for x') raise NotImplementedError('Complex types are not supported for x')
if node.inputs[1].type.dtype in ('complex64', 'complex128'): if node.inputs[1].type.dtype in ('complex64', 'complex128'):
raise NotImplementedError('Complex types are not supported for y') raise NotImplementedError('Complex types are not supported for y')
if node.inputs[2].type.dtype in ('complex64', 'complex128'): if node.inputs[2].type.dtype in ('complex64', 'complex128'):
raise NotImplementedError('Complex types are not supported for pattern') raise NotImplementedError(
'Complex types are not supported for pattern')
dot_out = scalar.upcast(node.inputs[0].type.dtype, node.inputs[0].type.dtype) # TODO: why 2 times the same inputs?
dot_out = scalar.upcast(node.inputs[0].type.dtype,
node.inputs[0].type.dtype)
if dot_out == "float32": if dot_out == "float32":
conv_type = "float" conv_type = "float"
...@@ -437,12 +488,16 @@ class SamplingDotCsr(gof.Op): ...@@ -437,12 +488,16 @@ class SamplingDotCsr(gof.Op):
conv_type = "double" conv_type = "double"
cdot = "ddot_sub_" cdot = "ddot_sub_"
typenum_x = node.inputs[0].type.dtype_specs()[-1] # retrieve dtype number # retrieve dtype number
typenum_y = node.inputs[1].type.dtype_specs()[-1] # retrieve dtype number typenum_x = node.inputs[0].type.dtype_specs()[-1]
typenum_p = node.inputs[2].type.dtype_specs()[-1] # retrieve dtype number typenum_y = node.inputs[1].type.dtype_specs()[-1]
typenum_zd = tensor.TensorType(node.outputs[0].dtype, []).dtype_specs()[-1] # retrieve dtype number typenum_p = node.inputs[2].type.dtype_specs()[-1]
typenum_zi = tensor.TensorType(node.outputs[1].dtype, []).dtype_specs()[-1] # retrieve dtype number typenum_zd = tensor.TensorType(node.outputs[0].dtype,
typenum_zp = tensor.TensorType(node.outputs[2].dtype, []).dtype_specs()[-1] # retrieve dtype number []).dtype_specs()[-1]
typenum_zi = tensor.TensorType(node.outputs[1].dtype,
[]).dtype_specs()[-1]
typenum_zp = tensor.TensorType(node.outputs[2].dtype,
[]).dtype_specs()[-1]
rval = """ rval = """
if (%(x)s->nd != 2) {PyErr_SetString(PyExc_NotImplementedError, "rank(x) != 2"); %(fail)s;} if (%(x)s->nd != 2) {PyErr_SetString(PyExc_NotImplementedError, "rank(x) != 2"); %(fail)s;}
...@@ -531,11 +586,12 @@ class SamplingDotCsr(gof.Op): ...@@ -531,11 +586,12 @@ class SamplingDotCsr(gof.Op):
} }
} }
} }
"""% dict(locals(), **sub) """ % dict(locals(), **sub)
return rval return rval
sampling_dot_csr = SamplingDotCsr() sampling_dot_csr = SamplingDotCsr()
# register a specialization to replace SamplingDot -> SamplingDotCsr # register a specialization to replace SamplingDot -> SamplingDotCsr
@gof.local_optimizer([sampling_dot]) @gof.local_optimizer([sampling_dot])
def local_sampling_dot_csr(node): def local_sampling_dot_csr(node):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论