提交 941ea07c authored 作者: lamblin's avatar lamblin

Merge pull request #511 from nouiz/pep8

Pep8
......@@ -28,6 +28,14 @@ New Features
anything below an intermediate variable that has a name. Defaults to False.
* debugprint does not print anymore the "|" symbol in a column after the last input.
Sparse Sandbox Addition (Not reviewed/documented/tested, but used by some people)
* They are all in the theano.sparse.sandbox.sp2 module
* Op class: Cast, Poisson, Multinomial, EliminateZeros, Sum, Binomial
* Op class: SamplingDot, SamplingDotCsr(inserted automatically)
* Op function: structured_sigmoid, structured_exp, structured_pow, structured_minimum,
* Op class: StructuredAddSV, StrucutedAddSVCSR(inserted automatically)
* opt: local_sampling_dot_csr, local_structured_add_s_v
Internal changes
* Define new exceptions MissingInputError and UnusedInputError, and use them
in theano.function, instead of TypeError and ValueError. (Pascal L.)
......
......@@ -317,7 +317,48 @@ bindings to work only on Python files.
Emacs
~~~~~
WRITEME
There is an **execellent** system to configure emacs for python:
`emacs-for-python
<https://github.com/gabrielelanaro/emacs-for-python>`_. It gatter many
emacs config into one and modify them to behave together nicely. You
can use it to check for pep8 compliance and for python syntax errors.
To install it on linux, you can do like this:
.. code-block:: bash
cd
git clone https://github.com/gabrielelanaro/emacs-for-python.git .emacs.d/emacs-for-python
Then in your ``~/.emacs`` file, add this:
.. code-block:: bash
;; Mandatory
(load-file "~/.emacs.d/emacs-for-python/epy-init.el")
(add-to-list 'load-path "~/.emacs.d/emacs-for-python/") ;; tell where to load the various files
;; Each of them enable different part of the system
;; only the 2 first are needed for pep8, syntax check.
(require 'epy-setup) ;; It will setup other loads, it is required!
(require 'epy-python) ;; If you want the python facilities [optional]
(require 'epy-completion) ;; If you want the autocompletion settings [optional]
(require 'epy-editing) ;; For configurations related to editing [optional]
;; define f10 to previous error
;; define f11 to next error
(require 'epy-bindings) ;; For my suggested keybindings [optional]
;; some shortcut that don't collide with gnome-terminal
;; otherwise, "epy-bindings" define f10 and f11 for them.
(global-set-key [f2] 'flymake-goto-prev-error)
(global-set-key [f3] 'flymake-goto-next-error)
;; next two lines are the checks to do. You can add more if you wish
(epy-setup-checker "pyflakes %f") ;; for python syntax check
(epy-setup-checker "pep8 %f") ;; for pep8 check
Unit tests
......
......@@ -7,11 +7,11 @@
# TODO: ensure field width for string fields makes columns line up
# TODO: what to do about 'diff summary'? (ask Fred?)
#
__authors__ = "James Bergstra"
__reviewer__ = "Razvan Pascanu"
__authors__ = "James Bergstra"
__reviewer__ = "Razvan Pascanu"
__copyright__ = "(c) 2011, Universite de Montreal"
__license__ = "3-clause BSD License"
__contact__ = "theano-dev <theano-dev@googlegroups.com>"
__license__ = "3-clause BSD License"
__contact__ = "theano-dev <theano-dev@googlegroups.com>"
__docformat__ = "restructuredtext en"
import atexit
......@@ -22,7 +22,7 @@ import time
import numpy
import theano
from theano.configparser import AddConfigVar, StrParam, BoolParam
from theano.configparser import AddConfigVar, BoolParam
import_time = time.time()
config = theano.config
......@@ -46,8 +46,10 @@ def _atexit_print_fn():
if len(_atexit_print_list) > 1:
# Make a global profile
cum = copy.copy(_atexit_print_list[0])
cum.message = "Sum of all Theano functions"
cum.message = "Sum of all printed profile at exit"
for ps in _atexit_print_list[1:]:
# for ps in [ps for ps in _atexit_print_list[1:]
# if not isinstance(ps, ScanProfileStats)]:
for attr in ["compile_time", "fct_call_time", "fct_callcount",
"vm_call_time", "optimizer_time", "linker_time"]:
setattr(cum, attr, getattr(cum, attr) + getattr(ps, attr))
......@@ -62,6 +64,7 @@ def _atexit_print_fn():
atexit.register(_atexit_print_fn)
class ProfileStats(object):
"""
Object to store runtime and memory profiling information for all of
......@@ -119,6 +122,7 @@ class ProfileStats(object):
# time spent linking graph (FunctionMaker.create)
line_width = 140
# param is called flag_time_thunks because most other attributes with time
# in the name are times *of* something, rather than configuration flags.
def __init__(self, atexit_print=True, flag_time_thunks=None, **kwargs):
......@@ -185,20 +189,27 @@ class ProfileStats(object):
"""dict op -> total number of flops"""
# timing is stored by node, we compute timing by Op on demand
rval = {}
return rval #TODO: continue here
return rval # TODO: continue here
for node, count in self.apply_callcount.items():
rval.setdefault(node.op, 0)
rval[node.op] += 1
return rval
for a,t in op_time.items():
if hasattr(a,'flops'):
op_flops[a]=a.flops*op_call[a]/t/1e6
for a, t in self.op_time.items():
if hasattr(a, 'flops'):
op_flops[a] = a.flops * op_call[a] / t / 1e6
flops_msg=''
flops_msg = ''
if op_flops:
flops_msg=' <MFlops/s>'
print '\nHACK WARNING: we print the flops for some OP, but the logic don\' always work. You need to know the internal of Theano to make it work correctly. Otherwise don\'t use!'
print '\nOp-wise summary: <%% of local_time spent on this kind of Op> <cumulative %%> <self seconds> <cumulative seconds> <time per call> %s <nb_call> <nb apply> <Op name>'%(flops_msg)
flops_msg = ' <MFlops/s>'
print ('\nHACK WARNING: we print the flops for some OP, but the'
' logic don\' always work. You need to know the internal'
' of Theano to make it work correctly.'
' Otherwise don\'t use!')
print ('\nOp-wise summary:'
' <%% of local_time spent on this kind of Op>'
' <cumulative %%> <self seconds> <cumulative seconds>'
' <time per call> %s <nb_call> <nb apply> <Op name>' % (
flops_msg))
def summary_ops(self, file=sys.stderr, N=None):
if self.apply_time:
......@@ -216,19 +227,21 @@ class ProfileStats(object):
op_impl = self.op_impl()
if N is None:
N = len(self.op_flops)
otimes = [(t*100/local_time,
otimes = [(t * 100 / local_time,
t,
op,
op_impl.get(op, ' '),
op_call.get(op, 0),
op_apply.get(op,0))
op_apply.get(op, 0))
for op, t in op_time.items()]
otimes.sort()
otimes.reverse()
tot=0
tot = 0
print >> file, 'Ops'
print >> file, '---'
#print >> file, '<% time> <cumulative %%> <apply time> <cumulative seconds> <time per call> <nb_call> <Op name>'
#print >> file, '<% time> <cumulative %%> <apply time>,'
#print >>file, '<cumulative seconds> <time per call> <nb_call>'
#print >>file, '<Op name>'
hs = []
# formatting string
es = []
......@@ -263,14 +276,15 @@ class ProfileStats(object):
print >> file, header_str
for f,t,a,impl,nb_call,nb_apply in otimes[:N]:
for f, t, a, impl, nb_call, nb_apply in otimes[:N]:
if nb_call == 0:
assert t == 0
continue
tot+=t
ftot=tot*100/local_time
print >> file, format_str%(f,ftot,t,t/nb_call, impl, nb_call,
nb_apply, str(a)[:maxlen])
tot += t
ftot = tot * 100 / local_time
print >> file, format_str % (f, ftot, t, t / nb_call,
impl, nb_call,
nb_apply, str(a)[:maxlen])
# While this carries over less information, it is arranged such
# that it way more readeable that the previous output of the
# profiler
......@@ -281,7 +295,7 @@ class ProfileStats(object):
# print >>file, ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %s %5d %2d %s' % (
# f, ftot, t, tot, t/nb_call, impl, nb_call, nb_apply, a)
print >>file, ' ... (remaining %i Ops account for %6.2f%%(%.2fs) of the runtime)'\
%(max(0, len(otimes)-N),
% (max(0, len(otimes) - N),
sum(f for f, t, a, ci, nb_call, nb_op in otimes[N:]),
sum(t for f, t, a, ci, nb_call, nb_op in otimes[N:]))
print >> file, ''
......@@ -333,7 +347,7 @@ class ProfileStats(object):
print >> file, header_str
atimes = [(
t*100/local_time,
t * 100 / local_time,
t,
a,
a.env.toposort().index(a),
......@@ -341,13 +355,13 @@ class ProfileStats(object):
for a, t in self.apply_time.items()]
atimes.sort()
atimes.reverse()
tot=0
tot = 0
for (f, t, a, nd_id, nb_call) in atimes[:N]:
tot+=t
ftot=tot*100/local_time
if nb_call==0:
tot += t
ftot = tot * 100 / local_time
if nb_call == 0:
continue
print >> file, format_str %(f,ftot, t, t/nb_call, nb_call,
print >> file, format_str %(f, ftot, t, t / nb_call, nb_call,
nd_id,
str(a)[:maxlen])
# Same as before, this I've sacrificied some information making
......@@ -355,7 +369,7 @@ class ProfileStats(object):
#print >> file, ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %i %s'%(
# f, ftot, t, tot, t/nb_call,nb_call, str(a))
print >> file, ' ... (remaining %i Apply instances account for %.2f%%(%.2fs) of the runtime)'\
%(max(0, len(atimes)-N),
% (max(0, len(atimes) - N),
sum(f for f, t, a, nd_id, nb_call in atimes[N:]),
sum(t for f, t, a, nd_id, nb_call in atimes[N:]))
print >> file, ''
......@@ -363,31 +377,34 @@ class ProfileStats(object):
def summary_function(self, file):
print >> file, 'Function profiling'
print >> file, '=================='
print >> file, ' Message: %s'%self.message
print >> file, ' Message: %s' % self.message
print >> file, ' Time in %i calls to Function.__call__: %es' % (
self.fct_callcount, self.fct_call_time)
if self.fct_call_time>0:
print >> file, ' Time in Function.fn.__call__: %es (%.3f%%)' %(
self.vm_call_time, 100*self.vm_call_time / self.fct_call_time)
if self.fct_call_time > 0:
print >> file, ' Time in Function.fn.__call__: %es (%.3f%%)' % (
self.vm_call_time,
100 * self.vm_call_time / self.fct_call_time)
local_time = sum(self.apply_time.values())
if local_time > 0:
print >> file, ' Time in thunks: %es (%.3f%%)' %(
print >> file, ' Time in thunks: %es (%.3f%%)' % (
local_time, 100*local_time / self.fct_call_time)
print >> file, ' Total compile time: %es' % self.compile_time
print >> file, ' Theano Optimizer time: %es' % self.optimizer_time
print >> file, ' Theano Linker time (includes C, CUDA code generation/compiling): %es' % self.linker_time
print >> file, (' Theano Linker time (includes C,'
' CUDA code generation/compiling): %es' %
self.linker_time)
print >> file, ''
def summary(self, file=sys.stderr, n_ops_to_print=20, n_applies_to_print=20):
def summary(self, file=sys.stderr, n_ops_to_print=20,
n_applies_to_print=20):
self.summary_function(file)
local_time = sum(self.apply_time.values())
if local_time > 0:
self.summary_ops(file, n_ops_to_print)
self.summary_nodes(file, n_applies_to_print)
else:
print >> file, " No node time accumulated (hint: try config profiling.time_thunks=1)"
print >> file, (" No node time accumulated "
"(hint: try config profiling.time_thunks=1)")
if 0: # old code still to be ported from ProfileMode
......@@ -404,47 +421,50 @@ if 0: # old code still to be ported from ProfileMode
print ''
print 'ProfileMode.long_print()'
print 'name = %s'%fct_name
print 'msg = %s'%message
print 'name = %s' % fct_name
print 'msg = %s' % message
print '---------------------------'
print ''
print 'Total time spent running thunks: %.3fs'% local_time
print 'Total time spent running thunks: %.3fs' % local_time
sop_time={}
sop_call={}
sop_time = {}
sop_call = {}
sop_op = {}
sop_c={} #map each op class to Bool. True iff all applies were done in c.
for a,t in op_time.items():
#map each op class to Bool. True iff all applies were done in c.
sop_c = {}
for a, t in op_time.items():
typ = type(a)
sop_time.setdefault(typ,0)
sop_time[typ]+=t
sop_op.setdefault(typ,0)
sop_op[typ]+=1
sop_c.setdefault(typ,True)
sop_c[typ]=sop_c[typ] and op_cimpl.get(a, False)
sop_call[typ]=sop_call.get(typ,0)+op_call[a]
sop_time.setdefault(typ, 0)
sop_time[typ] += t
sop_op.setdefault(typ, 0)
sop_op[typ] += 1
sop_c.setdefault(typ, True)
sop_c[typ] = sop_c[typ] and op_cimpl.get(a, False)
sop_call[typ] = sop_call.get(typ, 0) + op_call[a]
print '\nSingle Op-wise summary: <% of local_time spent on this kind of Op> <cumulative %%> <self seconds> <cumulative seconds> <time per call> <nb_call> <nb_op> <nb_op> <Op name>'
sotimes = [(t*100/local_time, t, a, sop_c[a], sop_call[a], sop_op[a]) for a, t in sop_time.items()]
sotimes = [(t * 100 / local_time, t, a, sop_c[a],
sop_call[a], sop_op[a]) for a, t in sop_time.items()]
sotimes.sort()
sotimes.reverse()
tot=0
for f,t,a,ci, nb_call, nb_op in sotimes[:n_ops_to_print]:
tot = 0
for f, t, a, ci, nb_call, nb_op in sotimes[:n_ops_to_print]:
if nb_call == 0:
assert t == 0
continue
tot+=t
ftot=tot*100/local_time
tot += t
ftot = tot * 100 / local_time
if ci:
msg = '*'
else:
msg = ' '
print ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %s %5d %2d %s' % (f, ftot, t, tot, t/nb_call, msg, nb_call, nb_op, a)
print ' ... (remaining %i Ops account for %.2f%%(%.2fs) of the runtime)'\
%(max(0, len(sotimes)-n_ops_to_print),
sum(f for f, t, a, ci, nb_call, nb_op in sotimes[n_ops_to_print:]),
sum(t for f, t, a, ci, nb_call, nb_op in sotimes[n_ops_to_print:]))
% (max(0, len(sotimes) - n_ops_to_print),
sum(f for f, t, a, ci, nb_call, nb_op in
sotimes[n_ops_to_print:]),
sum(t for f, t, a, ci, nb_call, nb_op in
sotimes[n_ops_to_print:]))
total_time = time.time() - import_time
total_fct_time = sum(fct_call_time.values())
......@@ -453,24 +473,32 @@ if 0: # old code still to be ported from ProfileMode
print
print 'Theano fct summary: <% total fct time> <total time> <time per call> <nb call> <fct name>'
for key in fct_call.keys():
if fct_call[key]>0:
print ' %4.1f%% %.3fs %.2es %d %s'%(fct_call_time[key]/total_fct_time*100 ,fct_call_time[key],
fct_call_time[key]/fct_call[key], fct_call[key], key.name)
if fct_call[key] > 0:
print ' %4.1f%% %.3fs %.2es %d %s'%(
fct_call_time[key] / total_fct_time * 100,
fct_call_time[key],
fct_call_time[key] / fct_call[key],
fct_call[key], key.name)
else:
print ' NOT CALLED',key.name
if total_fct_time>0:
time_pr_in_fct=local_time/total_fct_time*100
time_per_call=total_fct_time/total_fct_call
if total_fct_time > 0:
time_pr_in_fct = local_time / total_fct_time * 100
time_per_call = total_fct_time / total_fct_call
else:
time_pr_in_fct=0
time_per_call=0
time_pr_in_fct = 0
time_per_call = 0
print
print 'Time since import %.3fs'%(total_time)
print 'Compile time: %.3fs %.1f%%'%(compile_time, compile_time/total_time*100)
print 'Theano fct call %.3fs %.1f%%'%(total_fct_time,total_fct_time/total_time*100)
print ' Theano Op time (included in fct call, Time spent running thunks) %.3fs %.1f%%(of total) %.1f%%(of fct call)'% (local_time,local_time/total_time*100, time_pr_in_fct)
print 'Time since import %.3fs' % (total_time)
print 'Compile time: %.3fs %.1f%%' % (compile_time,
compile_time / total_time * 100)
print 'Theano fct call %.3fs %.1f%%' % (total_fct_time,
total_fct_time / total_time *
100)
print ' Theano Op time (included in fct call, Time spent running thunks) %.3fs %.1f%%(of total) %.1f%%(of fct call)' % (local_time,
local_time / total_time * 100,
time_pr_in_fct)
print 'Other time since import %.3fs %.1f%%'%(other_time,other_time/total_time*100)
print '%i Theano fct call, %.3fs per call'%(total_fct_call, time_per_call)
......@@ -479,13 +507,13 @@ if 0: # old code still to be ported from ProfileMode
print '<Apply> <Apply position> <fct name> <inputs type> <outputs type>'
for fct in fct_call.keys():
for idx, node in enumerate(fct.maker.env.toposort()):
if any(hasattr(i,'dtype') and i.dtype=='float64' for i in node.outputs) and not any(hasattr(i,'dtype') and i.dtype=='float64' for i in node.inputs):
if any(hasattr(i, 'dtype') and i.dtype == 'float64' for i in node.outputs) and not any(hasattr(i, 'dtype') and i.dtype == 'float64' for i in node.inputs):
print str(node), idx, fct.name, str([getattr(i,'dtype',None) for i in node.inputs]),str([getattr(i,'dtype',None) for i in node.outputs])
if any([x[2].__name__.startswith("Gpu") for x in sotimes]):
cpu=[]
gpu=[]
trans=[]
cpu = []
gpu = []
trans = []
for so in sotimes:
if so[2].__name__ in ["HostFromGpu", "GpuFromHost"]:
trans.append(so)
......@@ -493,9 +521,9 @@ if 0: # old code still to be ported from ProfileMode
gpu.append(so)
else:
cpu.append(so)
sum_cpu=sum(so[1] for so in cpu)
sum_gpu=sum(so[1] for so in gpu)
sum_trans=sum(so[1] for so in trans)
sum_cpu = sum(so[1] for so in cpu)
sum_gpu = sum(so[1] for so in gpu)
sum_trans = sum(so[1] for so in trans)
print
print "Spent %.3fs(%.3f%%) in cpu Op, %.3fs(%.3f%%) in gpu Op and %.3fs(%.3f%%) transfert Op"%(
......@@ -505,7 +533,7 @@ if 0: # old code still to be ported from ProfileMode
print "<fct name> <input name> <input type> <str input>"
for fct in fct_call.keys():
for i in fct.input_storage:
if hasattr(i.type, 'dtype') and i.type.dtype=='float64':
if hasattr(i.type, 'dtype') and i.type.dtype == 'float64':
print fct.name, i.name, i.type, i
if outputs_size:
......
......@@ -4,17 +4,21 @@ Defines Linkers that deal with C implementations.
# Python imports
from copy import copy
import re #for set_compiledir
import os, sys, StringIO
import re # for set_compiledir
import os
import StringIO
import sys
from itertools import izip
if sys.version_info[:2] >= (2,5):
if sys.version_info[:2] >= (2, 5):
import hashlib
def hash_from_code(msg):
return hashlib.md5(msg).hexdigest()
else:
import md5
def hash_from_code(msg):
return md5.new(msg).hexdigest()
......@@ -46,14 +50,13 @@ from compilelock import get_lock, release_lock
import cmodule
import logging
_logger=logging.getLogger("theano.gof.cc")
_logger = logging.getLogger("theano.gof.cc")
_logger.setLevel(logging.WARN)
from theano.gof.callcache import CallCache
run_cthunk = None # Will be imported only when needed.
run_cthunk = None # Will be imported only when needed.
def get_module_cache(init_args=None):
......@@ -63,37 +66,47 @@ def get_module_cache(init_args=None):
"""
return cmodule.get_module_cache(config.compiledir, init_args=init_args)
_persistent_module_cache = None
def get_persistent_module_cache():
global _persistent_module_cache
if _persistent_module_cache is None:
_persistent_module_cache = CallCache(os.path.join(config.compiledir, 'persistent_cache'))
_persistent_module_cache = CallCache(os.path.join(config.compiledir,
'persistent_cache'))
return _persistent_module_cache
class CodeBlock:
"""WRITEME
Represents a computation unit composed of declare, behavior, and cleanup.
@ivar declare: C code that declares variables for use by the computation
@ivar behavior: C code that performs the computation
@ivar cleanup: C code that cleans up things allocated or incref-ed in behavior
@ivar cleanup: C code that cleans up things allocated or incref-ed
in behavior
"""
def __init__(self, declare, behavior, cleanup, sub):
"""
Initialize a L{CodeBlock} with templatized declare, behavior and cleanup.
The sub parameter will be used in the other arguments' templates. sub
should contain a key called 'id' that maps to an identifier for this block.
The identifier will be used to determine the failure code and a label
to jump to. It should also contain a key called 'failure_var' that contains
the name of the variable that contains the error code.
Initialize a L{CodeBlock} with templatized declare, behavior
and cleanup. The sub parameter will be used in the other
arguments' templates. sub should contain a key called 'id'
that maps to an identifier for this block.
The identifier will be used to determine the failure code and
a label to jump to. It should also contain a key called
'failure_var' that contains the name of the variable that
contains the error code.
"""
self.declare = declare
self.behavior = behavior
# the dummy is because gcc throws an error when a label's right next to a closing
# brace (maybe there's an ignore flag for that...)
# we need the label even if cleanup is empty because the behavior block jumps there
# on failure
self.cleanup = ("__label_%(id)i:\n"%sub + cleanup + "\ndouble __DUMMY_%(id)i;\n"%sub) #% sub
# the dummy is because gcc throws an error when a label's
# right next to a closing brace (maybe there's an ignore flag
# for that...)
# we need the label even if cleanup is empty because the
# behavior block jumps there on failure
self.cleanup = ("__label_%(id)i:\n" % sub + cleanup +
"\ndouble __DUMMY_%(id)i;\n" % sub) # % sub
def failure_code(sub):
......@@ -102,10 +115,10 @@ def failure_code(sub):
def code_gen(blocks):
"""WRITEME
From a list of L{CodeBlock} instances, returns a string that executes them
all in sequence. eg for C{(decl1, task1, cleanup1)} and C{(decl2, task2, cleanup2)}
the returned string will be of the form::
"""WRITEME From a list of L{CodeBlock} instances, returns a string
that executes them all in sequence. eg for C{(decl1, task1,
cleanup1)} and C{(decl2, task2, cleanup2)} the returned string
will be of the form::
decl1
decl2
......@@ -181,10 +194,11 @@ def struct_gen(args, struct_builders, blocks, sub):
args_names = ", ".join(args)
args_decl = ", ".join(["PyObject* %s" % arg for arg in args])
# The following code stores the exception data in __ERROR, which is a special
# field of the struct. __ERROR is a list of length 3 that holds the type, the
# value and the traceback. After storing the error, we return the failure code
# so we know which code block failed.
# The following code stores the exception data in __ERROR, which
# is a special field of the struct. __ERROR is a list of length 3
# that holds the type, the value and the traceback. After storing
# the error, we return the failure code so we know which code
# block failed.
do_return = """
if (%(failure_var)s) {
// When there is a failure, this code puts the exception
......@@ -213,8 +227,8 @@ def struct_gen(args, struct_builders, blocks, sub):
sub = dict(sub)
sub.update(locals())
# TODO: add some error checking to make sure storage_<x> are 1-element lists
# and __ERROR is a 3-elements list.
# TODO: add some error checking to make sure storage_<x> are
# 1-element lists and __ERROR is a 3-elements list.
struct_code = """
struct %(name)s {
PyObject* __ERROR;
......@@ -260,6 +274,7 @@ def get_nothing(r, name, sub):
"""WRITEME"""
return ""
def get_c_declare(r, name, sub):
"""WRITEME"""
pre = """
......@@ -267,6 +282,7 @@ def get_c_declare(r, name, sub):
""" % locals()
return pre + r.type.c_declare(name, sub)
def get_c_init(r, name, sub):
"""WRITEME"""
pre = "" """
......@@ -275,6 +291,7 @@ def get_c_init(r, name, sub):
""" % locals()
return pre + r.type.c_init(name, sub)
def get_c_extract(r, name, sub):
"""WRITEME"""
pre = """
......@@ -283,6 +300,7 @@ def get_c_extract(r, name, sub):
""" % locals()
return pre + r.type.c_extract(name, sub)
def get_c_cleanup(r, name, sub):
"""WRITEME"""
post = """
......@@ -290,6 +308,7 @@ def get_c_cleanup(r, name, sub):
""" % locals()
return r.type.c_cleanup(name, sub) + post
def get_c_sync(r, name, sub):
"""WRITEME"""
return """
......@@ -300,11 +319,13 @@ def get_c_sync(r, name, sub):
PyList_SET_ITEM(storage_%(name)s, 0, py_%(name)s);
{Py_XDECREF(old);}
}
""" % dict(sync = r.type.c_sync(name, sub), name = name, **sub)
""" % dict(sync=r.type.c_sync(name, sub), name=name, **sub)
def apply_policy(policy, r, name, sub):
"""WRITEME
@param policy: list of functions that map a L{Variable} to a string, or a single such function
@param policy: list of functions that map a L{Variable} to a string,
or a single such function
@type r: L{Variable}
@return: C{policy[0](r) + policy[1](r) + ...}
"""
......@@ -316,18 +337,22 @@ def apply_policy(policy, r, name, sub):
return policy(r, name, sub)
def struct_variable_codeblocks(variable, policies, id, symbol_table, sub):
"""WRITEME
variable -> a Variable
policies -> a pair of tuples ((declare_policy, behavior_policy, cleanup_policy), -- at construction
(declare_policy, behavior_policy, cleanup_policy)) -- at execution
the first list will produce an element of the 'struct_builders' argument in struct_gen
the second list will produce an element of the 'blocks' argument in struct_gen
policies -> a pair of tuples ((declare_policy, behavior_policy,
cleanup_policy), -- at construction
(declare_policy, behavior_policy,
cleanup_policy)) -- at execution
the first list will produce an element of the
'struct_builders' argument in struct_gen the second
list will produce an element of the 'blocks' argument
in struct_gen
id -> the id assigned to this variable's task in the computation
symbol_table -> a dict that maps variables to variable names. It is not read
by this function but a variable name for the variable is computed and added
to the table.
symbol_table -> a dict that maps variables to variable names. It
is not read by this function but a variable name for the
variable is computed and added to the table.
sub -> dictionary for use by L{CodeBlock}.
"""
......@@ -339,17 +364,20 @@ def struct_variable_codeblocks(variable, policies, id, symbol_table, sub):
sub['fail'] = failure_code(sub)
sub['py_ptr'] = "py_%s" % name
sub['stor_ptr'] = "storage_%s" % name
# struct_declare, struct_behavior, struct_cleanup, sub)
struct_builder = CodeBlock(*[apply_policy(policy, variable, name, sub)
for policy in policies[0]]+[sub]) # struct_declare, struct_behavior, struct_cleanup, sub)
for policy in policies[0]] + [sub])
sub['id'] = id + 1
sub['fail'] = failure_code(sub)
sub['py_ptr'] = "py_%s" % name
sub['stor_ptr'] = "storage_%s" % name
# run_declare, run_behavior, run_cleanup, sub)
block = CodeBlock(*[apply_policy(policy, variable, name, sub)
for policy in policies[1]]+[sub]) # run_declare, run_behavior, run_cleanup, sub)
for policy in policies[1]] + [sub])
return struct_builder, block
class CLinker(link.Linker):
"""WRITEME
......@@ -365,11 +393,12 @@ class CLinker(link.Linker):
def __init__(self):
self.env = None
def accept(self, env, no_recycling = []):
def accept(self, env, no_recycling=[]):
"""WRITEME"""
if self.env is not None and self.env is not env:
return type(self)().accept(env, no_recycling)
#raise Exception("Cannot accept from a Linker that is already tied to another Env.")
#raise Exception("Cannot accept from a Linker that is already"
# " tied to another Env.")
self.env = env
self.fetch_variables()
self.no_recycling = no_recycling
......@@ -377,15 +406,21 @@ class CLinker(link.Linker):
def fetch_variables(self):
"""WRITEME
Fills the inputs, outputs, variables, orphans, temps and node_order fields.
Fills the inputs, outputs, variables, orphans,
temps and node_order fields.
"""
env = self.env
self.inputs = env.inputs
self.outputs = env.outputs
self.variables = graph.variables(self.inputs, self.outputs) # list(env.variables)
# list(env.variables)
self.variables = graph.variables(self.inputs, self.outputs)
# The orphans field is listified to ensure a consistent order.
self.orphans = list(r for r in self.variables if isinstance(r, graph.Value) and r not in self.inputs) #list(env.orphans.difference(self.outputs))
self.temps = list(set(self.variables).difference(self.inputs).difference(self.outputs).difference(self.orphans))
#list(env.orphans.difference(self.outputs))
self.orphans = list(r for r in self.variables
if isinstance(r, graph.Value) and
r not in self.inputs)
self.temps = list(set(self.variables).difference(
self.inputs).difference(self.outputs).difference(self.orphans))
self.consts = []
self.node_order = env.toposort()
......@@ -408,8 +443,6 @@ class CLinker(link.Linker):
no_recycling = self.no_recycling
env = self.env
self.consts = []
c_support_code_apply = []
......@@ -429,62 +462,82 @@ class CLinker(link.Linker):
failure_var = "__failure"
id = 1
sub = dict(failure_var = failure_var)
sub = dict(failure_var=failure_var)
for variable in self.variables:
# it might be possible to inline constant variables as C literals
## if getattr(variable, 'constant', False):
# policy = [[what to declare in the struct, what to do at construction, what to do at destruction],
# [what to declare in each run, what to do at the beginning of each run, what to do at the end of each run]]
# policy = [[what to declare in the struct,
# what to do at construction,
# what to do at destruction],
# [what to declare in each run,
# what to do at the beginning of each run,
# what to do at the end of each run]]
if variable in self.inputs:
# we need to extract the new inputs at each run
# they do not need to be relayed to Python, so we don't sync
# if isinstance(variable, Constant):
# raise TypeError("Inputs to CLinker cannot be Constant.", variable)
# raise TypeError("Inputs to CLinker cannot be Constant.",
# variable)
policy = [[get_nothing, get_nothing, get_nothing],
[get_c_declare, get_c_extract, get_c_cleanup]]
elif variable in self.orphans:
if not isinstance(variable, graph.Value):
raise TypeError("All orphans to CLinker must be Value instances.", variable)
raise TypeError("All orphans to CLinker must be Value"
" instances.", variable)
if isinstance(variable, graph.Constant):
try:
symbol[variable] = "(" + variable.type.c_literal(variable.data) + ")"
symbol[variable] = ("(" + variable.type.c_literal(
variable.data) + ")")
self.consts.append(variable)
self.orphans.remove(variable)
continue
except (utils.MethodNotDefined, NotImplementedError):
pass
# orphans are not inputs so we'll just get fetch them when we initialize the struct and assume they stay the same
# orphans are not inputs so we'll just get fetch them
# when we initialize the struct and assume they stay
# the same
policy = [[get_c_declare, get_c_extract, get_c_cleanup],
[get_nothing, get_nothing, get_nothing]]
elif variable in self.temps:
# temps don't need to be extracted from Python, so we call c_init rather than c_extract
# they do not need to be relayed to Python, so we don't sync
# temps don't need to be extracted from Python, so we
# call c_init rather than c_extract they do not need
# to be relayed to Python, so we don't sync
if variable.type.c_is_simple() or variable in no_recycling:
policy = [[get_nothing, get_nothing, get_nothing],
[get_c_declare, get_c_init, get_c_cleanup]]
else:
# it is useful for complex temps to reuse storage at each run, so we only clean up in the destructor
# it is useful for complex temps to reuse storage
# at each run, so we only clean up in the
# destructor
policy = [[get_c_declare, get_c_init, get_c_cleanup],
[get_nothing, get_nothing, get_nothing]]
elif variable in self.outputs:
# outputs don't need to be extracted from Python, so we call c_init rather than c_extract
# outputs don't need to be extracted from Python, so
# we call c_init rather than c_extract
if variable.type.c_is_simple() or variable in no_recycling:
policy = [[get_nothing, get_nothing, get_nothing],
[get_c_declare, get_c_init, (get_c_sync, get_c_cleanup)]]
[get_c_declare, get_c_init,
(get_c_sync, get_c_cleanup)]]
else:
# it is useful for complex outputs to reuse storage at each run, so we only clean up in the destructor
# it is useful for complex outputs to reuse
# storage at each run, so we only clean up in the
# destructor
policy = [[get_c_declare, get_c_init, get_c_cleanup],
[get_nothing, get_nothing, get_c_sync]]
else:
raise Exception("what the fuck")
builder, block = struct_variable_codeblocks(variable, policy, id, symbol, sub)
builder, block = struct_variable_codeblocks(variable, policy,
id, symbol, sub)
# each Variable generates two CodeBlocks, one to declare/initialize/destroy struct variables
# and the other to declare/extract/cleanup each time the function is run.
# Typically, only one of the two actually does anything (see all the possible combinations above)
# each Variable generates two CodeBlocks, one to
# declare/initialize/destroy struct variables and the
# other to declare/extract/cleanup each time the function
# is run.
# Typically, only one of the two actually does anything
# (see all the possible combinations above)
init_tasks.append((variable, 'init', id))
init_blocks.append(builder)
......@@ -496,19 +549,23 @@ class CLinker(link.Linker):
for node_num, node in enumerate(self.node_order):
# We populate sub with a mapping from the variable names specified by the op's c_var_names
# method to the actual variable names that we will use.
# We populate sub with a mapping from the variable names
# specified by the op's c_var_names method to the actual
# variable names that we will use.
## ivnames, ovnames = op.c_var_names()
sub = dict(failure_var = failure_var)
## for variable, vname in zip(op.inputs + op.outputs, ivnames + ovnames):
sub = dict(failure_var=failure_var)
## for variable, vname in zip(op.inputs + op.outputs,
## ivnames + ovnames):
## sub[vname] = symbol[variable]
name = "node_%i" % node_num
isyms, osyms = [symbol[r] for r in node.inputs], [symbol[r] for r in node.outputs]
isyms = [symbol[r] for r in node.inputs]
osyms = [symbol[r] for r in node.outputs]
# c_validate_update is deprecated
if hasattr(node.op, 'c_validate_update'):
raise Exception("c_validate_update is deprecated, move contents to c_code", node.op)
raise Exception("c_validate_update is deprecated,"
" move contents to c_code", node.op)
# Make the CodeBlock for c_code
sub['id'] = id
......@@ -517,20 +574,23 @@ class CLinker(link.Linker):
op = node.op
# type-specific support code
try:
c_support_code_apply.append(op.c_support_code_apply(node, name))
c_support_code_apply.append(op.c_support_code_apply(node,
name))
except utils.MethodNotDefined:
pass
else:
# The following will be executed if the "try" block succeeds
assert isinstance(c_support_code_apply[-1], basestring), (
str(node.op)+" didn't returned a string for c_support_code_apply")
str(node.op) +
" didn't returned a string for c_support_code_apply")
# emit c_code
try:
behavior = op.c_code(node, name, isyms, osyms, sub)
except utils.MethodNotDefined:
raise NotImplementedError("%s cannot produce C code" % op)
assert isinstance(behavior, basestring), str(node.op)+" didn't returned a string for c_code"
assert isinstance(behavior, basestring), (
str(node.op) + " didn't returned a string for c_code")
try:
cleanup = op.c_code_cleanup(node, name, isyms, osyms, sub)
......@@ -543,18 +603,24 @@ class CLinker(link.Linker):
tasks.append((node, 'code', id))
id += 1
# List of arg names for use in struct_gen. Note the call to uniq: duplicate inputs
# must only be passed once because they are mapped to the same name.
# Duplicates are defined by (a is b), rather than (a==b) since Constant instances can
# List of arg names for use in struct_gen. Note the call to
# uniq: duplicate inputs must only be passed once because they
# are mapped to the same name. Duplicates are defined by (a
# is b), rather than (a==b) since Constant instances can
# compare equal to equivalent Constant instances.
args = []
args += ["storage_%s" % symbol[variable] for variable in utils.uniq(self.inputs + self.outputs + self.orphans)]
args += ["storage_%s" % symbol[variable] for variable
in utils.uniq(self.inputs + self.outputs + self.orphans)]
struct_code = struct_gen(args, init_blocks, blocks, dict(failure_var = failure_var, name = "<<<<NAME>>>>"))
struct_code = struct_gen(args, init_blocks, blocks,
dict(failure_var=failure_var,
name="<<<<NAME>>>>"))
# TODO: still needed? We do not use weave anymore.
# The hash calculated on the code identifies it so weave can cache properly.
# (the hash has to be used outside of the support code because weave does not consider changes in the support code)
# The hash calculated on the code identifies it so weave can
# cache properly. (the hash has to be used outside of the
# support code because weave does not consider changes in the
# support code)
hash = hash_from_code(struct_code)
struct_name = '__struct_compiled_op_%s' % hash
......@@ -570,7 +636,7 @@ class CLinker(link.Linker):
self.init_tasks = init_tasks
self.blocks = blocks
self.tasks = tasks
all = self.inputs + self.outputs + self.orphans
all_info = self.inputs + self.outputs + self.orphans
self.c_support_code_apply = c_support_code_apply
if (self.init_tasks, self.tasks) != self.get_init_tasks():
......@@ -582,7 +648,8 @@ class CLinker(link.Linker):
# List of indices that should be ignored when passing the arguments
# (basically, everything that the previous call to uniq eliminated)
self.dupidx = [i for i, x in enumerate(all) if all.count(x) > 1 and all.index(x) != i]
self.dupidx = [i for i, x in enumerate(all_info)
if all_info.count(x) > 1 and all_info.index(x) != i]
return self.struct_code
def support_code(self):
......@@ -595,9 +662,12 @@ class CLinker(link.Linker):
"""
ret = []
# generic support code
for x in [y.type for y in self.variables] + [y.op for y in self.node_order]:
try: ret.append(x.c_support_code())
except utils.MethodNotDefined: pass
for x in [y.type for y in self.variables] + [
y.op for y in self.node_order]:
try:
ret.append(x.c_support_code())
except utils.MethodNotDefined:
pass
return ret
def compile_args(self):
......@@ -608,33 +678,43 @@ class CLinker(link.Linker):
This might contain duplicates.
"""
ret = ["-O3"]
# this is the param the -ffast-math activate. I put the explicitly as FillMissing must disable some of them. Putting -ffast-math would make it disable all other parameter at the same time.
# this is the param the -ffast-math activate. I put the explicitly as
# FillMissing must disable some of them. Putting -ffast-math would
# make it disable all other parameter at the same time.
ret += ["-fno-math-errno",
#"-funsafe-math-optimizations",
#"-fno-signaling-nans",
#"-fcx-limited-range",
#"-fno-rounding-math",
#"-ffinite-math-only",
"-Wno-unused-label",#the current code generate label event if they are not used. Could use gcc attribute for those label only
"-Wno-unused-variable",#idem as the precedent
"-Wno-write-strings",#generated by our code generator...
#the current code generate label event if they are not used.
#Could use gcc attribute for those label only
"-Wno-unused-label",
"-Wno-unused-variable", # idem as the precedent
"-Wno-write-strings", # generated by our code generator...
]
for x in [y.type for y in self.variables] + [y.op for y in self.node_order]:
try: ret += x.c_compile_args()
except utils.MethodNotDefined: pass
for x in [y.type for y in self.variables] + [
y.op for y in self.node_order]:
try:
ret += x.c_compile_args()
except utils.MethodNotDefined:
pass
c_compiler = self.c_compiler()
ret += c_compiler.compile_args()
ret=list(set(ret))#to remove duplicate
for x in [y.type for y in self.variables] + [y.op for y in self.node_order]:
ret = list(set(ret)) # to remove duplicate
for x in [y.type for y in self.variables] + [
y.op for y in self.node_order]:
try:
for i in x.c_no_compile_args():
try:
ret.remove(i)
except ValueError:
pass# in case the value is not there
except utils.MethodNotDefined: pass
pass # in case the value is not there
except utils.MethodNotDefined:
pass
return ret
def headers(self):
......@@ -645,14 +725,18 @@ class CLinker(link.Linker):
The return value will not contain duplicates.
"""
ret = []
for x in [y.type for y in self.variables] + [y.op for y in self.node_order]:
try: ret += x.c_headers()
except utils.MethodNotDefined: pass
for x in [y.type for y in self.variables] + [
y.op for y in self.node_order]:
try:
ret += x.c_headers()
except utils.MethodNotDefined:
pass
return list(set(ret))
def c_compiler(self):
c_compiler = None
for x in [y.type for y in self.variables] + [y.op for y in self.node_order]:
for x in [y.type for y in self.variables] + [
y.op for y in self.node_order]:
if hasattr(x, 'c_compiler'):
x_compiler = x.c_compiler()
else:
......@@ -662,11 +746,13 @@ class CLinker(link.Linker):
c_compiler = x_compiler
else:
if x_compiler and (x_compiler != c_compiler):
raise Exception('Nodes have requested specific different compilers',
(c_compiler, x_compiler))
raise Exception('Nodes have requested specific'
' different compilers',
(c_compiler, x_compiler))
if (c_compiler is None):
return cmodule.GCC_compiler
else: return c_compiler
else:
return c_compiler
def header_dirs(self):
"""WRITEME
......@@ -676,9 +762,12 @@ class CLinker(link.Linker):
The return value will not contain duplicates.
"""
ret = []
for x in [y.type for y in self.variables] + [y.op for y in self.node_order]:
try: ret += x.c_header_dirs()
except utils.MethodNotDefined: pass
for x in [y.type for y in self.variables] + [
y.op for y in self.node_order]:
try:
ret += x.c_header_dirs()
except utils.MethodNotDefined:
pass
return list(set(ret))
def libraries(self):
......@@ -689,9 +778,12 @@ class CLinker(link.Linker):
The return value will not contain duplicates.
"""
ret = []
for x in [y.type for y in self.variables] + [y.op for y in self.node_order]:
try: ret += x.c_libraries()
except utils.MethodNotDefined: pass
for x in [y.type for y in self.variables] + [
y.op for y in self.node_order]:
try:
ret += x.c_libraries()
except utils.MethodNotDefined:
pass
return list(set(ret))
def lib_dirs(self):
......@@ -702,12 +794,16 @@ class CLinker(link.Linker):
The return value will not contain duplicates.
"""
ret = []
for x in [y.type for y in self.variables] + [y.op for y in self.node_order]:
try: ret += x.c_lib_dirs()
except utils.MethodNotDefined: pass
for x in [y.type for y in self.variables] + [
y.op for y in self.node_order]:
try:
ret += x.c_lib_dirs()
except utils.MethodNotDefined:
pass
return list(set(ret))
def __compile__(self, input_storage = None, output_storage = None, keep_lock=False):
def __compile__(self, input_storage=None,
output_storage=None, keep_lock=False):
"""WRITEME
Compiles this linker's env.
......@@ -737,33 +833,37 @@ class CLinker(link.Linker):
input_storage,
output_storage,
keep_lock=keep_lock)
return thunk, \
[link.Container(input, storage) for input, storage in izip(self.env.inputs, input_storage)], \
[link.Container(output, storage, True) for output, storage in izip(self.env.outputs, output_storage)], \
error_storage
return (thunk,
[link.Container(input, storage) for input, storage in
izip(self.env.inputs, input_storage)],
[link.Container(output, storage, True) for output, storage in
izip(self.env.outputs, output_storage)],
error_storage)
def get_init_tasks(self):
init_tasks = []
tasks = []
id=1
id = 1
for v in self.variables:
if v in self.consts:
continue
if v in self.orphans and isinstance(v, graph.Constant):
try:
v.type.c_literal(v.data) #constant will be inlined, no need to get
# constant will be inlined, no need to get
v.type.c_literal(v.data)
continue
except (utils.MethodNotDefined, NotImplementedError):
pass
init_tasks.append((v, 'init', id))
tasks.append((v, 'get', id+1))
tasks.append((v, 'get', id + 1))
id += 2
for node in self.node_order:
tasks.append((node, 'code', id))
id += 1
return init_tasks, tasks
def make_thunk(self, input_storage = None, output_storage = None, keep_lock=False):
def make_thunk(self, input_storage=None, output_storage=None,
keep_lock=False):
"""WRITEME
Compiles this linker's env and returns a function to perform the
computations, as well as lists of storage cells for both the
......@@ -787,16 +887,18 @@ class CLinker(link.Linker):
first_output = ostor[0].data
"""
init_tasks, tasks = self.get_init_tasks()
cthunk, in_storage, out_storage, error_storage = self.__compile__(input_storage, output_storage,
keep_lock=keep_lock)
res = _CThunk(cthunk, init_tasks, tasks, error_storage), in_storage, out_storage
return res
cthunk, in_storage, out_storage, error_storage = self.__compile__(
input_storage, output_storage,
keep_lock=keep_lock)
res = _CThunk(cthunk, init_tasks, tasks, error_storage)
return res, in_storage, out_storage
def cmodule_key(self):
"""Return a complete hashable signature of the module we compiled.
This function must have the property that no two programs that compute different things
yield the same key.
This function must have the property that no two programs that
compute different things yield the same key.
The key returned by this function is of the form (version, signature)
The signature has the following form:
......@@ -817,8 +919,9 @@ class CLinker(link.Linker):
It is followed by elements for every node in the
topological ordering of `self.env`.
If the Op of any Apply in the Env does not have c_code_cache_ok()==True, then this
function raises a KeyError exception.
If the Op of any Apply in the Env does not have
c_code_cache_ok()==True, then this function raises a KeyError
exception.
Input Signature
---------------
......@@ -828,10 +931,13 @@ class CLinker(link.Linker):
type of the node input, and the nature of that input in the
graph.
The nature of a typical variable is encoded by integer pairs ``((a,b),c)``:
``a`` is the topological position of the input's owner (-1 for graph inputs),
The nature of a typical variable is encoded by integer pairs
``((a,b),c)``:
``a`` is the topological position of the input's owner
(-1 for graph inputs),
``b`` is the index of the variable in the owner's output list.
``c`` is a flag indicating whether the variable is in the no_recycling set.
``c`` is a flag indicating whether the variable is in the
no_recycling set.
If a variable is also a graph output, then its position in the
outputs list is also bundled with this tuple (after the b).
......@@ -865,6 +971,7 @@ class CLinker(link.Linker):
libraries=self.libraries(),
header_dirs=self.header_dirs(),
)
@staticmethod
def cmodule_key_(env, no_recycling, compile_args=[], libraries=[],
header_dirs=[], insert_config_md5=True):
......@@ -876,13 +983,14 @@ class CLinker(link.Linker):
#set of variables that have been computed by nodes we have
# seen 'so far' in the loop below
env_computed_set = set()
env_inputs_dict = dict((i, (-1, pos)) for pos, i in enumerate(env.inputs))
env_inputs_dict = dict((i, (-1, pos)) for pos, i in
enumerate(env.inputs))
constant_ids = dict()
op_pos = {} # Apply -> topological position
op_pos = {} # Apply -> topological position
# First we put the header, compile_args, library names and config md5
# into the signature.
sig = ['CLinker.cmodule_key'] # will be cast to tuple on return
sig = ['CLinker.cmodule_key'] # will be cast to tuple on return
if compile_args is not None:
# We must sort it as the order from a set is not guaranteed.
# In particular, 2 sets with the same content can give different
......@@ -912,6 +1020,7 @@ class CLinker(link.Linker):
sig.append('md5: <omitted>')
error_on_play = [False]
def in_sig(i, topological_pos, i_idx):
# assert that every input to every node is one of'
# - an env input
......@@ -920,7 +1029,7 @@ class CLinker(link.Linker):
# It is important that a variable (i)
# yield a 'position' that reflects its role in code_gen()
if isinstance(i, graph.Constant): #orphans
if isinstance(i, graph.Constant): # orphans
if id(i) not in constant_ids:
isig = (i.signature(), topological_pos, i_idx)
# If the Theano constant provides a strong hash
......@@ -933,7 +1042,8 @@ class CLinker(link.Linker):
isig = (isig[0].theano_hash(), topological_pos, i_idx)
try:
hash(isig)
except Exception: #generic constants don't have a hashable signature
except Exception:
#generic constants don't have a hashable signature
error_on_play[0] = True
return None
constant_ids[id(i)] = isig
......@@ -941,20 +1051,22 @@ class CLinker(link.Linker):
isig = constant_ids[id(i)]
#print 'SIGNATURE', i.signature()
#return i.signature()
elif i in env_inputs_dict: #inputs
elif i in env_inputs_dict: # inputs
isig = env_inputs_dict[i]
else:
if i.owner is None:
assert all( all(out is not None for out in o.outputs) for o in order)
assert all( input.owner is None for input in env.inputs)
raise Exception('what is this?', (i, type(i), i.clients, env))
assert all(all(out is not None for out in o.outputs)
for o in order)
assert all(input.owner is None for input in env.inputs)
raise Exception('what is this?', (i, type(i), i.clients,
env))
if i in env.outputs:
isig = (op_pos[i.owner], # outputs
isig = (op_pos[i.owner], # outputs
i.owner.outputs.index(i),
env.outputs.index(i))
else:
isig = (op_pos[i.owner], i.owner.outputs.index(i)) # temps
isig = (op_pos[i.owner], i.owner.outputs.index(i)) # temps
return (isig, i in no_recycling)
version = []
......@@ -973,7 +1085,7 @@ class CLinker(link.Linker):
sig.append((
node.op,
tuple((i.type, in_sig(i, node_pos, ipos))
for ipos,i in enumerate(node.inputs)),
for ipos, i in enumerate(node.inputs)),
tuple(o in no_recycling for o in node.outputs)))
if error_on_play[0]:
......@@ -1026,7 +1138,9 @@ class CLinker(link.Linker):
if compiler_name == 'NVCC_compiler' and config.lib.amdlibm:
# This lib does not work correctly with nvcc in device code.
# and newer version of g++ as 4.5.1.
# example of errors: "/usr/lib/gcc/x86_64-redhat-linux/4.5.1/include/mmintrin.h(49): error: identifier "__builtin_ia32_emms" is undefined"
# example of errors: "/usr/lib/gcc/x86_64-redhat-linux/4.5.1/
# include/mmintrin.h(49): error: identifier
# "__builtin_ia32_emms" is undefined"
if '<amdlibm.h>' in mod.includes:
mod.includes.remove('<amdlibm.h>')
......@@ -1057,7 +1171,8 @@ class CLinker(link.Linker):
yield module
def build_dynamic_module(self):
"""Return a cmodule.DynamicModule instance full of the code for our env.
"""Return a cmodule.DynamicModule instance full of the code
for our env.
"""
self.code_gen()
module_name = self.hash
......@@ -1065,13 +1180,16 @@ class CLinker(link.Linker):
mod = cmodule.DynamicModule(module_name)
# The code of instantiate
code = self.instantiate_code(1+len(self.args)) #the 1 is for error_storage
instantiate = cmodule.ExtFunction('instantiate', code, method=cmodule.METH_VARARGS)
# the 1 is for error_storage
code = self.instantiate_code(1 + len(self.args))
instantiate = cmodule.ExtFunction('instantiate', code,
method=cmodule.METH_VARARGS)
#['error_storage'] + argnames,
#local_dict = d,
#global_dict = {})
# Static methods that can run and destroy the struct built by instantiate.
# Static methods that can run and destroy the struct built by
# instantiate.
static = """
int %(struct_name)s_executor(%(struct_name)s* self) {
return self->run();
......@@ -1086,7 +1204,7 @@ class CLinker(link.Linker):
//printf("done cleanup\\n");
//fflush(stdout);
}
""" % dict(struct_name = self.struct_name)
""" % dict(struct_name=self.struct_name)
# We add all the support code, compile args, headers and libs we need.
for support_code in self.support_code() + self.c_support_code_apply:
......@@ -1099,8 +1217,8 @@ class CLinker(link.Linker):
return mod
def cthunk_factory(self, error_storage, in_storage, out_storage, keep_lock=False):
def cthunk_factory(self, error_storage, in_storage, out_storage,
keep_lock=False):
"""WRITEME
error_storage -> list of length 3
in_storage -> list of lists of length 1, one per input
......@@ -1120,18 +1238,22 @@ class CLinker(link.Linker):
# If we can't get a key, then forget the cache mechanism.
module = self.compile_cmodule()
else:
module = get_module_cache().module_from_key(key=key, fn=self.compile_cmodule_by_step, keep_lock=keep_lock)
module = get_module_cache().module_from_key(
key=key, fn=self.compile_cmodule_by_step, keep_lock=keep_lock)
vars = self.inputs + self.outputs + self.orphans
# List of indices that should be ignored when passing the arguments
# (basically, everything that the previous call to uniq eliminated)
dupidx = [i for i, x in enumerate(vars) if vars.count(x) > 1 and vars.index(x) != i]
dupidx = [i for i, x in enumerate(vars)
if vars.count(x) > 1 and vars.index(x) != i]
out_storage = [x for i, x in enumerate(out_storage) if (i+len(in_storage)) not in dupidx]
out_storage = [x for i, x in enumerate(out_storage)
if (i + len(in_storage)) not in dupidx]
in_storage = [x for i, x in enumerate(in_storage) if i not in dupidx]
orphd = [[orphan.data] for orphan in self.orphans]
ret = module.instantiate(error_storage, *(in_storage + out_storage + orphd))
ret = module.instantiate(error_storage, *(in_storage + out_storage +
orphd))
return ret
......@@ -1150,6 +1272,7 @@ class CLinker(link.Linker):
print >> code, " return thunk; }"
return code.getvalue()
class _CThunk(object):
"""
A thunk with a C implementation
......@@ -1181,7 +1304,8 @@ class _CThunk(object):
n = len(self.init_tasks)
# note that the failure code is distributed in two lists
if failure_code < 2 * n:
return [self.init_tasks, self.tasks][failure_code % 2][failure_code/2]
return [self.init_tasks, self.tasks][
failure_code % 2][failure_code / 2]
else:
return self.tasks[failure_code - n]
......@@ -1199,19 +1323,16 @@ class _CThunk(object):
exc_value = exc_type(_exc_value, task, task.outputs)
else:
exc_value = exc_type(_exc_value, task)
exc_value.__thunk_trace__ = trace # this can be used to retrieve the location the Op was declared
# this can be used to retrieve the location the Op was declared
exc_value.__thunk_trace__ = trace
except Exception:
print >> sys.stderr, 'ERROR retrieving error_storage', self.error_storage
print >> sys.stderr, 'ERROR retrieving error_storage',
print >> sys.stderr, self.error_storage
raise
raise exc_type, exc_value, exc_trace
class OpWiseCLinker(link.LocalLinker):
"""WRITEME
Uses CLinker on the individual Ops that comprise an env and loops
......@@ -1227,27 +1348,30 @@ class OpWiseCLinker(link.LocalLinker):
If a Variable is in no_recycling, CLinker will clear the output storage
associated to it prior to computation (to avoid reusing it).
:note: This is in a sense the 'default' linker for Theano. The overhead of using the
OpWiseCLinker as compared with the CLinker is only noticeable for graphs of very small
tensors (such as 20 elements or less)
:note: This is in a sense the 'default' linker for Theano. The
overhead of using the OpWiseCLinker as compared with the CLinker
is only noticeable for graphs of very small tensors (such as 20
elements or less)
"""
__cache__ = {}
def __init__(self,
fallback_on_perform = True,
allow_gc = True,
nice_errors = True):
fallback_on_perform=True,
allow_gc=True,
nice_errors=True):
self.env = None
self.fallback_on_perform = fallback_on_perform
self.nice_errors = nice_errors
self.allow_gc = allow_gc
def accept(self, env, no_recycling = []):
def accept(self, env, no_recycling=[]):
if self.env is not None and self.env is not env:
return type(self)(self.fallback_on_perform).accept(env, no_recycling)
#raise Exception("Cannot accept from a Linker that is already tied to another Env.")
return type(self)(self.fallback_on_perform).accept(env,
no_recycling)
#raise Exception("Cannot accept from a Linker that is
#already tied to another Env.")
self.env = env
self.no_recycling = no_recycling
return self
......
......@@ -5,14 +5,12 @@ import logging
import sys
import time
import link
import traceback
from theano.gof.python25 import all
import theano
config = theano.config
from theano.configparser import config, AddConfigVar, BoolParam
from theano import config
logger = logging.getLogger(__name__)
......@@ -33,13 +31,13 @@ class VM(object):
number of times thunks[i] was called in the course of computations
performed by call_with_timers().
call_times - list of floats, one for each thunk. call_times[i] is the amount
of runtime spent on thunks[i] in the course of computations performed by
call_with_timers().
call_times - list of floats, one for each thunk. call_times[i] is
the amount of runtime spent on thunks[i] in the course of
computations performed by call_with_timers().
need_update_inputs - bool. True indicates that Function.__call__ must
implement the feedback from output storage to input storage. False means
it *must not* repeat that feedback.
need_update_inputs - bool. True indicates that Function.__call__
must implement the feedback from output storage to input
storage. False means it *must not* repeat that feedback.
"""
def __init__(self, nodes, thunks, pre_call_clear):
......@@ -58,8 +56,8 @@ class VM(object):
self.nodes = nodes
self.thunks = thunks
self.pre_call_clear = pre_call_clear
self.call_counts = [0]*len(nodes)
self.call_times = [0]*len(nodes)
self.call_counts = [0] * len(nodes)
self.call_times = [0] * len(nodes)
self.time_thunks = False
# This variable (self.need_update_inputs) is overshadowed by
......@@ -88,14 +86,15 @@ class VM(object):
def update_profile(self, profile):
# accumulate into the profile object
for node, thunk, t, c in zip(self.nodes, self.thunks, self.call_times, self.call_counts):
profile.apply_time.setdefault(node,0.0)
for node, thunk, t, c in zip(self.nodes, self.thunks,
self.call_times, self.call_counts):
profile.apply_time.setdefault(node, 0.0)
profile.apply_time[node] += t
profile.apply_callcount.setdefault(node,0)
profile.apply_callcount.setdefault(node, 0)
profile.apply_callcount[node] += c
profile.apply_cimpl[node] = hasattr(thunk,'cthunk')
profile.apply_cimpl[node] = hasattr(thunk, 'cthunk')
# clear the timer info out of the buffers
for i in xrange(len(self.call_times)):
......@@ -113,7 +112,8 @@ class Loop(VM):
for cont in self.pre_call_clear:
cont[0] = None
try:
for i, (thunk, node) in enumerate(zip(self.thunks, self.nodes)):
for i, (thunk, node) in enumerate(zip(self.thunks,
self.nodes)):
t0 = time.time()
thunk()
t1 = time.time()
......@@ -141,13 +141,16 @@ class LoopGC(VM):
self.post_thunk_clear = post_thunk_clear
if not (len(nodes) == len(thunks) == len(post_thunk_clear)):
raise ValueError()
def __call__(self):
if self.time_thunks:
for cont in self.pre_call_clear:
cont[0] = None
try:
i = 0
for thunk, node, old_storage in zip(self.thunks, self.nodes, self.post_thunk_clear):
for thunk, node, old_storage in zip(self.thunks,
self.nodes,
self.post_thunk_clear):
t0 = time.time()
thunk()
t1 = time.time()
......@@ -162,7 +165,8 @@ class LoopGC(VM):
for cont in self.pre_call_clear:
cont[0] = None
try:
for thunk, node, old_storage in zip(self.thunks, self.nodes, self.post_thunk_clear):
for thunk, node, old_storage in zip(self.thunks, self.nodes,
self.post_thunk_clear):
thunk()
for old_s in old_storage:
old_s[0] = None
......@@ -200,8 +204,8 @@ class Stack(VM):
for i, node in enumerate(self.nodes):
node_idx[node] = i
self.apply_time[node] = 0
self.outputs_size[node] = []
self.apply_time[node] = 0
self.outputs_size[node] = []
node.destroy_dependencies = []
if node in ords:
for prereq in ords[node]:
......@@ -217,9 +221,9 @@ class Stack(VM):
if cl[0] is not 'output':
ls += cl[0].outputs
dependencies[k] += ls
if config.profile:
self.memory_size_map = {"nt8": 1, "t16": 2, "t32": 4, "t64": 8, "128": 16}
self.memory_size_map = {"nt8": 1, "t16": 2, "t32": 4,
"t64": 8, "128": 16}
atexit.register(self.atexit_print_all)
def run_thunk_of_node(self, node):
......@@ -257,11 +261,13 @@ class Stack(VM):
last_apply_stack_len = -1
ls = []
while apply_stack:
# Make sure something happened last time round.
# This is just a safety check to make sure the op is written correctly
# apply_stack should either decrease in length by one (a thunk successfully applied), or
# increase in length (added dependencies over and above the original).
# NB: this doesn't catch cycles (would be too expensive/slow), just stalls.
# Make sure something happened last time round. This is
# just a safety check to make sure the op is written
# correctly apply_stack should either decrease in length
# by one (a thunk successfully applied), or increase in
# length (added dependencies over and above the original).
# NB: this doesn't catch cycles (would be too expensive/slow),
# just stalls.
apply_stack_len = len(apply_stack)
assert apply_stack_len != last_apply_stack_len
last_apply_stack_len = apply_stack_len
......@@ -289,8 +295,8 @@ class Stack(VM):
if not thunks[self.node_idx[current_apply]].lazy:
# Check if all inputs are in place
# If so compute thunk and remove it from the apply_stack
# If not leave it in, and add to the apply_stack those that will
# produce you those inputs
# If not leave it in, and add to the apply_stack those
# that will produce you those inputs
if computed_ins and not computed_outs:
try:
......@@ -302,22 +308,26 @@ class Stack(VM):
# ?? What about inplace .. if the op is inplace
# you don't actually ask for more memory!
size = []
for (idx,o) in enumerate(
thunks[self.node_idx[current_apply]].outputs):
if not hasattr(o[0],'size'):
for (idx, o) in enumerate(
thunks[self.node_idx[
current_apply]].outputs):
if not hasattr(o[0], 'size'):
size.append(-1)
continue
s=o[0].size
s = o[0].size
dtype = str(o[0].dtype)
dtype2 = dtype[-3:]
s *= self.memory_size_map[dtype2] # KeyError here: couldn't determine the dtype memory size
# KeyError here: couldn't determine
# the dtype memory size
s *= self.memory_size_map[dtype2]
size.append(s)
self.outputs_size[current_apply] = size
except Exception:
raise_with_op(current_apply)
for o in current_apply.outputs:
compute_map[o][0] = 1
# Garbage Collection -> check if anybody else uses this input
# Garbage Collection -> check if anybody else uses
# this input
if self.allow_gc:
for i in current_apply.inputs:
if (dependencies[i] and i.owner
......@@ -332,8 +342,11 @@ class Stack(VM):
elif not computed_ins:
apply_stack.append(current_apply)
apply_stack.extend(inp.owner for inp in current_apply.inputs if inp.owner)
apply_stack.extend(inp.owner for inp in current_apply.destroy_dependencies if inp.owner)
apply_stack.extend(inp.owner for inp
in current_apply.inputs if inp.owner)
apply_stack.extend(inp.owner for inp
in current_apply.destroy_dependencies
if inp.owner)
elif not computed_outs:
# Try and run it to see if it works
......@@ -346,22 +359,26 @@ class Stack(VM):
if requires:
for r in requires:
# We are not done with this op ..
# so we added back and see to get the inputs we are missing
# We are not done with this op .. so we added
# back and see to get the inputs we are
# missing
apply_stack.append(current_apply)
if current_apply.inputs[r].owner:
apply_stack.append(current_apply.inputs[r].owner)
else:
if config.profile:
size = []
for (idx,o) in enumerate(thunks[self.node_idx[current_apply]].outputs):
for (idx, o) in enumerate(thunks[
self.node_idx[current_apply]].outputs):
if not hasattr(o[0], 'size'):
size.append(-1)
continue
s=o[0].size
dtype = str(o[0].dtype)
dtype2 = dtype[-2:]
s *= self.memory_size_map[dtype2] # KeyError here: couldn't determine the dtype memory size
# KeyError here: couldn't determine the
# dtype memory size
s *= self.memory_size_map[dtype2]
size.append(s)
self.outputs_size[current_apply] = size
if self.allow_gc:
......@@ -379,6 +396,7 @@ class Stack(VM):
try:
import lazylinker_c
class CVM(lazylinker_c.CLazyLinker, VM):
def __init__(self, *args, **kwargs):
lazylinker_c.CLazyLinker.__init__(self, *args, **kwargs)
......@@ -394,9 +412,9 @@ class VM_Linker(link.LocalLinker):
def __init__(self, allow_gc=True, use_cloop=False, callback=None):
"""
allow_gc - force the virtual machine to clean up unnecessary references,
in order to allow garbage collection on intermediate values during
computation of a function.
allow_gc - force the virtual machine to clean up unnecessary
references, in order to allow garbage collection on
intermediate values during computation of a function.
use_cloop - use the C-based virtual machine if possible
......@@ -411,9 +429,10 @@ class VM_Linker(link.LocalLinker):
self.callback = callback
self.updated_vars = {}
def accept(self, env, no_recycling = []):
def accept(self, env, no_recycling=[]):
"""
:param env: a PerformLinker can have accepted one Env instance at a time.
:param env: a PerformLinker can have accepted one Env instance
at a time.
:param no_recycling: WRITEME
......@@ -464,9 +483,9 @@ class VM_Linker(link.LocalLinker):
nodes_idx_inv = {}
vars_idx_inv = {}
for (node,i) in nodes_idx.items():
for (node, i) in nodes_idx.items():
nodes_idx_inv[i] = node
for (var,i) in vars_idx.items():
for (var, i) in vars_idx.items():
vars_idx_inv[i] = var
# put storage_map and compute_map into a int-based scheme
......@@ -496,8 +515,8 @@ class VM_Linker(link.LocalLinker):
base_input_output_list.extend(outputs_idx)
# build the var owner array
var_owner = [None]*len(vars_idx)
for (var,i) in vars_idx.items():
var_owner = [None] * len(vars_idx)
for (var, i) in vars_idx.items():
if var.owner:
var_owner[i] = nodes_idx[var.owner]
......@@ -511,18 +530,18 @@ class VM_Linker(link.LocalLinker):
for i, node in enumerate(nodes):
node_output_size.append(0)
prereq_var_idxs = []
for prereq_node in ords.get(node,[]):
for prereq_node in ords.get(node, []):
prereq_var_idxs.extend(
[vars_idx[v] for v in prereq_node.outputs])
prereq_var_idxs = list(set(prereq_var_idxs))
prereq_var_idxs.sort() # TODO: why sort?
prereq_var_idxs.sort() # TODO: why sort?
node_prereqs.append(prereq_var_idxs)
update_storage = []
for (ivar, ovar) in updated_vars.items():
if ivar != ovar:
update_storage.append(vars_idx[ivar]) #dst
update_storage.append(vars_idx[ovar]) #src
update_storage.append(vars_idx[ivar]) # dst
update_storage.append(vars_idx[ovar]) # src
c0 = sys.getrefcount(node_n_inputs)
vm = CVM(
......@@ -530,8 +549,8 @@ class VM_Linker(link.LocalLinker):
thunks,
pre_call_clear,
allow_gc=self.allow_gc,
call_counts=[0]*len(nodes),
call_times=[0.0]*len(nodes),
call_counts=[0] * len(nodes),
call_times=[0.0] * len(nodes),
compute_map_list=compute_map_list,
storage_map_list=storage_map_list,
base_input_output_list=base_input_output_list,
......@@ -569,7 +588,7 @@ class VM_Linker(link.LocalLinker):
)
return vm
def make_all(self, profiler = None, input_storage = None,
def make_all(self, profiler=None, input_storage=None,
output_storage = None,
):
env = self.env
......@@ -617,4 +636,3 @@ class VM_Linker(link.LocalLinker):
for output, storage in zip(env.outputs, output_storage)],
thunks,
order)
......@@ -346,7 +346,9 @@ def handle_shared_float32(tf):
theano.compile.shared_constructor(float32_shared_constructor)
else:
raise NotImplementedError('removing our handler')
theano.compile.shared_constructor(float32_shared_constructor, True)
assert (float32_shared_constructor not in
theano.compile.shared.constructors)
# We can't test the driver during import here as this cause circular
# import dependency. So we also test it in the file theano/__init__.py
......
from theano.sparse.basic import * # To facilitate later merge into sparse module
from theano.sparse.basic import _is_sparse, _is_sparse_variable, \
_is_dense_variable, _is_sparse, _is_dense, _kmap_eq, _kmap_hash
from theano.sparse.basic import (
_is_sparse, _is_sparse_variable, _is_dense_variable,
_is_sparse, _is_dense, _kmap_eq, _kmap_hash)
class Cast(gof.op.Op):
def __init__(self, out_type):
self.out_type = out_type
def __eq__(self, other):
return (type(self) == type(other)) and self.out_type == other.out_type
def __hash__(self):
return hash(type(self)) ^ hash(self.out_type)
def make_node(self, x):
x = as_sparse_variable(x)
return gof.Apply(self, [x],
[SparseType(dtype=self.out_type, format=x.format).make_variable()])
def perform(self, node, (x, ), (out, )):
assert _is_sparse(x)
out[0] = x
......@@ -20,95 +26,118 @@ class Cast(gof.op.Op):
fcast = Cast('float32')
dcast = Cast('float64')
class Poisson(gof.op.Op):
def __eq__(self, other):
return (type(self) == type(other))
def __hash__(self):
return hash(type(self))
def make_node(self, x):
x = as_sparse_variable(x)
return gof.Apply(self, [x], [x.type()])
def perform(self, node, (x, ), (out, )):
assert _is_sparse(x)
out[0] = x.copy()
out[0].data = numpy.asarray(numpy.random.poisson(out[0].data), dtype=x.dtype)
out[0].data = numpy.asarray(numpy.random.poisson(out[0].data),
dtype=x.dtype)
out[0].eliminate_zeros()
poisson = Poisson()
class Multinomial(gof.op.Op):
def __eq__(self, other):
return (type(self) == type(other))
def __hash__(self):
return hash(type(self))
def make_node(self, n, p):
n = tensor.as_tensor_variable(n)
p = as_sparse_variable(p)
return gof.Apply(self, [n, p], [p.type()])
def perform(self, node, (n, p), (out, )):
assert _is_sparse(p)
if p.format != 'csr':
raise NotImplemented()
out[0] = p.copy()
for i in xrange(p.shape[0]):
k, l = p.indptr[i], p.indptr[i+1]
k, l = p.indptr[i], p.indptr[i + 1]
out[0].data[k:l] = numpy.random.multinomial(n[i], p.data[k:l])
multinomial = Multinomial()
class EliminateZeros(gof.op.Op):
def __eq__(self, other):
return (type(self) == type(other))
def __hash__(self):
return hash(type(self))
def make_node(self, x):
x = as_sparse_variable(x)
return gof.Apply(self, [x], [x.type()])
def perform(self, node, (x, ), (out, )):
assert _is_sparse(x)
out[0] = x.copy()
out[0].eliminate_zeros()
eliminate_zeros = EliminateZeros()
class Sum(gof.op.Op):
def __eq__(self, other):
return (type(self) == type(other))
def __hash__(self):
return hash(type(self))
def make_node(self, x, a):
x = as_sparse_variable(x)
a = tensor.as_tensor_variable(a)
return gof.Apply(self, [x, a], [tensor.TensorType(dtype = x.type.dtype,
broadcastable = (False,)).make_variable()])
return gof.Apply(self, [x, a], [tensor.TensorType(dtype=x.type.dtype,
broadcastable=(False,)).make_variable()])
def perform(self, node, (x, a), (out, )):
assert _is_sparse(x)
out[0] = numpy.asarray(x.sum(a), dtype=x.dtype).flatten()
sum = Sum()
class Binomial(gof.op.Op):
def __init__(self, format, dtype):
self.format = format
self.dtype = dtype
def __eq__(self, other):
return (type(self) == type(other)) and self.format == other.format and \
self.dtype == other.dtype
return ((type(self) == type(other)) and
self.format == other.format and
self.dtype == other.dtype)
def __hash__(self):
return hash(type(self)) ^ hash(self.format) ^ hash(self.dtype)
def make_node(self, n, p, shape):
n = tensor.as_tensor_variable(n)
p = tensor.as_tensor_variable(p)
shape = tensor.as_tensor_variable(shape)
return gof.Apply(self, [n, p, shape], [SparseType(dtype = self.dtype,
format = self.format).make_variable()])
return gof.Apply(self, [n, p, shape], [SparseType(dtype=self.dtype,
format=self.format).make_variable()])
def perform(self, node, (n, p, shape, ), (out, )):
N = n * p * shape[0] * shape[1]
data = numpy.ones(N, dtype=self.dtype)
row = numpy.random.randint(0, shape[0], N)
col = numpy.random.randint(0, shape[1], N)
res = scipy.sparse.coo_matrix((data, (row, col)), shape=shape)
out[0] = getattr(res, 'to' + self.format)()
out[0].data = numpy.ones_like(out[0].data)
csr_fbinomial = Binomial('csr', 'float32')
......@@ -116,16 +145,17 @@ csc_fbinomial = Binomial('csc', 'float32')
# Pre-built Binomial samplers for float64 CSR/CSC outputs.
csr_dbinomial = Binomial('csr', 'float64')
csc_dbinomial = Binomial('csc', 'float64')
def structured_sigmoid(x):
    """
    Element-wise sigmoid function only to the non-zero elements.
    """
    x = as_sparse_variable(x)
    data, indices, indptr, shape = csm_properties(x)
    # Apply sigmoid to the stored values only; the sparsity structure
    # (indices/indptr/shape) is passed through unchanged.
    return CSR(tensor.nnet.sigmoid(data), indices, indptr, shape)
......@@ -134,11 +164,11 @@ def structured_exp(x):
Element-wise exponential function to the non-zero elements.
"""
x = as_sparse_variable(x)
x_data, x_ind, x_ptr, x_shape = csm_properties(x)
x_data = tensor.exp(x_data)
return CSR(x_data, x_ind, x_ptr, x_shape)
......@@ -162,13 +192,13 @@ def structured_minimum(x, y):
Element-wise minimum function only to non-zero elements.
"""
x = as_sparse_variable(x)
y = tensor.as_tensor_variable(y)
x_data, x_ind, x_ptr, x_shape = csm_properties(x)
x_data = tensor.minimum(x_data, y)
return CSR(x_data, x_ind, x_ptr, x_shape)
......@@ -179,24 +209,28 @@ class StructuredAddSV(gof.op.Op):
matrix.'''
def __eq__(self, other):
    # Parameter-free Op: equality is identity of the Op class.
    return (type(self) == type(other))
def __hash__(self):
    # Consistent with __eq__: hash only on the Op class.
    return hash(type(self))
def make_node(self, x, y):
    # x: sparse matrix; y: dense 1-d vector added to x's nonzeros.
    x = as_sparse_variable(x)
    y = tensor.as_tensor_variable(y)

    assert y.type.ndim == 1

    if x.type.dtype != y.type.dtype:
        raise NotImplementedError()
    # The flattened diff duplicated the output-spec list, which is not
    # valid syntax; a single copy is kept.
    return gof.Apply(self,
                     [x, y],
                     [SparseType(dtype=x.type.dtype,
                                 format=x.type.format).make_variable()])
def perform(self, node, (x, y), (out, )):
assert _is_sparse(x) and not _is_sparse(y)
assert x.shape[1] == y.shape[0]
out[0] = x.__class__(x + (x.toarray() != 0) * y)
def grad(self, (x, y), (gz,)):
assert _is_sparse_variable(x) and _is_sparse_variable(y)
assert _is_sparse_variable(gz)
......@@ -207,14 +241,18 @@ structured_add_s_v = StructuredAddSV()
class StrucutedAddSVCSR(gof.Op):
def __eq__(self, other):
    # Parameter-free Op: equality is identity of the Op class.
    return (type(self) == type(other))
def __hash__(self):
    # Consistent with __eq__: hash only on the Op class.
    return hash(type(self))
def make_node(self, a_data, a_indices, a_indptr, b):
    # a_data/a_indices/a_indptr: CSR components of the sparse operand;
    # b: dense 1-d vector to add to the structural nonzeros.
    assert b.type.ndim == 1
    inputs = [a_data, a_indices, a_indptr, b]
    # Output is the updated data array: a dense vector with b's dtype.
    out_type = tensor.tensor(b.dtype, (False,))
    return gof.Apply(self, inputs, [out_type])
def c_code(self, node, name, (_data, _indices, _indptr, _b,), (_zout, ), sub):
def c_code(self, node, name, inputs, outputs, sub):
_data, _indices, _indptr, _b, = inputs
_zout, = outputs
if node.inputs[0].type.dtype in ('complex64', 'complex128'):
raise NotImplementedError('Complex types are not supported for a')
if node.inputs[3].type.dtype in ('complex64', 'complex128'):
......@@ -272,98 +310,105 @@ class StrucutedAddSVCSR(gof.Op):
}
}
"""% dict(locals(), **sub)
""" % dict(locals(), **sub)
structured_add_s_v_csr = StrucutedAddSVCSR()
@gof.local_optimizer([structured_add_s_v])
def local_structured_add_s_v(node):
    """Rewrite StructuredAddSV(sparse, vector) into the CSR-specialized
    StrucutedAddSVCSR applied to the CSR components.

    Returns a replacement list on success, False otherwise.
    (The flattened diff had duplicated both branches of the if/else and
    both the live and commented form of the y check; one copy of each
    is kept.)
    """
    if node.op == structured_add_s_v:
        x, y = node.inputs

        x_is_sparse_variable = _is_sparse_variable(x)
        #y_is_sparse_variable = _is_sparse_variable(y)

        # Exactly one input is sparse; the other is the dense vector.
        if x_is_sparse_variable:
            svar = x
            dvar = y
        else:
            svar = y
            dvar = x

        if dvar.type.ndim != 1:
            return False
        elif svar.type.format == 'csr':
            CSx = CSR
            structured_add_s_v_csx = structured_add_s_v_csr
        else:
            # Fixed: was `raise NotImplemented()` -- NotImplemented is
            # not an exception class and is not callable.
            raise NotImplementedError()

        s_val, s_ind, s_ptr, s_shape = csm_properties(svar)

        c_data = structured_add_s_v_csx(s_val, s_ind, s_ptr, dvar)

        return [CSx(c_data, s_ind, s_ptr, s_shape)]

    return False
register_specialize(local_structured_add_s_v)
class SamplingDot(gof.op.Op):
"""
Operand for calculating the dot product DOT(X, Y) = Z when you only want to calculate
a subset of Z. It is equivalent to P o (X . Y) where o is the element-wise product, X and Y operands of
the dot product and P is a matrix that contains 1 when the corresponding element of Z should be calculated
and 0 when it shouldn't. Note that SamplingDot has a different interface than DOT because SamplingDot
requires X to be a MxK matrix while Y is a NxK matrix instead of the usual KxN matrix.
It will work if the pattern is not binary value, but if the pattern doesn't have a high sparsity proportion
it will be slower then a more optimized dot followed by a normal elemwise multiplication.
Operand for calculating the dot product DOT(X, Y) = Z when you
only want to calculate a subset of Z. It is equivalent to P o (X
. Y) where o is the element-wise product, X and Y operands of the
dot product and P is a matrix that contains 1 when the
corresponding element of Z should be calculated and 0 when it
shouldn't. Note that SamplingDot has a different interface than
DOT because SamplingDot requires X to be a MxK matrix while Y is a
NxK matrix instead of the usual KxN matrix.
It will work if the pattern is not binary value, but if the
pattern doesn't have a high sparsity proportion it will be slower
then a more optimized dot followed by a normal elemwise
multiplication.
"""
def __eq__(self, other):
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
def __str__(self):
return 'SamplingDot'
def make_node(self, x, y, p):
x = tensor.as_tensor_variable(x)
y = tensor.as_tensor_variable(y)
if not _is_sparse_variable(p):
raise TypeError(p)
dtype_out = scalar.upcast(x.type.dtype, y.type.dtype, p.type.dtype)
return gof.Apply(self, [x, y, p], [p.type()])
def perform(self, node, (x, y, p), (out,)):
if _is_sparse_variable(x):
raise TypeError(x)
if _is_sparse_variable(y):
raise TypeError(y)
if not _is_sparse(p):
raise TypeError(p)
rval = p.__class__(p.multiply(numpy.dot(x, y.T)))
out[0] = rval
def grad(self, (x, y, p), (gz,)):
rval = [
dot(gz, y),
dot(gz.T, x),
None
]
return rval
sampling_dot = SamplingDot()
class SamplingDotCsr(gof.Op):
"""
Optimized SamplingDot when the pattern P is a CSR matrix.
......@@ -374,13 +419,13 @@ class SamplingDotCsr(gof.Op):
"""
def __eq__(self, other):
    # Parameter-free Op: equality is identity of the Op class.
    return type(self) == type(other)
def __hash__(self):
    # Consistent with __eq__: hash only on the Op class.
    return hash(type(self))
def __str__(self):
    # Name shown when the graph is printed/debugged.
    return 'SamplingDot{Csr}'
def make_node(self, x, y, p_data, p_ind, p_ptr, p_ncols):
x = tensor.as_tensor_variable(x)
y = tensor.as_tensor_variable(y)
......@@ -388,12 +433,13 @@ class SamplingDotCsr(gof.Op):
p_ind = tensor.as_tensor_variable(p_ind)
p_ptr = tensor.as_tensor_variable(p_ptr)
p_ncols = tensor.as_tensor_variable(p_ncols)
assert p_ncols.dtype == 'int32'
dtype_out = scalar.upcast(x.type.dtype, y.type.dtype, p_data.type.dtype)
dtype_out = scalar.upcast(x.type.dtype, y.type.dtype,
p_data.type.dtype)
dot_out = scalar.upcast(x.type.dtype, y.type.dtype)
# We call blas ?dot function that take only param of the same type
x = tensor.cast(x, dot_out)
y = tensor.cast(y, dot_out)
......@@ -406,7 +452,7 @@ class SamplingDotCsr(gof.Op):
def c_support_code(self):
    # BLAS declarations needed by the ?dot_sub_ calls in c_code.
    return blas.blas_header_text()
def c_libraries(self):
    # Libraries to link the generated C code against (BLAS).
    # Removed leftover debugging breakpoint
    # (`import pdb; pdb.set_trace()`) that halted every compilation
    # of this Op.
    return blas.ldflags()
......@@ -416,19 +462,24 @@ class SamplingDotCsr(gof.Op):
def c_lib_dirs(self):
    # Directories to search for the BLAS libraries.
    return blas.ldflags(libs=False, libs_dir=True)
def c_header_dirs(self):
    # Directories to search for the BLAS headers.
    return blas.ldflags(libs=False, include_dir=True)
def c_code(self, node, name, (x, y, p_data, p_ind, p_ptr, p_ncols), (z_data, z_ind, z_ptr), sub):
def c_code(self, node, name, inputs, outputs, sub):
x, y, p_data, p_ind, p_ptr, p_ncols = inputs
z_data, z_ind, z_ptr = outputs
if node.inputs[0].type.dtype in ('complex64', 'complex128'):
raise NotImplementedError('Complex types are not supported for x')
if node.inputs[1].type.dtype in ('complex64', 'complex128'):
raise NotImplementedError('Complex types are not supported for y')
if node.inputs[2].type.dtype in ('complex64', 'complex128'):
raise NotImplementedError('Complex types are not supported for pattern')
dot_out = scalar.upcast(node.inputs[0].type.dtype, node.inputs[0].type.dtype)
raise NotImplementedError(
'Complex types are not supported for pattern')
# TODO: why 2 times the same inputs?
dot_out = scalar.upcast(node.inputs[0].type.dtype,
node.inputs[0].type.dtype)
if dot_out == "float32":
conv_type = "float"
......@@ -436,13 +487,17 @@ class SamplingDotCsr(gof.Op):
else:
conv_type = "double"
cdot = "ddot_sub_"
typenum_x = node.inputs[0].type.dtype_specs()[-1] # retrieve dtype number
typenum_y = node.inputs[1].type.dtype_specs()[-1] # retrieve dtype number
typenum_p = node.inputs[2].type.dtype_specs()[-1] # retrieve dtype number
typenum_zd = tensor.TensorType(node.outputs[0].dtype, []).dtype_specs()[-1] # retrieve dtype number
typenum_zi = tensor.TensorType(node.outputs[1].dtype, []).dtype_specs()[-1] # retrieve dtype number
typenum_zp = tensor.TensorType(node.outputs[2].dtype, []).dtype_specs()[-1] # retrieve dtype number
# retrieve dtype number
typenum_x = node.inputs[0].type.dtype_specs()[-1]
typenum_y = node.inputs[1].type.dtype_specs()[-1]
typenum_p = node.inputs[2].type.dtype_specs()[-1]
typenum_zd = tensor.TensorType(node.outputs[0].dtype,
[]).dtype_specs()[-1]
typenum_zi = tensor.TensorType(node.outputs[1].dtype,
[]).dtype_specs()[-1]
typenum_zp = tensor.TensorType(node.outputs[2].dtype,
[]).dtype_specs()[-1]
rval = """
if (%(x)s->nd != 2) {PyErr_SetString(PyExc_NotImplementedError, "rank(x) != 2"); %(fail)s;}
......@@ -529,13 +584,14 @@ class SamplingDotCsr(gof.Op):
Dzd[n_idx * Sdzd] *= Dpd[n_idx * Sdpd];
}
}
}
}
"""% dict(locals(), **sub)
""" % dict(locals(), **sub)
return rval
sampling_dot_csr = SamplingDotCsr()
# register a specialization to replace SamplingDot -> SamplingDotCsr
@gof.local_optimizer([sampling_dot])
def local_sampling_dot_csr(node):
......@@ -543,10 +599,10 @@ def local_sampling_dot_csr(node):
x, y, p = node.inputs
if p.type.format == 'csr':
p_data, p_ind, p_ptr, p_shape = csm_properties(p)
z_data, z_ind, z_ptr = sampling_dot_csr(x, y, p_data,
p_ind, p_ptr, p_shape[1])
return [CSR(z_data, z_ind, z_ptr, p_shape)]
return False
register_specialize(local_sampling_dot_csr, name='local_sampling_dot_csr')
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论