提交 d653a636 authored 作者: desjagui@atchoum.iro.umontreal.ca's avatar desjagui@atchoum.iro.umontreal.ca

merge

aa.x : aa.cc aa.x : aa.cc
g++ -O3 -ffast-math aa.cc -o aa.x -L${PUB_PREFIX}/lib -lgsl -lcblas -lgoto -lgfortran -lm g++ -O3 -ffast-math aa.cc -o aa.x -L${PUB_PREFIX}/lib -lgsl ${THEANO_BLAS_LDFLAGS}
clean : clean :
rm aa.x rm aa.x
...@@ -28,6 +28,7 @@ int main(int argc, char **argv) ...@@ -28,6 +28,7 @@ int main(int argc, char **argv)
int neg = strtol(argv[1], 0, 0); int neg = strtol(argv[1], 0, 0);
int nout = strtol(argv[2], 0, 0); int nout = strtol(argv[2], 0, 0);
int nin = nout;
int nhid = strtol(argv[3], 0, 0); int nhid = strtol(argv[3], 0, 0);
int niter = strtol(argv[4], 0, 0); int niter = strtol(argv[4], 0, 0);
double lr = 0.01; double lr = 0.01;
...@@ -35,8 +36,8 @@ int main(int argc, char **argv) ...@@ -35,8 +36,8 @@ int main(int argc, char **argv)
gsl_rng_set(rng, 234); gsl_rng_set(rng, 234);
gsl_matrix * x = gsl_matrix_alloc(neg, nout); gsl_matrix * x = gsl_matrix_alloc(neg, nin);
gsl_matrix * w = gsl_matrix_alloc(nout, nhid); gsl_matrix * w = gsl_matrix_alloc(nin, nhid);
gsl_vector * a = gsl_vector_alloc(nhid); gsl_vector * a = gsl_vector_alloc(nhid);
gsl_vector * b = gsl_vector_alloc(nout); gsl_vector * b = gsl_vector_alloc(nout);
gsl_matrix * xw = gsl_matrix_alloc(neg, nhid); gsl_matrix * xw = gsl_matrix_alloc(neg, nhid);
...@@ -59,11 +60,17 @@ int main(int argc, char **argv) ...@@ -59,11 +60,17 @@ int main(int argc, char **argv)
struct timeval tv0, tv1; struct timeval tv0, tv1;
struct timeval tdot0, tdot1;
double time_of_dot = 0.0;
gettimeofday(&tv0, 0); gettimeofday(&tv0, 0);
double err = 0.0; double err = 0.0;
for (int iter = 0; iter < niter; ++iter) for (int iter = 0; iter < niter; ++iter)
{ {
gettimeofday(&tdot0, 0);
gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, x, w, 0.0, xw); gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, x, w, 0.0, xw);
gettimeofday(&tdot1, 0);
time_of_dot += pytime(&tdot1) - pytime(&tdot0);
for (int i = 0; i < neg; ++i) for (int i = 0; i < neg; ++i)
for (int j = 0; j < nhid; ++j) for (int j = 0; j < nhid; ++j)
...@@ -72,7 +79,10 @@ int main(int argc, char **argv) ...@@ -72,7 +79,10 @@ int main(int argc, char **argv)
hid->data[i*nhid+j] = tanh(act); hid->data[i*nhid+j] = tanh(act);
} }
gettimeofday(&tdot0, 0);
gsl_blas_dgemm(CblasNoTrans, CblasTrans, 1.0, hid, w, 0.0, hidwt); gsl_blas_dgemm(CblasNoTrans, CblasTrans, 1.0, hid, w, 0.0, hidwt);
gettimeofday(&tdot1, 0);
time_of_dot += pytime(&tdot1) - pytime(&tdot0);
for (int i = 0; i < nout; ++i) g_b->data[i] = 0.0; for (int i = 0; i < nout; ++i) g_b->data[i] = 0.0;
err = 0.0; err = 0.0;
...@@ -90,8 +100,11 @@ int main(int argc, char **argv) ...@@ -90,8 +100,11 @@ int main(int argc, char **argv)
if (1) if (1)
{ {
gettimeofday(&tdot0, 0);
gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, g_hidwt, w, 0.0, g_hid); gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, g_hidwt, w, 0.0, g_hid);
gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, g_hidwt, hid, 0.0, g_w); gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, g_hidwt, hid, 0.0, g_w);
gettimeofday(&tdot1, 0);
time_of_dot += pytime(&tdot1) - pytime(&tdot0);
for (int i = 0; i < neg; ++i) for (int i = 0; i < neg; ++i)
...@@ -101,14 +114,19 @@ int main(int argc, char **argv) ...@@ -101,14 +114,19 @@ int main(int argc, char **argv)
a->data[j] -= lr * g_hid->data[i*nhid+j]; a->data[j] -= lr * g_hid->data[i*nhid+j];
} }
gettimeofday(&tdot0, 0);
gsl_blas_dgemm(CblasTrans, CblasNoTrans, -lr, x, g_hid, 1.0, w); gsl_blas_dgemm(CblasTrans, CblasNoTrans, -lr, x, g_hid, 1.0, w);
gettimeofday(&tdot1, 0);
time_of_dot += pytime(&tdot1) - pytime(&tdot0);
for (int i = 0; i < nout*nhid; ++i) w->data[i] -= lr * g_w->data[i]; for (int i = 0; i < nout*nhid; ++i) w->data[i] -= lr * g_w->data[i];
} }
} }
gettimeofday(&tv1, 0); gettimeofday(&tv1, 0);
fprintf(stdout, "took = %lfs to get err %lf\n", pytime(&tv1) - pytime(&tv0), 0.5 * err); double total_time = pytime(&tv1) - pytime(&tv0);
fprintf(stdout, "took = %lfs to get err %lf\n", total_time, 0.5 * err);
fprintf(stdout, "... of which %.2lfs was spent in dgemm (fraction: %.2lf)\n", time_of_dot, time_of_dot / total_time);
//skip freeing //skip freeing
return 0; return 0;
} }
......
...@@ -8,7 +8,15 @@ import theano ...@@ -8,7 +8,15 @@ import theano
import theano.tensor as T import theano.tensor as T
import theano.sandbox import theano.sandbox
import theano.sandbox.wraplinker import theano.sandbox.wraplinker
from theano.compile import module from theano.compile import module, Mode
from theano.sandbox.wraplinker import ProfileMode
from theano import gof, Op, Apply
from theano.tensor import blas, opt
# numpy: aa_numpy.py
# c : aa.cc
if 0: if 0:
class Opt(object): class Opt(object):
...@@ -130,32 +138,29 @@ if 0: ...@@ -130,32 +138,29 @@ if 0:
self.merge(env) self.merge(env)
def linker(print_prog=False): def print_graph_linker(print_prog=True):
if 1: if 1:
print 'wtf?' imap = {None:'-'}
#return theano.gof.OpWiseCLinker() def blah(i, node, thunk):
imap = {None:'-'} imap[node] = str(i)
def blah(i, node, thunk): if print_prog:# and node.op.__class__ is T.DimShuffle:
imap[node] = str(i) if False and node.op == T.DimShuffle((), ['x', 'x'], inplace = True):
if print_prog:# and node.op.__class__ is T.DimShuffle: print node.op == T.DimShuffle((), ['x', 'x'], inplace = True),
if False and node.op == T.DimShuffle((), ['x', 'x'], inplace = True): print node.inputs[0], type(node.inputs[0]),
print node.op == T.DimShuffle((), ['x', 'x'], inplace = True), print node.inputs[0].equals(T.constant(2)),
print node.inputs[0], type(node.inputs[0]), outputs = node.outputs
print node.inputs[0].equals(T.constant(2)), inputs = theano.gof.graph.inputs(outputs)
outputs = node.outputs print 'node ', i, node,
inputs = theano.gof.graph.inputs(outputs) print ':'.join([imap[inp.owner] for inp in node.inputs])
print 'node ', i, node, #print theano.sandbox.pprint.pp.process_graph(inputs, outputs)
print ':'.join([imap[inp.owner] for inp in node.inputs]) return theano.sandbox.wraplinker.WrapLinkerMany(
#print theano.sandbox.pprint.pp.process_graph(inputs, outputs) [theano.gof.OpWiseCLinker()],
[theano.sandbox.wraplinker.run_all
return theano.sandbox.wraplinker.WrapLinkerMany( ,blah
[theano.gof.OpWiseCLinker()], #,theano.sandbox.wraplinker.numpy_notall_isfinite
[theano.sandbox.wraplinker.run_all ])
,blah else:
#,theano.sandbox.wraplinker.numpy_notall_isfinite return theano.gof.OpWiseCLinker()
])
else:
return theano.gof.OpWiseCLinker()
class M(module.Module): class M(module.Module):
...@@ -167,11 +172,14 @@ class M(module.Module): ...@@ -167,11 +172,14 @@ class M(module.Module):
self.a = module.Member(T.vector('a')) # hid bias self.a = module.Member(T.vector('a')) # hid bias
self.b = module.Member(T.vector('b')) # output bias self.b = module.Member(T.vector('b')) # output bias
hid = T.tanh(T.dot(x, self.w) + self.a) self.hid = T.tanh(T.dot(x, self.w) + self.a)
hid = self.hid
out = T.tanh(T.dot(hid, self.w.T) + self.b) self.out = T.tanh(T.dot(hid, self.w.T) + self.b)
out = self.out
err = 0.5 * T.sum((out - x)**2) self.err = 0.5 * T.sum((out - x)**2)
err = self.err
params = [self.w, self.a, self.b] params = [self.w, self.a, self.b]
...@@ -182,7 +190,13 @@ class M(module.Module): ...@@ -182,7 +190,13 @@ class M(module.Module):
self.step = module.Method([x], err, updates=dict(updates)) self.step = module.Method([x], err, updates=dict(updates))
mod = M() mod = M()
m = mod.make(mode='FAST_RUN') mode = 'FAST_RUN'
#mode = ProfileMode(optimizer='fast_run', linker=theano.gof.OpWiseCLinker())
mode = Mode(optimizer='fast_run', linker=theano.gof.OpWiseCLinker(nice_errors=True))
mode = Mode(optimizer='fast_run', linker='c')
mode = Mode(optimizer='fast_run', linker='c|py')
print mod.pretty(mode=mode)
m = mod.make(mode=mode)
neg, nout, nhid, niter = [int(a) for a in sys.argv[1:]] neg, nout, nhid, niter = [int(a) for a in sys.argv[1:]]
rng = numpy.random.RandomState(342) rng = numpy.random.RandomState(342)
...@@ -196,4 +210,10 @@ t = time.time() ...@@ -196,4 +210,10 @@ t = time.time()
for i in xrange(niter): for i in xrange(niter):
err = m.step(x) err = m.step(x)
print 'time: ',time.time() - t, 'err: ', err print 'time: ',time.time() - t, 'err: ', err
try:
mode.print_summary()
pass
except:
pass
#!/usr/bin/env python2.5
from __future__ import absolute_import
import numpy as N
import sys
import time
# c: aa.cc
neg, nout, nhid, niter = [int(a) for a in sys.argv[1:]]
lr = 0.01
rng = N.random.RandomState(342)
w = rng.rand(nout, nhid)
a = rng.randn(nhid) * 0.0
b = rng.randn(nout) * 0.0
x = (rng.rand(neg, nout)-0.5) * 1.5
dot_time = 0.0
t = time.time()
for i in xrange(niter):
tt = time.time()
d = N.dot(x, w)
dot_time += time.time() - tt
hid = N.tanh(d + a)
tt = time.time()
d = N.dot(hid, w.T)
dot_time += time.time() - tt
out = N.tanh(d + b)
g_out = out - x
err = 0.5 * N.sum(g_out**2)
g_hidwt = g_out * (1.0 - out**2)
b -= lr * N.sum(g_hidwt, axis=0)
tt = time.time()
g_hid = N.dot(g_hidwt, w)
dot_time += time.time() - tt
g_hidin = g_hid * (1.0 - hid**2)
tt = time.time()
d = N.dot(g_hidwt.T, hid)
dd = N.dot(x.T, g_hidin)
dot_time += time.time() - tt
gw = (d + dd)
w -= lr * gw
a -= lr * N.sum(g_hidin, axis=0)
total_time = time.time() - t
print 'time: ',total_time, 'err: ', err
print ' of which', dot_time, 'was spent on dot. Fraction:', dot_time / total_time
...@@ -89,8 +89,9 @@ Get the source and run the tests like this: ...@@ -89,8 +89,9 @@ Get the source and run the tests like this:
.. code-block:: bash .. code-block:: bash
hg clone http://pylearn.org/hg/theano theano hg clone http://pylearn.org/hg/theano Theano
cd theano ln -s Theano/theano <someplace on your PYTHONPATH>/theano
cd Theano
nosetests nosetests
To update your library to the latest on pylearn.org, change directory (`cd`) to this `theano` folder and type To update your library to the latest on pylearn.org, change directory (`cd`) to this `theano` folder and type
......
...@@ -664,6 +664,10 @@ class ComponentList(Composite): ...@@ -664,6 +664,10 @@ class ComponentList(Composite):
return self.__class__(*[c.dup() for c in self._components]) return self.__class__(*[c.dup() for c in self._components])
def default_initialize(self, init = {}, **kwinit):
    """
    Store initial values on self, one item assignment per key.

    :param init: mapping from key to initial value. It is only read (it is
        copied by dict(init, ...)), so the shared default dict is never
        mutated.
    :param kwinit: additional key/value pairs; on a key clash they override
        the corresponding entries of init.
    """
    # items() yields the same pairs as iteritems() without relying on the
    # Python-2-only iterator method.
    for key, value in dict(init, **kwinit).items():
        self[key] = value
class ComponentDictInstance(CompositeInstance): class ComponentDictInstance(CompositeInstance):
""" """
ComponentDictInstance is meant to be instantiated by ComponentDict. ComponentDictInstance is meant to be instantiated by ComponentDict.
......
...@@ -23,11 +23,12 @@ from op import \ ...@@ -23,11 +23,12 @@ from op import \
from opt import \ from opt import \
Optimizer, optimizer, SeqOptimizer, \ Optimizer, optimizer, SeqOptimizer, \
MergeOptimizer, MergeOptMerge, \ MergeOptimizer, MergeOptMerge, \
LocalOptimizer, local_optimizer, LocalOptGroup, LocalOpKeyOptGroup, \ LocalOptimizer, local_optimizer, LocalOptGroup, \
OpSub, OpRemove, PatternSub, \ OpSub, OpRemove, PatternSub, \
NavigatorOptimizer, TopoOptimizer, OpKeyOptimizer, EquilibriumOptimizer, \ NavigatorOptimizer, TopoOptimizer, EquilibriumOptimizer, \
keep_going, warn, \ keep_going, warn, \
InplaceOptimizer, PureThenInplaceOptimizer InplaceOptimizer, PureThenInplaceOptimizer
#LocalOpKeyOptGroup, OpKeyOptimizer
from optdb import \ from optdb import \
DB, Query, \ DB, Query, \
......
...@@ -686,7 +686,16 @@ class CLinker(link.Linker): ...@@ -686,7 +686,16 @@ class CLinker(link.Linker):
instantiate.customize.add_support_code(support_code) instantiate.customize.add_support_code(support_code)
instantiate.customize.add_support_code(self.struct_code) instantiate.customize.add_support_code(self.struct_code)
instantiate.customize.add_support_code(static) instantiate.customize.add_support_code(static)
instantiate.customize.add_extra_compile_arg("-w") for extra_arg in (
"-O2",
"-ffast-math",
#"-fprefetch-loop-arrays",
#"-ftree-vect-loop-version",
#"-ftree-loop-optimize",
#"-ftree-vectorize"):
"-w" #-w means supress all warnings
):
instantiate.customize.add_extra_compile_arg(extra_arg)
for arg in self.compile_args(): for arg in self.compile_args():
instantiate.customize.add_extra_compile_arg(arg) instantiate.customize.add_extra_compile_arg(arg)
for header in self.headers(): for header in self.headers():
...@@ -739,6 +748,7 @@ def _execute(cthunk, init_tasks, tasks, error_storage): ...@@ -739,6 +748,7 @@ def _execute(cthunk, init_tasks, tasks, error_storage):
exc_value = exc_type(_exc_value, task) exc_value = exc_type(_exc_value, task)
exc_value.__thunk_trace__ = trace # this can be used to retrieve the location the Op was declared exc_value.__thunk_trace__ = trace # this can be used to retrieve the location the Op was declared
raise exc_type, exc_value, exc_trace raise exc_type, exc_value, exc_trace
execute.cthunk = cthunk
return execute return execute
...@@ -761,9 +771,12 @@ class OpWiseCLinker(link.LocalLinker): ...@@ -761,9 +771,12 @@ class OpWiseCLinker(link.LocalLinker):
__cache__ = {} __cache__ = {}
def __init__(self, fallback_on_perform = True): def __init__(self,
fallback_on_perform = True,
nice_errors = True):
self.env = None self.env = None
self.fallback_on_perform = fallback_on_perform self.fallback_on_perform = fallback_on_perform
self.nice_errors = nice_errors
def accept(self, env, no_recycling = []): def accept(self, env, no_recycling = []):
if self.env is not None and self.env is not env: if self.env is not None and self.env is not env:
...@@ -833,7 +846,9 @@ class OpWiseCLinker(link.LocalLinker): ...@@ -833,7 +846,9 @@ class OpWiseCLinker(link.LocalLinker):
else: else:
no_recycling = [storage_map[r] for r in no_recycling if r not in env.inputs] no_recycling = [storage_map[r] for r in no_recycling if r not in env.inputs]
f = link.streamline(env, thunks, order, no_recycling = no_recycling, profiler = profiler) f = link.streamline(env, thunks, order,
no_recycling = no_recycling,
nice_errors = self.nice_errors)
return f, [link.Container(input, storage) for input, storage in zip(env.inputs, input_storage)], \ return f, [link.Container(input, storage) for input, storage in zip(env.inputs, input_storage)], \
[link.Container(output, storage, True) for output, storage in zip(env.outputs, output_storage)], \ [link.Container(output, storage, True) for output, storage in zip(env.outputs, output_storage)], \
...@@ -841,7 +856,6 @@ class OpWiseCLinker(link.LocalLinker): ...@@ -841,7 +856,6 @@ class OpWiseCLinker(link.LocalLinker):
def _default_checker(x, y): def _default_checker(x, y):
"""WRITEME """WRITEME
Default checker for DualLinker. This checks that the Default checker for DualLinker. This checks that the
......
...@@ -13,6 +13,7 @@ from collections import deque ...@@ -13,6 +13,7 @@ from collections import deque
import utils import utils
_creation_idx = [0]
class Apply(utils.object2): class Apply(utils.object2):
""" """
...@@ -121,6 +122,13 @@ class Apply(utils.object2): ...@@ -121,6 +122,13 @@ class Apply(utils.object2):
def __asapply__(self): def __asapply__(self):
return self return self
def __hash__(self):
    """
    Return the index stored on this instance, assigning one on first use.

    The first call copies the current value of the module-level counter
    _creation_idx[0] onto the instance and then advances the counter;
    later calls return the stored value.
    """
    if hasattr(self, '_creation_idx'):
        return self._creation_idx
    # First call for this instance: claim the current index, then bump the
    # shared counter so the next instance gets a different one.
    self._creation_idx = _creation_idx[0]
    _creation_idx[0] += 1
    return self._creation_idx
def clone(self): def clone(self):
"""Duplicate this Apply instance with inputs = self.inputs. """Duplicate this Apply instance with inputs = self.inputs.
...@@ -567,7 +575,10 @@ def general_toposort(r_out, deps, debug_print = False): ...@@ -567,7 +575,10 @@ def general_toposort(r_out, deps, debug_print = False):
deps(i) should behave like a pure function (no funny business with internal state) deps(i) should behave like a pure function (no funny business with internal state)
:note: :note:
deps(i) can/should be cached by the deps function to be fast deps(i) will be cached by this function (to be fast)
:note:
The order of the return value list is determined by the order of nodes returned by the deps() function.
""" """
deps_cache = {} deps_cache = {}
def _deps(io): def _deps(io):
...@@ -611,8 +622,9 @@ def general_toposort(r_out, deps, debug_print = False): ...@@ -611,8 +622,9 @@ def general_toposort(r_out, deps, debug_print = False):
def io_toposort(i, o, orderings = {}): def io_toposort(i, o, orderings = {}):
"""WRITEME """WRITEME
""" """
#the inputs are used only here in the function that decides what 'predecessors' to explore
iset = set(i) iset = set(i)
def deps(obj): def deps(obj):
rval = [] rval = []
if obj not in iset: if obj not in iset:
if isinstance(obj, Result): if isinstance(obj, Result):
......
...@@ -5,6 +5,7 @@ from type import Type ...@@ -5,6 +5,7 @@ from type import Type
import sys, traceback import sys, traceback
from copy import copy from copy import copy
from cutils import run_cthunk
__excepthook = sys.excepthook __excepthook = sys.excepthook
...@@ -225,9 +226,27 @@ def clear_storage_thunk(stg): ...@@ -225,9 +226,27 @@ def clear_storage_thunk(stg):
thunk.inputs = [stg] thunk.inputs = [stg]
return thunk return thunk
def streamline(env, thunks, order, no_recycling = [], profiler = None): def streamline(env, thunks, order, no_recycling = [], profiler = None, nice_errors = True):
"""WRITEME""" """WRITEME
if profiler is None:
:param env:
:param thunks: the list of program instructions
:param order: the list of apply instances that gave rise to the thunks (same order as thunks)
:param no_recycling: storage elements that cannot be 'recycled' by repeatedly executing the
program. These storage elements are cleared before re-running.
:param profiler: deprecated
:param nice_errors: run in such a way that the double-traceback is printed. This costs a
bit of performance in the inner python loop.
"""
if profiler is not None:
raise NotImplementedError()
if nice_errors:
def f(): def f():
for x in no_recycling: for x in no_recycling:
x[0] = None x[0] = None
...@@ -237,14 +256,13 @@ def streamline(env, thunks, order, no_recycling = [], profiler = None): ...@@ -237,14 +256,13 @@ def streamline(env, thunks, order, no_recycling = [], profiler = None):
except: except:
raise_with_op(node) raise_with_op(node)
else: else:
# don't worry about raise_with_op, just go a little faster.
#there is a mix of python and c thunks
def f(): def f():
for x in no_recycling: for x in no_recycling:
x[0] = None x[0] = None
def g(): for thunk in thunks:
for thunk, node in zip(thunks, order): thunk()
profiler.profile_node(thunk, node)
profiler.profile_env(g, env)
f.profiler = profiler
return f return f
class LocalLinker(Linker): class LocalLinker(Linker):
......
差异被折叠。
...@@ -4,16 +4,31 @@ import opt ...@@ -4,16 +4,31 @@ import opt
class DB(object): class DB(object):
def __hash__(self):
    """
    Return the index stored on this instance, assigning one on first use.

    The first call copies the current value of opt._optimizer_idx[0] onto
    the instance and then advances that counter; later calls return the
    stored value.
    """
    if hasattr(self, '_optimizer_idx'):
        return self._optimizer_idx
    # First call for this instance: claim the current index, then bump the
    # counter shared through the opt module.
    self._optimizer_idx = opt._optimizer_idx[0]
    opt._optimizer_idx[0] += 1
    return self._optimizer_idx
def __init__(self): def __init__(self):
self.__db__ = defaultdict(set) self.__db__ = defaultdict(set)
self._names = set()
def register(self, name, obj, *tags): def register(self, name, obj, *tags):
# N.B. obj is not an instance of class Optimizer.
# It is an instance of a DB. In the tests, for example,
# this is not always the case.
if not isinstance(obj, (DB, opt.Optimizer, opt.LocalOptimizer)):
raise Exception('wtf', obj)
obj.name = name obj.name = name
if name in self.__db__: if name in self.__db__:
raise ValueError('The name of the object cannot be an existing tag or the name of another existing object.', obj, name) raise ValueError('The name of the object cannot be an existing tag or the name of another existing object.', obj, name)
self.__db__[name] = set([obj]) self.__db__[name] = set([obj])
self._names.add(name)
for tag in tags: for tag in tags:
if tag in self._names:
raise ValueError('The tag of the object collides with a name.', obj, tag)
self.__db__[tag].add(obj) self.__db__[tag].add(obj)
def __query__(self, q): def __query__(self, q):
......
if 0:
class _EquilibriumOptimizer(NavigatorOptimizer):
def __init__(self,
local_optimizers,
failure_callback = None,
max_depth = None,
max_use_ratio = None):
super(EquilibriumOptimizer, self).__init__(
None,
ignore_newtrees = False,
failure_callback = failure_callback)
self.local_optimizers = local_optimizers
self.max_depth = max_depth
self.max_use_ratio = max_use_ratio
self.tracks = defaultdict(list)
self.tracks0 = defaultdict(list)
max_depth = 0
for lopt in local_optimizers:
tracks = lopt.tracks()
for track in tracks:
max_depth = max(max_depth, len(track))
if self.max_depth is not None and max_depth > self.max_depth:
raise ValueError('One of the local optimizers exceeds the maximal depth.')
for i, op in enumerate(track):
if i == 0:
self.tracks0[op].append((track, i, lopt))
self.tracks[op].append((track, i, lopt))
def fetch_tracks(self, op):
return self.tracks[op] + self.tracks[None]
def fetch_tracks0(self, op):
return self.tracks0[op] + self.tracks0[None]
def backtrack(self, node, tasks):
candidates = self.fetch_tracks(node.op)
tracks = []
def filter(node, depth):
new_candidates = []
for candidate in candidates:
track, i, lopt = candidate
if i < depth:
pass
elif track[i-depth] in (None, node.op):
if i == depth:
tasks[node].append(lopt)
else:
tracks.append(candidate)
else:
new_candidates.append(candidate)
return new_candidates
depth = 0
nodes = [node]
while candidates:
for node in nodes:
candidates = filter(node, depth)
depth += 1
_nodes = nodes
nodes = reduce(list.__iadd__,
[reduce(list.__iadd__,
[[n for n, i in out.clients if not isinstance(n, str)] for out in node.outputs],
[]) for node in nodes],
[])
candidates = tracks
tracks = []
def apply(self, env):
tasks = defaultdict(list)
if self.max_use_ratio is not None:
max_uses = self.max_use_ratio * len(env.nodes)
runs = defaultdict(int)
else:
runs = None
def importer(node):
#print 'IMPORTING', node
self.backtrack(node, tasks)
def pruner(node):
try:
del tasks[node]
except KeyError:
pass
def chin(node, i, r, new_r):
if new_r.owner and not r.clients:
self.backtrack(new_r.owner, tasks)
# # == NOT IDEAL == #
# for node in env.nodes:
# importer(node)
for node in env.toposort():
tasks[node].extend(lopt for track, i, lopt in self.fetch_tracks0(node.op))
u = self.attach_updater(env, importer, pruner, chin)
print 'KEYS', map(hash, tasks.keys())
while tasks:
for node in tasks.iterkeys():
todo = tasks.pop(node)
break
for lopt in todo:
if runs is not None and runs[lopt] >= max_uses:
print >>sys.stderr, 'Warning: optimization exceeded its maximal use ratio: %s, %s' % (lopt, max_uses)
continue
success = self.process_node(env, node, lopt)
if success:
if runs is not None: runs[lopt] += 1
break
self.detach_updater(env, u)
# def match(self, node, candidates):
# candidates[:] = [candidate
# for candidate in candidates
# if candidate.current.op is None or candidate.current.op == node.op]
# for candidate in candidates:
# if candidate.current.inputs is not None:
# for in1, in2 in zip(candidate.current.inputs, node.inputs):
# if isinstance(in1, str):
# candidate.match[in1] = in2
# for client in node.clients:
# op = node.op
# patterns = self.pattern_base[(depth, op)].union(self.pattern_base[(depth, WILDCARD)])
# if not patterns:
# return patterns
# return self.match(node, depth + 1).intersection(patterns)
# def backtrack(self, node, q):
# for node2, i in node.clients:
# op2 = node2.op
...@@ -375,7 +375,7 @@ class TestEquilibrium(object): ...@@ -375,7 +375,7 @@ class TestEquilibrium(object):
x, y, z = map(MyResult, 'xyz') x, y, z = map(MyResult, 'xyz')
e = op3(op4(x, y)) e = op3(op4(x, y))
g = Env([x, y, z], [e]) g = Env([x, y, z], [e])
print g print 'before', g
sys.stderr = sys.stdout # display pesky warnings along with stdout sys.stderr = sys.stdout # display pesky warnings along with stdout
opt = EquilibriumOptimizer( opt = EquilibriumOptimizer(
[PatternSub((op1, 'x', 'y'), (op2, 'x', 'y')), [PatternSub((op1, 'x', 'y'), (op2, 'x', 'y')),
...@@ -384,7 +384,7 @@ class TestEquilibrium(object): ...@@ -384,7 +384,7 @@ class TestEquilibrium(object):
], ],
max_use_ratio = 1. / len(g.nodes)) # each opt can only be applied once max_use_ratio = 1. / len(g.nodes)) # each opt can only be applied once
opt.optimize(g) opt.optimize(g)
print g print 'after', g
assert str(g) == '[Op4(x, y)]' assert str(g) == '[Op4(x, y)]'
......
from theano.gof.optdb import *
from unittest import TestCase
class Test_DB(TestCase):
def test_0(self):
class Opt(opt.Optimizer): #inheritance buys __hash__
name = 'blah'
db = DB()
db.register('a', Opt())
db.register('b', Opt())
db.register('c', Opt(), 'z', 'asdf')
try:
db.register('c', Opt()) #name taken
self.fail()
except ValueError, e:
if e[0].startswith("The name"):
pass
else:
raise
except:
self.fail()
try:
db.register('z', Opt()) #name collides with tag
self.fail()
except ValueError, e:
if e[0].startswith("The name"):
pass
else:
raise
except:
self.fail()
try:
db.register('u', Opt(), 'b') #name new but tag collides with name
self.fail()
except ValueError, e:
if e[0].startswith("The tag"):
pass
else:
raise
except:
self.fail()
...@@ -2,6 +2,7 @@ import gof #, gof.result ...@@ -2,6 +2,7 @@ import gof #, gof.result
import numpy #for numeric_grad import numpy #for numeric_grad
from gof.python25 import all from gof.python25 import all
import gof.utils
_msg_retType = 'op.grad(...) returned a non-list' _msg_retType = 'op.grad(...) returned a non-list'
_msg_badlen = 'op.grad(...) returned wrong number of gradients' _msg_badlen = 'op.grad(...) returned wrong number of gradients'
...@@ -55,17 +56,17 @@ def grad_sources_inputs(sources, graph_inputs): ...@@ -55,17 +56,17 @@ def grad_sources_inputs(sources, graph_inputs):
else: else:
gmap[r] = g_r gmap[r] = g_r
graph_outputs = gmap.keys() graph_outputs = gof.utils.uniq([r for r,g in sources])
if graph_inputs is None: if graph_inputs is None:
graph_inputs = gof.graph.inputs(graph_outputs) graph_inputs = gof.graph.inputs(graph_outputs)
for node in gof.graph.io_toposort(graph_inputs, graph_outputs).__reversed__(): for node in gof.graph.io_toposort(graph_inputs, graph_outputs).__reversed__():
g_outputs = [gmap.get(o,None) for o in node.outputs] g_outputs = [gmap.get(o,None) for o in node.outputs]
#if all output gradients are None, continue #if all output gradients are None, continue
if all(map(lambda x:x is None, g_outputs)): continue if all(map(lambda x:x is None, g_outputs)): continue
output_arg = g_outputs output_arg = g_outputs
input_arg = node.inputs input_arg = node.inputs
......
...@@ -235,17 +235,27 @@ class PPrinter: ...@@ -235,17 +235,27 @@ class PPrinter:
else: else:
raise TypeError('Not enough arguments to call.') raise TypeError('Not enough arguments to call.')
use_ascii = True
if use_ascii:
special = dict(middle_dot = u"\u00B7", special = dict(middle_dot = "\dot",
big_sigma = u"\u03A3") big_sigma = "\Sigma")
greek = dict(alpha = u"\u03B1", greek = dict(alpha = "\alpha",
beta = u"\u03B2", beta = "\beta",
gamma = u"\u03B3", gamma = "\gamma",
delta = u"\u03B4", delta = "\delta",
epsilon = u"\u03B5") epsilon = "\epsilon")
else:
special = dict(middle_dot = u"\u00B7",
big_sigma = u"\u03A3")
greek = dict(alpha = u"\u03B1",
beta = u"\u03B2",
gamma = u"\u03B3",
delta = u"\u03B4",
epsilon = u"\u03B5")
pprint = PPrinter() pprint = PPrinter()
......
...@@ -2,6 +2,7 @@ from __future__ import absolute_import ...@@ -2,6 +2,7 @@ from __future__ import absolute_import
import time import time
import numpy import numpy
from ..gof.cutils import run_cthunk
from ..gof.link import WrapLinker from ..gof.link import WrapLinker
from ..compile.mode import Mode from ..compile.mode import Mode
...@@ -103,49 +104,82 @@ def DualLinker(linkers): ...@@ -103,49 +104,82 @@ def DualLinker(linkers):
class ProfileMode(Mode): class ProfileMode(Mode):
def __init__(self, local_linker, optimizer=None): def __init__(self, linker, optimizer=None):
local_time = [0.0] local_time = [0.0]
apply_time = {} apply_time = {}
op_time = {} op_time = {}
op_cimpl = {}
def blah(i, node, *thunks): def blah(i, node, *thunks):
t0 = time.time() if 0:
for th in thunks: t0 = time.time()
th() for th in thunks:
dt = time.time() - t0 th()
dt = time.time() - t0
elif 0: #more precise timing
for th in thunks:
t0 = time.time()
th()
dt = time.time() - t0
elif 1:
for th in thunks:
if hasattr(th, 'cthunk'):
t0 = time.time()
run_cthunk(th.cthunk)
dt = time.time() - t0
else:
t0 = time.time()
th()
dt = time.time() - t0
elif 1:
pass
else:
raise Exception('one of the cases has to run the thunks!')
local_time[0] += dt local_time[0] += dt
apply_time[(i,node.op)] = apply_time.get((i,node.op), 0.0) + dt apply_time[(i,node.op)] = apply_time.get((i,node.op), 0.0) + dt
op_time[node.op] = op_time.get(node.op, 0.0) + dt op_time[node.op] = op_time.get(node.op, 0.0) + dt
op_cimpl[node.op] = hasattr(thunks[0], 'cthunk')
self.local_time = local_time self.local_time = local_time
self.apply_time = apply_time self.apply_time = apply_time
self.op_time = op_time self.op_time = op_time
self.op_cimpl = op_cimpl
linker = WrapLinkerMany([local_linker], [blah]) wrap_linker = WrapLinkerMany([linker], [blah])
if optimizer: if optimizer:
Mode.__init__(self, linker, optimizer) super(ProfileMode, self).__init__(wrap_linker, optimizer)
else: else:
Mode.__init__(self, linker) super(ProfileMode, self).__init__(wrap_linker)
def print_summary(self): def print_summary(self):
local_time = self.local_time[0] local_time = self.local_time[0]
apply_time = self.apply_time apply_time = self.apply_time
op_time = self.op_time op_time = self.op_time
print 'local_time', local_time print ''
print 'apply-wise times' print 'ProfileMode.print_summary()'
print '---------------------------'
print ''
print 'local_time', local_time, '(Time spent running thunks)'
print 'Apply-wise summary: <fraction of local_time spent at this position> (<Apply position>, <Apply Op name>)'
atimes = [(t/local_time, (a[0], str(a[1]))) for a, t in apply_time.items()] atimes = [(t/local_time, (a[0], str(a[1]))) for a, t in apply_time.items()]
atimes.sort() atimes.sort()
atimes.reverse() atimes.reverse()
for t,a in atimes[:15]: for t,a in atimes[:15]:
print ' ', t, a print '\t%.3f\t%i\t%s' % (t, a[0], a[1])
print ' ...' #show that we are ignoring applies that don't take much time print ' ... (remaining %i Apply instances account for %.2f of the runtime)'\
print 'op-wise times' %(max(0, len(atimes)-15), sum(t for t, a in atimes[15:]))
otimes = [(t/local_time, a) for a, t in op_time.items()]
n_ops_to_print = 20
print 'Op-wise summary: <fraction of local_time spent on this kind of Op> <Op name>'
otimes = [(t/local_time, a, self.op_cimpl[a]) for a, t in op_time.items()]
otimes.sort() otimes.sort()
otimes.reverse() otimes.reverse()
for t,a in otimes[:15]: for t,a,ci in otimes[:n_ops_to_print]:
print ' ', t, a print '\t%.3f\t%s %s' % (t, '*' if ci else ' ', a)
print ' ...' #show that we are ignoring applies that don't take much time print ' ... (remaining %i Ops account for %.2f of the runtime)'\
print sum(t for a,t in op_time.items()) %(max(0, len(otimes)-n_ops_to_print), sum(t for t, a, ci in
otimes[n_ops_to_print:]))
print '(*) Op is running a c implementation'
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
from basic import * from basic import *
import opt import opt
import blas
import raw_random import raw_random
from raw_random import \ from raw_random import \
......
差异被折叠。
差异被折叠。
差异被折叠。
...@@ -103,16 +103,18 @@ class DimShuffle(Op): ...@@ -103,16 +103,18 @@ class DimShuffle(Op):
for i, b in enumerate(input_broadcastable): for i, b in enumerate(input_broadcastable):
if i not in new_order: if i not in new_order:
# we want to drop this dimension because it's not a value in new_order # we want to drop this dimension because it's not a value in new_order
if b == 1: if b == 1: # 1 aka True
self.drop.append(i) self.drop.append(i)
else: else:
# we cannot drop non-broadcastable dimensions # we cannot drop non-broadcastable dimensions
raise NotImplementedError("You cannot drop a non-broadcastable dimension.") raise ValueError("You cannot drop a non-broadcastable dimension.")
else: else:
i2j[i] = j i2j[i] = j
j += 1 j += 1
# transposition of non-broadcastable dimensions # transposition of non-broadcastable dimensions
# This is how the dimensions will be permuted, without accounting for the extra
# 'x' broadcastable dimensions to insert.
self.shuffle = [i2j[x] for x in new_order if x != 'x'] self.shuffle = [i2j[x] for x in new_order if x != 'x']
# list of dimensions of the output that are broadcastable and were not in the original input # list of dimensions of the output that are broadcastable and were not in the original input
...@@ -144,7 +146,8 @@ class DimShuffle(Op): ...@@ -144,7 +146,8 @@ class DimShuffle(Op):
and self.input_broadcastable == other.input_broadcastable and self.input_broadcastable == other.input_broadcastable
def __hash__(self): def __hash__(self):
return hash(self.inplace) ^ hash(self.new_order) ^ hash(self.input_broadcastable) return hash(type(self)) ^ hash(self.inplace) \
^ hash(self.new_order) ^ hash(self.input_broadcastable)
def __str__(self): def __str__(self):
if self.inplace: if self.inplace:
...@@ -175,13 +178,78 @@ class DimShuffle(Op): ...@@ -175,13 +178,78 @@ class DimShuffle(Op):
storage[0] = res storage[0] = res
def c_code(self, node, name, inputs, outputs, sub):
    """Return the C source that builds the dimshuffled output array.

    The generated code checks the rank of the input, releases any previous
    output, obtains a ``base`` array (the input itself when ``self.inplace``
    is set, otherwise a copy made with ``PyArray_FromAny``) and wraps
    ``base``'s data in a new array whose dimensions and strides are permuted
    according to ``self.new_order``.  Entries equal to ``'x'`` in
    ``self.new_order`` become dimensions of length 1 with stride 0.

    :param node: unused here.
    :param name: unused here.
    :param inputs: one-element sequence holding the C name of the input.
    :param outputs: one-element sequence holding the C name of the output.
    :param sub: substitution dict; must provide the ``fail`` snippet.
    :returns: the C code as a string.
    """
    # Unpack in the body instead of in the signature: tuple parameters are
    # positional-only anyway, so callers are unaffected.
    (input_name,) = inputs
    (res_name,) = outputs

    def statements(lst):
        return ';\n'.join(lst) + ';'

    nd_in = len(self.input_broadcastable)
    nd_out = len(self.new_order)

    check_input_nd = [('if (%(input)s->nd != ' + str(nd_in) + ')'
                       '{PyErr_SetString(PyExc_NotImplementedError, "input nd"); %(fail)s;}')]

    clear_output = ['if (%(res)s) {Py_XDECREF(%(res)s);}']

    # Obtain the array whose memory the output will view.  This opens a C
    # scope that is closed by the final '}' of alloc_output.
    # NOTE(review): neither `base` nor the result of PyArray_New is checked
    # for NULL in the generated code - confirm whether a check is wanted.
    if self.inplace:
        get_base = ['{ PyArrayObject * base = %(input)s',
                    'Py_INCREF((PyObject*)base)']
    else:
        get_base = [('{ PyArrayObject * base = (PyArrayObject*)PyArray_FromAny((PyObject*)%(input)s, NULL,'
                     '0, 0, NPY_ALIGNED|NPY_ENSURECOPY, NULL)')]

    # Dimensions and strides must describe `base`, not the input: when a
    # copy was made, the copy's memory layout need not match the input's.
    shape_statements = ['npy_intp dimensions[%i]' % nd_out]
    strides_statements = ['npy_intp strides[%i]' % nd_out]
    for i, o in enumerate(self.new_order):
        if o != 'x':
            shape_statements.append('dimensions[%i] = base->dimensions[%s]' % (i, o))
            strides_statements.append('strides[%i] = base->strides[%s]' % (i, o))
        else:
            shape_statements.append('dimensions[%i] = 1' % i)
            strides_statements.append('strides[%i] = 0' % i)

    alloc_output = [('%(res)s = (PyArrayObject*)PyArray_New(&PyArray_Type, '
                     '' + str(nd_out) + ', dimensions, '
                     'PyArray_TYPE(base), strides, '
                     'base->data, base->descr->elsize, '
                     'PyArray_FLAGS(base), NULL)'),
                    # The flags were copied from `base`; recompute the
                    # contiguity/alignment flags for the permuted strides.
                    'PyArray_UpdateFlags(%(res)s, NPY_UPDATE_ALL)',
                    '%(res)s->base = (PyObject*)base',
                    '}']

    full_code = statements(check_input_nd
                           + clear_output
                           + get_base
                           + shape_statements
                           + strides_statements
                           + alloc_output)

    # Entries of `sub` take precedence over the two names supplied here.
    return full_code % dict({'input': input_name, 'res': res_name}, **sub)
def grad(self, (x, ), (gz, )): def grad(self, (x, ), (gz, )):
gz = as_tensor(gz) gz = as_tensor(gz)
grad_order = ['x'] * len(x.type.broadcastable) grad_order = ['x'] * len(x.type.broadcastable)
for i, v in enumerate(self.new_order): for i, v in enumerate(self.new_order):
if v != 'x': if v != 'x':
grad_order[v] = i grad_order[v] = i
return DimShuffle(gz.type.broadcastable, grad_order)(gz), return [DimShuffle(gz.type.broadcastable, grad_order, inplace=True)(Elemwise(scalar.identity)(gz))]
......
from basic import _scal_elemwise, _transpose_inplace from .basic import _scal_elemwise #, _transpose_inplace
from .. import scalar as scal from .. import scalar as scal
import elemwise import elemwise
from .. import printing from .. import printing
...@@ -183,9 +183,11 @@ pprint.assign(div_inplace, printing.OperatorPrinter('/=', -1, 'left')) ...@@ -183,9 +183,11 @@ pprint.assign(div_inplace, printing.OperatorPrinter('/=', -1, 'left'))
pprint.assign(pow_inplace, printing.OperatorPrinter('**=', 1, 'right')) pprint.assign(pow_inplace, printing.OperatorPrinter('**=', 1, 'right'))
transpose_inplace = _transpose_inplace def transpose_inplace(x, **kwargs):
"""WRITEME""" """Perform a transpose on a tensor without copying the underlying storage"""
dims = range(x.ndim-1, -1, -1)
return elemwise.DimShuffle(x.broadcastable, dims, inplace=True)(x)
pprint.assign(transpose_inplace, printing.MemberPrinter('T')) #pprint.assign(transpose_inplace, printing.MemberPrinter('T'))
...@@ -203,6 +203,7 @@ class SoftmaxWithBias(gof.Op): ...@@ -203,6 +203,7 @@ class SoftmaxWithBias(gof.Op):
for (j = 0; j < Nx[1]; ++j) for (j = 0; j < Nx[1]; ++j)
{ {
double row_ij = x_i[j * Sx] + b_i[j * Sb]; double row_ij = x_i[j * Sx] + b_i[j * Sb];
// std::cout << "1" << row_ij << "\\n";
row_max_j = (row_ij > row_max) ? j : row_max_j; row_max_j = (row_ij > row_max) ? j : row_max_j;
row_max = (row_ij > row_max) ? row_ij : row_max; row_max = (row_ij > row_max) ? row_ij : row_max;
} }
...@@ -210,13 +211,23 @@ class SoftmaxWithBias(gof.Op): ...@@ -210,13 +211,23 @@ class SoftmaxWithBias(gof.Op):
for (j = 0; j < Nx[1]; ++j) for (j = 0; j < Nx[1]; ++j)
{ {
double row_ij = x_i[j * Sx] + b_i[j * Sb]; double row_ij = x_i[j * Sx] + b_i[j * Sb];
// std::cout << "2" << row_ij << "\\n";
double sm_ij = exp(row_ij - row_max); double sm_ij = exp(row_ij - row_max);
// std::cout << "3" << sm_ij << "\\n";
sum += sm_ij; sum += sm_ij;
sm_i[j * Ssm] = sm_ij; sm_i[j * Ssm] = sm_ij;
} }
if ( (0.0 == sum) || (std::isinf(sum))) if (std::isinf(sum))
{ {
//that was our best... //that was our best...
PyErr_SetString(PyExc_ValueError, "softmax is impossible (inf)!");
%(fail)s;
}
if (0.0 == sum)
{
//that was our best...
PyErr_SetString(PyExc_ValueError, "softmax is impossible (zero)!");
%(fail)s; %(fail)s;
} }
...@@ -600,6 +611,7 @@ class CrossentropySoftmax1HotWithBiasDx (gof.Op): ...@@ -600,6 +611,7 @@ class CrossentropySoftmax1HotWithBiasDx (gof.Op):
} }
if (y_i >= %(dx)s->dimensions[1]) if (y_i >= %(dx)s->dimensions[1])
{ {
PyErr_SetString(PyExc_ValueError, "y_i >= dx dimensions[1]");
%(fail)s; %(fail)s;
} }
dx_i[y_i * Sdx] -= dnll_i; dx_i[y_i * Sdx] -= dnll_i;
......
"""Tensor optimizations addressing the ops in basic.py
"""
# TODO: intelligent merge for mul/add # TODO: intelligent merge for mul/add
# TODO: 0*x -> 0 # TODO: 0*x -> 0
...@@ -30,28 +31,6 @@ def in2out(*local_opts, **kwargs): ...@@ -30,28 +31,6 @@ def in2out(*local_opts, **kwargs):
**kwargs) **kwargs)
# gemm: (d,a,b,c,s) -> d = d*s + a*dot(b,c)
# Transforms d -= a * dot(b, c) into gemm(d, -a, b, c, 1.0)
# The matched graph is sub(d, mul(DimShuffle(a), dot(b, c))), where 'a' is a
# 0-d input expanded by an inplace DimShuffle with new_order ['x', 'x'].
# The DimShuffle sub-pattern is allowed to have multiple clients; the
# pattern as a whole is not.
gemm_pattern_1 = gof.PatternSub((T.sub,
                                 'd',
                                 (T.mul,
                                  dict(pattern = (T.DimShuffle((), ['x', 'x'], inplace = True), 'a'),
                                       allow_multiple_clients = True),
                                  (T.dot, 'b', 'c'))),
                                (T.gemm, 'd', (T.neg, 'a'), 'b', 'c', T.constant(1.0)),
                                allow_multiple_clients = False)
# gemm: (d,a,b,c,s) -> d = d*s + a*dot(b,c)
# Transforms dot(a, b) into gemm(zeros(2)(hstack(shape(a)[:1], shape(b)[1:])), 1.0, a, b, 1.0)
# The construction of the 'gemm' node may fail if, for example, a and b are not both matrices.
# The zero-filled first argument is built from the first entry of shape(a)
# stacked with the second entry of shape(b).
dot_to_gemm = gof.PatternSub((T.dot, 'a', 'b'),
                             (T.gemm, (T.Zeros(2),
                                       (T.stack,
                                        (T.Subtensor([slice(0, 1)]), (T.shape, 'a')),
                                        (T.Subtensor([slice(1, 2)]), (T.shape, 'b')))),
                              T.constant(1.0), 'a', 'b', T.constant(1.0)),
                             allow_multiple_clients = False)
def _insert_inplace_optimizer(env): def _insert_inplace_optimizer(env):
""" """
...@@ -91,12 +70,6 @@ def _insert_inplace_optimizer(env): ...@@ -91,12 +70,6 @@ def _insert_inplace_optimizer(env):
break break
insert_inplace_optimizer = gof.optimizer(_insert_inplace_optimizer) insert_inplace_optimizer = gof.optimizer(_insert_inplace_optimizer)
# Sequence the gemm rewrite (applied through out2in) with the generic
# inplace-insertion pass, wrapped in an InplaceOptimizer.  gof.warn is
# passed as the failure callback.
inplace_optimizer = gof.InplaceOptimizer(
    gof.SeqOptimizer(out2in(gemm_pattern_1),
                     insert_inplace_optimizer,
                     failure_callback = gof.warn))
# Register the sequence in the optimizer database as 'inplace_opt' with the
# tags 'fast_run' and 'inplace'.
# NOTE(review): 99 is presumably the ordering position within the database - confirm.
compile.optdb.register('inplace_opt', inplace_optimizer, 99, 'fast_run', 'inplace')
def register_canonicalize(lopt, *tags, **kwargs): def register_canonicalize(lopt, *tags, **kwargs):
name = (kwargs and kwargs.pop('name')) or lopt.__name__ name = (kwargs and kwargs.pop('name')) or lopt.__name__
...@@ -216,6 +189,13 @@ register_canonicalize(local_shape_lift_dot) ...@@ -216,6 +189,13 @@ register_canonicalize(local_shape_lift_dot)
################ ################
def encompasses_broadcastable(b1, b2): def encompasses_broadcastable(b1, b2):
"""
Returns True if the broadcastable patterns b1 and b2 are such that b2 is
broadcasted to b1's shape and not the opposite.
:param b1: the broadcastable attribute of a tensor type
:param b2: the broadcastable attribute of a tensor type
"""
if len(b1) < len(b2): if len(b1) < len(b2):
return False return False
b1 = b1[-len(b2):] b1 = b1[-len(b2):]
...@@ -330,6 +310,7 @@ def local_fill_cut(node): ...@@ -330,6 +310,7 @@ def local_fill_cut(node):
register_canonicalize(local_fill_cut) register_canonicalize(local_fill_cut)
register_canonicalize(gof.OpRemove(T.tensor_copy), name='remove_tensor_copy' )
@gof.local_optimizer([None, T.fill]) @gof.local_optimizer([None, T.fill])
def local_fill_sink(node): def local_fill_sink(node):
...@@ -524,9 +505,30 @@ class Canonizer(gof.LocalOptimizer): ...@@ -524,9 +505,30 @@ class Canonizer(gof.LocalOptimizer):
return False return False
new = self.merge_num_denum(num, denum) new = self.merge_num_denum(num, denum)
if new.type != out.type: if new.dtype != out.dtype:
#new = T.fill(out, new) #new = T.fill(out, new)
new = T.fill(out, T.Elemwise(scalar.Identity(scalar.specific_out(getattr(scalar, out.type.dtype))))(new)) elem_op = T.Elemwise(scalar.Identity(scalar.specific_out(getattr(scalar, out.type.dtype))))
new = T.fill(out, elem_op(new))
if new.broadcastable != out.broadcastable:
#this case is tricky... we need to provide exactly the same kind of broadcastable
#pattern, but only if legal...
dlen = len(new.broadcastable) - len(out.broadcastable)
if dlen > 0:
#try to take the leading ranks of new.broadcastable, which should be broadcastable
# ranks
#if this means skipping over nonbroadcastable ranks, then DimShuffle will fail
dimshuffle_op = T.DimShuffle(new.broadcastable,
range(dlen, len(new.broadcastable)))
new = dimshuffle_op(new)
elif dlen < 0:
#we have to boost up a scalar or something
dimshuffle_op = T.DimShuffle(new.broadcastable,
['x' for x in range(-dlen)] + range(0, len(new.broadcastable)))
new = dimshuffle_op(new)
# if our if's above worked, this should be true. OTW investigate.
assert new.type == out.type
return [new] return [new]
def __str__(self): def __str__(self):
...@@ -550,6 +552,7 @@ def local_neg_to_mul(node): ...@@ -550,6 +552,7 @@ def local_neg_to_mul(node):
return [-1 * node.inputs[0]] return [-1 * node.inputs[0]]
else: else:
return False return False
register_canonicalize(local_neg_to_mul)
@gof.local_optimizer([T.mul]) @gof.local_optimizer([T.mul])
def local_mul_to_neg(node): def local_mul_to_neg(node):
...@@ -557,6 +560,7 @@ def local_mul_to_neg(node): ...@@ -557,6 +560,7 @@ def local_mul_to_neg(node):
return [-local_mul_canonizer.merge_num_denum(node.inputs[1:], [])] return [-local_mul_canonizer.merge_num_denum(node.inputs[1:], [])]
else: else:
return False return False
register_specialize(local_mul_to_neg)
@gof.local_optimizer([T.div]) @gof.local_optimizer([T.div])
def local_div_to_inv(node): def local_div_to_inv(node):
...@@ -564,10 +568,120 @@ def local_div_to_inv(node): ...@@ -564,10 +568,120 @@ def local_div_to_inv(node):
return [T.inv(local_mul_canonizer.merge_num_denum(node.inputs[1:], []))] return [T.inv(local_mul_canonizer.merge_num_denum(node.inputs[1:], []))]
else: else:
return False return False
register_canonicalize(local_neg_to_mul)
register_specialize(local_mul_to_neg)
register_specialize(local_div_to_inv) register_specialize(local_div_to_inv)
@gof.local_optimizer([T.inv])
def local_inv_canon(node):
    """Canonicalize ``inv(x)`` as ``pow(x, -1.0)``.

    Returns a one-element list holding the replacement when ``node`` applies
    ``T.inv``, and False otherwise.
    """
    is_inv = (node.op == T.inv)
    return [T.pow(node.inputs[0], -1.0)] if is_inv else False
register_canonicalize(local_inv_canon)
@gof.local_optimizer([T.pow])
def local_pow_canonicalize(node):
    """Canonicalize ``pow(x, 1)`` and ``pow(x, 0)``.

    ``pow(x, 1)`` becomes ``fill(y, x)`` and ``pow(x, 0)`` becomes
    ``fill(x, fill(y, 1.0))``, where ``y`` is the exponent input.

    Returns a one-element list holding the replacement, or False when the
    node is not a ``T.pow`` or its exponent is neither constant 1 nor
    constant 0.
    """
    # `not (a == b)` rather than `a != b`: only equality is relied upon.
    if not (node.op == T.pow):
        return False
    base = node.inputs[0]
    exponent = node.inputs[1]
    # Look the constant up once instead of once per comparison.
    exponent_value = local_mul_canonizer.get_constant(exponent)
    if N.all(exponent_value == 1.0):
        return [T.fill(exponent, base)]
    if N.all(exponent_value == 0.0):
        #extra fills here are to make sure the size of the output stays constant.
        return [T.fill(base, T.fill(exponent, 1.0))]
    # Explicitly report "no change" instead of falling off the end.
    return False
register_canonicalize(local_pow_canonicalize)
@gof.local_optimizer([T.pow])
def local_pow_specialize(node):
    """Replace ``pow(x, y)`` by a cheaper expression for special constant ``y``.

    Handled exponents: 2 -> sqr(x), 1 -> x, 0 -> fill(x, 1.0), 0.5 -> sqrt(x),
    -0.5 -> inv(sqrt(x)), -1 -> inv(x), -2 -> inv(sqr(x)).

    The rewrite is only attempted when ``y`` is a constant and
    ``encompasses_broadcastable`` accepts x's broadcastable pattern against
    y's.

    Returns a one-element list holding the replacement, or False when nothing
    applies.
    """
    #here, we are past the point of canonicalization, so we don't want to put in un-necessary fills.
    if not (node.op == T.pow):
        return False
    #the idea here is that we have pow(x, y)
    xsym = node.inputs[0]
    ysym = node.inputs[1]
    y = local_mul_canonizer.get_constant(ysym)
    if y is None:
        return False
    if not encompasses_broadcastable(xsym.type.broadcastable, ysym.type.broadcastable):
        return False
    # Ordered (exponent, builder) pairs; the first matching exponent wins.
    # Builders are only invoked on a match, so no unused nodes are created.
    rewrites = (
        (2.0, lambda x: T.sqr(x)),
        (1.0, lambda x: x),
        (0.0, lambda x: T.fill(x, 1.0)),
        (0.5, lambda x: T.sqrt(x)),
        (-0.5, lambda x: T.inv(T.sqrt(x))),
        (-1.0, lambda x: T.inv(x)),
        (-2.0, lambda x: T.inv(T.sqr(x))),
    )
    for exponent, build in rewrites:
        if N.all(y == exponent):
            return [build(xsym)]
    # Explicitly report "no change" instead of falling off the end.
    return False
register_specialize(local_pow_specialize)
@gof.local_optimizer([T.mul])
def local_mul_specialize(node):
    """Remove constant factors 1 and -1 from a ``mul`` node.

    Factors equal to 1 are dropped, and each factor equal to -1 toggles the
    sign of the result.  If a constant 0 factor is found, that factor itself
    is returned as the result.

    Returns a one-element list holding the replacement, or False when the
    node is not a ``T.mul`` or no factor could be removed.
    """
    #here, we are past the point of canonicalization, so we don't want to put in un-necessary fills.
    if not (node.op == T.mul):
        return False
    #the idea here is that we have mul(x, y, ...)
    neg = False
    new_inputs = []
    for factor in node.inputs:
        y = local_mul_canonizer.get_constant(factor)
        if N.all(y == 1.0):
            continue
        elif N.all(y == -1.0):
            neg ^= True #toggles
        elif N.all(y == 0.0):
            # NOTE(review): the zero factor is returned as-is, so the result
            # carries that factor's own type - confirm this is acceptable
            # when the other factors are wider or of another dtype.
            return [factor]
        else:
            new_inputs.append(factor)

    if len(new_inputs) == len(node.inputs):
        # Nothing was removed: explicitly report "no change".
        return False

    if len(new_inputs) == 0:
        # Every factor was 1 or -1, so the product is exactly +1 or -1
        # depending on the parity of the -1 factors.  (Re-using the value of
        # the last constant seen would give the wrong sign, e.g. for
        # mul(1, -1) or mul(-1, -1).)
        out_dtype = node.outputs[0].type.dtype
        newval = N.asarray(-1 if neg else 1, dtype=out_dtype)
        return [T.TensorConstant(T.Tensor(dtype=out_dtype,
            broadcastable = [True] * node.outputs[0].ndim), newval)]
    if len(new_inputs) == 1:
        return [-new_inputs[0]] if neg else new_inputs
    product = T.mul(*new_inputs)
    return [-product] if neg else [product]
register_specialize(local_mul_specialize)
# Disabled block: nothing below is executed because of the constant-false
# guard.
if 0: #TODO: replace this with a c version of any InplaceDimShuffle
    class _TransposeInplace(T.Op):
        # Output 0 is declared as a view of input 0.
        view_map = {0: [0]}

        def make_node(self, input):
            # The output keeps the input dtype and takes the input's
            # broadcastable pattern in reverse order.
            return T.Apply(self, [input],
                           [T.tensor(dtype = input.type.dtype,
                                     broadcastable = reversed(input.type.broadcastable))])

        def perform(self, node, (x, ), (z, )):
            # Python implementation: store the transposed view of x.
            z[0] = x.T

        def c_code(self, node, name, (x, ), (z, ), sub):
            # C implementation: release any previous output and replace it
            # with the result of PyArray_Transpose.
            return """
PyArrayObject* transposed = (PyArrayObject*)PyArray_Transpose(%(x)s, NULL);
if (%(z)s) {
Py_XDECREF(%(z)s);
}
%(z)s = transposed;
""" % locals()

        def __str__(self):
            return "_TransposeInplace"

    _transpose_inplace = _TransposeInplace()

    # Replaces an inplace DimShuffle that swaps the two axes of a matrix by
    # the dedicated _transpose_inplace op.
    @gof.local_optimizer([T.DimShuffle([False,False],[1,0],inplace=True)])
    def local_dimshuffle_transposeinplace(node):
        if node.op == T.DimShuffle([False,False],[1,0],inplace=True):
            return [_transpose_inplace(node.inputs[0])]
        return False
    register_specialize(local_dimshuffle_transposeinplace)
register_canonicalize(local_mul_canonizer, name = 'local_mul_canonizer') register_canonicalize(local_mul_canonizer, name = 'local_mul_canonizer')
...@@ -724,8 +838,10 @@ def constant_folding(node): ...@@ -724,8 +838,10 @@ def constant_folding(node):
register_canonicalize(constant_folding) register_canonicalize(constant_folding)
# Inplace DimShuffle that swaps the two axes of a matrix (neither axis
# broadcastable).
inplace_matrix_transpose = T.DimShuffle([False,False], [1,0], inplace=True)
# Rewrites transpose(dot(x, y)) as dot(transpose(y), transpose(x)).
local_transposed_dot = gof.PatternSub((inplace_matrix_transpose, (T.dot, 'x', 'y')),
                                      (T.dot, (inplace_matrix_transpose, 'y'), (inplace_matrix_transpose, 'x')))
register_canonicalize(local_transposed_dot, name='local_transposed_dot')
# def _math_optimizer(): # def _math_optimizer():
......
...@@ -662,56 +662,6 @@ class T_max_and_argmax(unittest.TestCase): ...@@ -662,56 +662,6 @@ class T_max_and_argmax(unittest.TestCase):
self.failUnless(i.shape == (2,3)) self.failUnless(i.shape == (2,3))
class T_transpose(unittest.TestCase):
    """Checks on transpose / transpose_inplace for inputs of rank 0 to 3."""

    def _eval_transposed(self, source, transposed):
        # Check which op produced `transposed`, then compile it as a function
        # of `source` and evaluate it on source's own data.
        self.failUnless(transposed.owner.op == inplace.transpose_inplace)
        compiled = function([source], transposed)
        return compiled(source.data)

    def test0(self):
        source = as_tensor(numpy.ones(()))
        result = self._eval_transposed(source, transpose(source))
        self.failUnless(result.shape == source.data.shape)

        #test aliasing
        result += 55.0
        self.failUnless(source.data == 1.0)

    def test1(self):
        source = as_tensor(numpy.ones(5))
        result = self._eval_transposed(source, transpose(source))
        self.failUnless(result.shape == source.data.shape)

        #test aliasing
        result += 55.0
        self.failUnless(source.data[0] == 1.0)

    def test2(self):
        source = as_tensor(numpy.ones((5,3)))
        result = self._eval_transposed(source, transpose(source))
        self.failUnless(result.shape == (3,5))

        #test aliasing
        result += 55.0
        self.failUnless(source.data[0,0] == 1.0)

    def test3(self):
        """Test transpose of tensor, inplace version"""
        source = as_tensor(numpy.ones((5,3,2)))
        result = self._eval_transposed(source, inplace.transpose_inplace(source))
        self.failUnless(result.shape == (2,3,5))

        #test aliasing
        result += 55.0
        self.failUnless(source.data[0,0,0] == 56.0)

    def test_grad(self):
        for point in (numpy.random.rand(2, 3), numpy.ones(3)):
            verify_grad(self, inplace.transpose_inplace, [point])
class T_subtensor(unittest.TestCase): class T_subtensor(unittest.TestCase):
def setUp(self): def setUp(self):
Subtensor.debug = False Subtensor.debug = False
...@@ -1406,179 +1356,6 @@ class t_dot(unittest.TestCase): ...@@ -1406,179 +1356,6 @@ class t_dot(unittest.TestCase):
#verify_grad(self, dot, [self.rand(), self.rand(2)]) #verify_grad(self, dot, [self.rand(), self.rand(2)])
#verify_grad(self, dot, [self.rand(), self.rand(2,5)]) #verify_grad(self, dot, [self.rand(), self.rand(2,5)])
# Tests for the gemm op: results are compared against a numpy reference, and
# the error cases (rank, aliasing of the destination) are checked.
class t_gemm(unittest.TestCase):
    def setUp(self):
        # Fixed seed, and debug switches reset before every test.
        numpy.random.seed(44)
        _approx_eq.debug = 0
        Gemm.debug = False

    @staticmethod
    def _gemm(z,a,x,y,b):
        # numpy reference: b * z + a * dot(x, y), with a and b 0-d arrays.
        assert a.shape == ()
        assert b.shape == ()
        return b * z + a * numpy.dot(x,y)

    @staticmethod
    def rand(*args):
        return numpy.random.rand(*args)

    def cmp(self, z, a, x, y, b):
        # Run the same comparison with the 'c|py', 'c' and 'py' linkers, each
        # time on a fresh copy of z.
        def cmp_linker(z, a, x, y, b, l):
            z,a,x,y,b = [numpy.asarray(p) for p in z,a,x,y,b]
            z_orig = z.copy()
            tz,ta,tx,ty,tb = [as_tensor(p).type() for p in z,a,x,y,b]

            f = function([tz,ta,tx,ty,tb], gemm(tz,ta,tx,ty,tb), mode=compile.Mode(optimizer = None, linker = l))
            new_z = f(z,a,x,y,b)
            # Reference result computed from the saved copy of z.
            z_after = self._gemm(z_orig, a, x, y, b)

            # The returned array must be the very object that was passed in.
            self.failUnless(z is new_z)
            #print z_orig, z_after, z, type(z_orig), type(z_after), type(z)
            #_approx_eq.debug = 1
            self.failUnless(_approx_eq(z_after, z))
            if a == 0.0 and b == 1.0:
                # With these coefficients the reference result equals z_orig.
                return
            else:
                self.failIf(numpy.all(z_orig == z))

        cmp_linker(copy(z), a, x, y, b, 'c|py')
        cmp_linker(copy(z), a, x, y, b, 'c')
        cmp_linker(copy(z), a, x, y, b, 'py')

    # test0a, test0 and test2 pass only if a ValueError carrying Gemm.E_rank
    # is raised.
    def test0a(self):
        Gemm.debug = True
        try:
            g = gemm([1.], 1., [1.], [1.], 1.)
        except ValueError, e:
            if e[0] is Gemm.E_rank:
                return
        self.fail()

    def test0(self):
        try:
            self.cmp(1., 0., 1.0, 1.0, 1.0)
        except ValueError, e:
            if e[0] is Gemm.E_rank:
                return
        self.fail()

    def test2(self):
        try:
            self.cmp(2., 1.0, [3,2,1.], [[1],[2],[3.]], 1.0)
        except ValueError, e:
            self.failUnless(e[0] == Gemm.E_rank)
            return
        self.fail()

    # test4 .. test12: (3,4) destination with (3,5) x (5,4) operands, for
    # combinations of a in {1, 0, -1} and b in {0, 1, -1, 0.6}.
    def test4(self):
        self.cmp(self.rand(3,4), 1.0, self.rand(3,5), self.rand(5,4), 0.0)
    def test5(self): self.cmp(self.rand(3,4), 1.0,
            self.rand(3,5), self.rand(5,4), 1.0)
    def test6(self): self.cmp(self.rand(3,4), 1.0,
            self.rand(3,5), self.rand(5,4), -1.0)
    def test7(self): self.cmp(self.rand(3,4), 0.0,
            self.rand(3,5), self.rand(5,4), 0.0)
    def test8(self): self.cmp(self.rand(3,4), 0.0,
            self.rand(3,5), self.rand(5,4), 0.6)
    def test9(self): self.cmp(self.rand(3,4), 0.0,
            self.rand(3,5), self.rand(5,4), -1.0)
    def test10(self):
        _approx_eq.debug = 1
        self.cmp(self.rand(3,4), -1.0, self.rand(3,5), self.rand(5,4), 0.0)
    def test11(self): self.cmp(self.rand(3,4), -1.0,
            self.rand(3,5), self.rand(5,4), 1.0)
    def test12(self): self.cmp(self.rand(3,4), -1.0,
            self.rand(3,5), self.rand(5,4), -1.0)

    # test_destroy_map0 .. 3 pass only if building the gemm raises a
    # ValueError carrying Gemm.E_z_uniq.
    def test_destroy_map0(self):
        """test that only first input can be overwritten"""
        Z = as_tensor(self.rand(2,2))
        try:
            gemm(Z, 1.0, Z, Z, 1.0)
        except ValueError, e:
            if e[0] == Gemm.E_z_uniq:
                return
        self.fail()
    def test_destroy_map1(self):
        """test that only first input can be overwritten"""
        Z = as_tensor(self.rand(2,2))
        A = as_tensor(self.rand(2,2))
        try:
            gemm(Z, 1.0, A, inplace.transpose_inplace(Z), 1.0)
        except ValueError, e:
            if e[0] == Gemm.E_z_uniq:
                return
        self.fail()
    def test_destroy_map2(self):
        """test that only first input can be overwritten"""
        Z = as_tensor(self.rand(2,2))
        A = as_tensor(self.rand(2,2))
        try:
            gemm(Z, 1.0, inplace.transpose_inplace(Z), A, 1.0)
        except ValueError, e:
            if e[0] == Gemm.E_z_uniq:
                return
        self.fail()
    def test_destroy_map3(self):
        """test that only first input can be overwritten"""
        Z = as_tensor(self.rand(2,2))
        A = as_tensor(self.rand(2,2))
        try:
            gemm(Z, 1.0, Z, A, 1.0)
        except ValueError, e:
            if e[0] == Gemm.E_z_uniq:
                return
        self.fail()

    def test_destroy_map4(self):
        """test that dot args can be aliased"""
        Z = value(self.rand(2,2))
        A = value(self.rand(2,2))
        eval_outputs([gemm(Z, 1.0, A, A, 1.0)])
        eval_outputs([gemm(Z, 1.0, A, A.T, 1.0)])

    def test_transposes(self):
        # three square matrices which are not contiguous
        A = self.rand(4,5)[:,:4]
        B = self.rand(4,5)[:,:4]
        C = self.rand(4,5)[:,:4]

        def t(z,x,y,a=1.0, b=0.0,l='c|py',dt='float64'):
            z,a,x,y,b = [numpy.asarray(p,dtype=dt) for p in z,a,x,y,b]
            z_orig = z.copy()
            # Reference result, computed before z is overwritten by f.
            z_after = self._gemm(z, a, x, y, b)
            tz,ta,tx,ty,tb = [value(p) for p in z,a,x,y,b]

            f = function([tz,ta,tx,ty,tb], gemm(tz,ta,tx,ty,tb), mode = compile.Mode(optimizer = None, linker=l))
            f(z, a, x, y, b)
            self.failUnless(_approx_eq(z_after, z), (z_orig, z_after, z, z_after - z))
            # Same computation expressed on the transposed operands; z is
            # expected to hold the same values afterwards.
            f(z.T, a, y.T, x.T, b)
            self.failUnless(_approx_eq(z_after, z))

        t(C,A,B)
        t(C.T, A, B)
        t(C, A.T, B, dt='float32')
        t(C, A, B.T)
        t(C.T, A.T, B)
        t(C, A.T, B.T, dt='float32')
        t(C.T, A, B.T)
        t(C.T, A.T, B.T, dt='float32')

        t(C, A[:,:2], B[:2, :])
        t(C.T, A[:,:2], B[:2, :], dt='float32')
        t(C, A[:2,:].T, B[:2, :])
        t(C.T, A[:2,:].T, B[:2, :], dt='float32')
        t(C, A[:2,:].T, B[:, :2].T)
        t(C.T, A[:2,:].T, B[:, :2].T)

        # The last combination must raise a ValueError whose message
        # mentions 'aligned'.
        try:
            t(C.T, A[:2,:], B[:, :2].T)
        except ValueError, e:
            if e[0].find('aligned') >= 0:
                return
        self.fail()
class T_tensorfromscalar(unittest.TestCase): class T_tensorfromscalar(unittest.TestCase):
def test0(self): def test0(self):
s = scal.constant(56) s = scal.constant(56)
......
差异被折叠。
差异被折叠。
差异被折叠。
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论