提交 a1197aac authored 作者: James Bergstra's avatar James Bergstra

Changes in CLazyLinker

- It can update shared variables internally, so that it does not need `Function.__call__` to give correct answers.
- It can loop without returning to Python to implement scan (though this is not much faster than a `for i in xrange(...): f.fn()` type thing).
- It is versioned now, so that it is recompiled when necessary.
- The scan speed test is modified to show these things.
上级 c22f3d76
......@@ -629,10 +629,11 @@ class Function(object):
# WARNING: This circumvents the 'readonly' attribute in x
o_container.storage[0] = None
# Update the inputs that have an update function
for input, storage in reversed(zip(self.maker.expanded_inputs, self.input_storage)):
if input.update is not None:
storage.data = outputs.pop()
if getattr(self.fn, 'need_update_inputs', True):
# Update the inputs that have an update function
for input, storage in reversed(zip(self.maker.expanded_inputs, self.input_storage)):
if input.update is not None:
storage.data = outputs.pop()
# Put default values back in the storage
for i, (required, refeed, value) in enumerate(self.defaults):
......@@ -995,6 +996,9 @@ class FunctionMaker(object):
else:
self.linker = linker.accept(env)
#hacky thing so VMLinker
self.linker.expanded_inputs = expanded_inputs
self.indices = indices
self.inputs = inputs
self.expanded_inputs = expanded_inputs
......
......@@ -35,6 +35,37 @@ static double pytime(const struct timeval * tv)
return (double) tv->tv_sec + (double) tv->tv_usec / 1000000.0;
}
/**
Helper routine to convert a PyList of integers to a c array of integers.
*/
/**
  Helper routine to convert a PyList of integers to a C array of Py_ssize_t.

  On success, *dst points to a freshly malloc'd array of *len elements and 0
  is returned; the caller owns the buffer and must free() it.  On failure a
  Python exception is set, *dst is left NULL, *len is reset to 0, and -1 is
  returned.  `kwname` is the keyword-argument name used in error messages.
 */
static int unpack_list_of_ssize_t(PyObject * pylist, Py_ssize_t **dst, Py_ssize_t *len,
                                  const char* kwname)
{
  Py_ssize_t buflen, *buf;
  if (!PyList_Check(pylist))
    {
      PyErr_Format(PyExc_TypeError, "%s must be list", kwname);
      return -1;
    }
  assert (NULL == *dst);
  *len = buflen = PyList_Size(pylist);
  /* malloc(0) may legally return NULL, so allocate at least one element;
     then a NULL return always means out-of-memory.  Report that as a real
     Python error instead of assert(), which vanishes under NDEBUG. */
  *dst = buf = (Py_ssize_t*)malloc(((buflen > 0) ? buflen : 1) * sizeof(Py_ssize_t));
  if (NULL == buf)
    {
      *len = 0;
      PyErr_NoMemory();
      return -1;
    }
  for (Py_ssize_t ii = 0; ii < buflen; ++ii)
    {
      /* Borrowed reference; cannot be NULL since 0 <= ii < buflen. */
      PyObject * el_i = PyList_GetItem(pylist, ii);
      Py_ssize_t n_i = PyNumber_AsSsize_t(el_i, PyExc_IndexError);
      if (PyErr_Occurred())
        {
          free(buf);
          /* keep (*dst, *len) consistent for the caller's error path */
          *dst = NULL;
          *len = 0;
          return -1;
        }
      buf[ii] = n_i;
    }
  return 0;
}
/**
CLazyLinker
......@@ -52,6 +83,7 @@ typedef struct {
int n_vars; // number of variables in the graph
int * var_computed; // 1 or 0 for every variable
PyObject ** var_computed_cells;
PyObject ** var_value_cells;
Py_ssize_t n_output_vars;
Py_ssize_t * output_vars; // variables that *must* be evaluated by call
......@@ -69,13 +101,15 @@ typedef struct {
Py_ssize_t * node_n_prereqs;
Py_ssize_t ** node_prereqs;
Py_ssize_t * update_storage; // dst0, src0, dst1, src1, ... cells to switch after a call
Py_ssize_t n_updates;
void ** thunk_cptr_fn;
void ** thunk_cptr_data;
PyObject * call_times;
PyObject * call_counts;
int do_timing;
int need_update_inputs;
int position_of_error; // -1 for no error, otw the index into `thunks` that failed.
} CLazyLinker;
......@@ -89,6 +123,8 @@ CLazyLinker_dealloc(PyObject* _self)
free(self->is_lazy);
free(self->update_storage);
if (self->node_n_prereqs)
{
for (int i = 0; i < self->n_applies; ++i)
......@@ -112,9 +148,11 @@ CLazyLinker_dealloc(PyObject* _self)
for (int i = 0; i < self->n_vars; ++i)
{
Py_DECREF(self->var_computed_cells[i]);
Py_DECREF(self->var_value_cells[i]);
}
}
free(self->var_computed_cells);
free(self->var_value_cells);
free(self->output_vars);
Py_XDECREF(self->nodes);
......@@ -140,6 +178,7 @@ CLazyLinker_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
self->n_vars = 0;
self->var_computed = NULL;
self->var_computed_cells = NULL;
self->var_value_cells = NULL;
self->n_output_vars = 0;
self->output_vars = NULL;
......@@ -157,12 +196,16 @@ CLazyLinker_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
self->node_prereqs = NULL;
self->node_n_prereqs = NULL;
self->update_storage = NULL;
self->n_updates = 0;
self->thunk_cptr_data = NULL;
self->thunk_cptr_fn = NULL;
self->call_times = NULL;
self->call_counts = NULL;
self->do_timing = 0;
self->need_update_inputs = 0;
self->position_of_error = -1;
}
return (PyObject *)self;
......@@ -179,6 +222,7 @@ CLazyLinker_init(CLazyLinker *self, PyObject *args, PyObject *kwds)
(char*)"call_counts",
(char*)"call_times",
(char*)"compute_map_list",
(char*)"storage_map_list",
(char*)"base_input_output_list",
(char*)"node_n_inputs",
(char*)"node_n_outputs",
......@@ -189,9 +233,11 @@ CLazyLinker_init(CLazyLinker *self, PyObject *args, PyObject *kwds)
(char*)"output_vars",
(char*)"node_prereqs",
(char*)"node_output_size",
(char*)"update_storage",
NULL};
PyObject *compute_map_list=NULL,
*storage_map_list=NULL,
*base_input_output_list=NULL,
*node_n_inputs=NULL,
*node_n_outputs=NULL,
......@@ -201,10 +247,11 @@ CLazyLinker_init(CLazyLinker *self, PyObject *args, PyObject *kwds)
*is_lazy=NULL,
*output_vars=NULL,
*node_prereqs=NULL,
*node_output_size=NULL;
*node_output_size=NULL,
*update_storage=NULL;
assert(!self->nodes);
if (! PyArg_ParseTupleAndKeywords(args, kwds, "OOOiOOOOOOOOOOOOO", kwlist,
if (! PyArg_ParseTupleAndKeywords(args, kwds, "OOOiOOOOOOOOOOOOOOO", kwlist,
&self->nodes,
&self->thunks,
&self->pre_call_clear,
......@@ -212,6 +259,7 @@ CLazyLinker_init(CLazyLinker *self, PyObject *args, PyObject *kwds)
&self->call_counts,
&self->call_times,
&compute_map_list,
&storage_map_list,
&base_input_output_list,
&node_n_inputs,
&node_n_outputs,
......@@ -221,7 +269,8 @@ CLazyLinker_init(CLazyLinker *self, PyObject *args, PyObject *kwds)
&is_lazy,
&output_vars,
&node_prereqs,
&node_output_size
&node_output_size,
&update_storage
))
return -1;
Py_INCREF(self->nodes);
......@@ -361,6 +410,7 @@ CLazyLinker_init(CLazyLinker *self, PyObject *args, PyObject *kwds)
self->var_has_owner = (int*)malloc(self->n_vars*sizeof(int));
self->var_computed = (int*)malloc(self->n_vars*sizeof(int));
self->var_computed_cells = (PyObject**)malloc(self->n_vars*sizeof(PyObject*));
self->var_value_cells = (PyObject**)malloc(self->n_vars*sizeof(PyObject*));
for (int i = 0; i < self->n_vars; ++i)
{
PyObject * el_i = PyList_GetItem(var_owner, i);
......@@ -378,6 +428,8 @@ CLazyLinker_init(CLazyLinker *self, PyObject *args, PyObject *kwds)
}
self->var_computed_cells[i] = PyList_GetItem(compute_map_list, i);
Py_INCREF(self->var_computed_cells[i]);
self->var_value_cells[i] = PyList_GetItem(storage_map_list, i);
Py_INCREF(self->var_value_cells[i]);
}
}
else
......@@ -386,26 +438,18 @@ CLazyLinker_init(CLazyLinker *self, PyObject *args, PyObject *kwds)
return -1;
}
//output vars
if (PyList_Check(output_vars))
if (unpack_list_of_ssize_t(output_vars, &self->output_vars, &self->n_output_vars,
"output_vars"))
return -1;
for (int i = 0; i < self->n_output_vars; ++i)
{
self->n_output_vars = PyList_Size(output_vars);
self->output_vars = (Py_ssize_t*)malloc(self->n_output_vars*sizeof(Py_ssize_t));
assert(self->output_vars);
for (int i = 0; i < self->n_output_vars; ++i)
{
PyObject * el_i = PyList_GetItem(output_vars, i);
Py_ssize_t N = PyNumber_AsSsize_t(el_i, PyExc_IndexError);
if (PyErr_Occurred()) return -1;
assert (N <= self->n_vars);
self->output_vars[i] = N;
}
}
else
{
PyErr_SetString(PyExc_TypeError, "output_vars must be list");
return -1;
assert(self->output_vars[i] < self->n_vars);
}
if (unpack_list_of_ssize_t(update_storage, &self->update_storage, &self->n_updates,
"updates_storage"))
return -1;
assert((self->n_updates % 2) == 0);
self->n_updates /= 2;
return 0;
}
static void set_position_of_error(CLazyLinker * self, int owner_idx)
......@@ -719,6 +763,17 @@ CLazyLinker_call(PyObject *_self, PyObject *args, PyObject *kwds)
{
err = lazy_rec_eval(self, self->output_vars[i], one, zero);
}
for (int i = 0; i < self->n_updates; ++i)
{
Py_ssize_t dst = self->update_storage[2*i];
Py_ssize_t src = self->update_storage[2*i+1];
PyObject* tmp = PyList_GetItem(self->var_value_cells[src], 0);
Py_INCREF(Py_None);
Py_INCREF(tmp);
PyList_SetItem(self->var_value_cells[dst], 0, tmp);
PyList_SetItem(self->var_value_cells[src], 0, Py_None);
}
}
Py_DECREF(one);
Py_DECREF(zero);
......@@ -749,6 +804,8 @@ static PyMemberDef CLazyLinker_members[] = {
(char*)"position of failed thunk"},
{(char*)"time_thunks", T_INT, offsetof(CLazyLinker, do_timing), 0,
(char*)"bool: nonzero means call will time thunks"},
{(char*)"need_update_inputs", T_INT, offsetof(CLazyLinker, need_update_inputs), 0,
(char*)"bool: nonzero means Function.__call__ must implement update mechanism"},
{NULL} /* Sentinel */
};
......@@ -794,8 +851,15 @@ static PyTypeObject lazylinker_ext_CLazyLinkerType = {
CLazyLinker_new, /* tp_new */
};
/* Return this extension's version as a Python float.  It must match the
   `version` constant in the Python wrapper module, which recompiles the
   extension whenever the two disagree. */
static PyObject * get_version(PyObject *dummy, PyObject *args)
{
  return PyFloat_FromDouble(0.1);
}
static PyMethodDef lazylinker_ext_methods[] = {
{NULL} /* Sentinel */
{"get_version", get_version, METH_VARARGS, "Get extension version."},
{NULL, NULL, 0, NULL} /* Sentinel */
};
#ifndef PyMODINIT_FUNC /* declarations for DLL import/export */
......
......@@ -10,8 +10,16 @@ from theano.gof import cmodule
if config.compiledir not in sys.path:
sys.path.append(config.compiledir)
version = 0.1 # must match constant returned in function get_version()
try:
import lazylinker_ext
try:
import lazylinker_ext.lazylinker_ext
get_version = lazylinker_ext.lazylinker_ext.get_version
except:
get_version = lambda: None
if version != get_version():
raise ImportError()
except ImportError:
get_lock()
try:
......@@ -19,7 +27,15 @@ except ImportError:
# waiting for the lock?
try:
import lazylinker_ext
try:
import lazylinker_ext.lazylinker_ext
get_version = lazylinker_ext.lazylinker_ext.get_version
except:
get_version = lambda: None
if version != get_version():
raise ImportError()
except ImportError:
print "COMPILING NEW CVM"
dirname = 'lazylinker_ext'
cfile = os.path.join(theano.__path__[0], 'gof', 'lazylinker_c.c')
code = open(cfile).read()
......@@ -27,8 +43,10 @@ except ImportError:
if not os.path.exists(loc):
os.mkdir(loc)
cmodule.gcc_module_compile_str(dirname, code, location=loc)
print "NEW VERSION", lazylinker_ext.lazylinker_ext.get_version()
finally:
# Release lock on compilation directory.
release_lock()
from lazylinker_ext.lazylinker_ext import *
assert version == get_version()
......@@ -50,6 +50,10 @@ class VM(object):
of runtime spent on thunks[i] in the course of computations performed by
call_with_timers().
need_update_inputs - bool. True indicates that Function.__call__ must
implement the feedback from output storage to input storage. False means
it *must not* repeat that feedback.
"""
def __init__(self, nodes, thunks, pre_call_clear):
"""
......@@ -70,6 +74,7 @@ class VM(object):
self.call_counts = [0]*len(nodes)
self.call_times = [0]*len(nodes)
self.time_thunks = False
self.need_update_inputs = True
def __call__(self):
"""
......@@ -395,7 +400,8 @@ class VM_Linker(link.LocalLinker):
input_storage, output_storage, storage_map,
post_thunk_clear,
computed,
compute_map
compute_map,
updated_vars
):
pre_call_clear = [storage_map[v] for v in self.no_recycling]
......@@ -467,6 +473,12 @@ class VM_Linker(link.LocalLinker):
prereq_var_idxs.sort() # TODO: why sort?
node_prereqs.append(prereq_var_idxs)
update_storage = []
for (ivar, ovar) in updated_vars.items():
if ivar != ovar:
update_storage.append(vars_idx[ivar]) #dst
update_storage.append(vars_idx[ovar]) #src
c0 = sys.getrefcount(node_n_inputs)
vm = CVM(
nodes,
......@@ -476,6 +488,7 @@ class VM_Linker(link.LocalLinker):
call_counts=[0]*len(nodes),
call_times=[0.0]*len(nodes),
compute_map_list=compute_map_list,
storage_map_list=storage_map_list,
base_input_output_list=base_input_output_list,
node_n_inputs=node_n_inputs,
node_n_outputs=node_n_outputs,
......@@ -486,6 +499,7 @@ class VM_Linker(link.LocalLinker):
output_vars=output_vars,
node_prereqs=node_prereqs,
node_output_size=node_output_size,
update_storage=update_storage,
)
assert c0 == sys.getrefcount(node_n_inputs)
else:
......@@ -510,7 +524,10 @@ class VM_Linker(link.LocalLinker):
)
return vm
def make_all(self, profiler = None, input_storage = None, output_storage = None):
def make_all(self, profiler = None, input_storage = None,
output_storage = None,
):
expanded_inputs=self.expanded_inputs # hacky argumentpassing workaround
env = self.env
order = list(env.toposort())
no_recycling = self.no_recycling
......@@ -541,12 +558,24 @@ class VM_Linker(link.LocalLinker):
else:
post_thunk_clear = None
# calculate the update_storage map whose keys are shared var inputs
# and whose values are the outputs that hold their updates
updated_vars = {}
if expanded_inputs:
# Update the inputs that have an update function
potential_values = list(env.outputs)
assert len(expanded_inputs)==len(env.inputs)
for e_input, ivar in reversed(zip(expanded_inputs, env.inputs)):
if e_input.update is not None:
updated_vars[ivar] = potential_values.pop()
vm = self.make_vm(order, thunks,
input_storage, output_storage, storage_map,
post_thunk_clear,
computed,
compute_map
compute_map,
updated_vars
)
return (vm,
......
......@@ -2151,7 +2151,7 @@ def test_speed():
t1 = time.time()
print 'python with builtin iterator', t1 - t0
if 0:
if 1:
r = numpy.arange(10000).astype(theano.config.floatX).reshape(1000,10)
s_r = tensor.matrix()
s_y, updates = theano.scan(fn=lambda ri, rii:ri+rii,
......@@ -2188,27 +2188,50 @@ def test_speed():
print shared_r.get_value()
def test_speed_rnn():
import theano.scalar.sharedvar
print """Warning: the updates version runs slower than python because by
default the blas optimizations don't replace dot with dot22. Why is that?"""
L = 10000
N = 50
N=20
numpy.random.seed(2523452)
r = numpy.arange(1000*N).astype(theano.config.floatX).reshape(1000,N)
r = numpy.arange(L*N).astype(theano.config.floatX).reshape(L,N)
w = numpy.random.randn(N,N).astype(theano.config.floatX)
t0 = time.time()
for i in xrange(1,1000):
r[i] = numpy.tanh(numpy.dot(r[i-1], w))
for i in xrange(1,L):
r[i] += numpy.tanh(numpy.dot(r[i-1], w))
t1 = time.time()
print 'python', t1 - t0
if 1:
r = numpy.arange(1000*N).astype(theano.config.floatX).reshape(1000,N)
r = numpy.arange(L*N).astype(theano.config.floatX).reshape(L,N)
s_r = tensor.matrix()
s_y, updates = theano.scan(fn=lambda ri, rii:ri+tensor.tanh(tensor.dot(rii, w)),
sequences=[s_r[1:]],
outputs_info=tensor.constant(r[0]))
assert not updates
f = theano.function([s_r], s_y)
t2 = time.time()
f(r)
t3 = time.time()
print 'theano1', t3 - t2
if 1:
r = numpy.arange(L*N).astype(theano.config.floatX).reshape(L,N)
s_w = theano.shared(w)
shared_r = theano.shared(r)
s_i = tensor.shared(numpy.array(1))
s_rinc = tensor.set_subtensor(
s_i = theano.scalar.sharedvar.shared(1)
s_rinc = tensor.inc_subtensor(
shared_r[s_i],
theano.tensor.tanh(theano.dot(shared_r[s_i-1], s_w)),
theano.tensor.tanh(
theano.tensor.dot(
shared_r[s_i-1],
w)),
tolerate_inplace_aliasing=True)
f = theano.function([], [],
updates={
......@@ -2216,15 +2239,14 @@ def test_speed_rnn():
shared_r: s_rinc,
})
theano.printing.debugprint(f )
f._check_for_aliased_inputs = False
f_fn = f.fn
print type(f_fn)
#print help(f_fn)
t2 = time.time()
f_fn(n_calls=998)
#for i in xrange(998):
#f_fn()
#f()
if 1:
f_fn(n_calls=L-2)
elif 0:
for i in xrange(L-2): f_fn()
else:
for i in xrange(L-2): f()
f() #999 to update the profiling timers
t3 = time.time()
print 'theano2', t3 - t2
......
......@@ -37,6 +37,10 @@ def tensor_constructor(value, name=None, strict=False, allow_downcast=None, borr
# TensorSharedVariable brings in the tensor operators, is not ideal, but works
# as long as we dont do purely scalar-scalar operations
# _tensor_py_operators is first to have its version of __{gt,ge,lt,le}__
#
# N.B. THERE IS ANOTHER CLASS CALLED ScalarSharedVariable in the
# theano.scalar.sharedvar file. It is not registered as a shared_constructor,
# this one is.
class ScalarSharedVariable(_tensor_py_operators, SharedVariable):
    """Shared variable for scalar values, built on the tensor machinery.

    ``_tensor_py_operators`` is listed first in the MRO so that its
    ``__gt__``/``__ge__``/``__lt__``/``__le__`` take precedence over
    ``SharedVariable``'s.

    N.B. a distinct class with the same name exists in
    ``theano.scalar.sharedvar``; that one is *not* registered as a
    shared_constructor — this one is.
    """
    pass
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论