Commit 9433d5d2 authored by James Bergstra

vm linker

Parent 2677e15f
......@@ -331,6 +331,7 @@ class Function(object):
self.unpack_single = unpack_single
self.return_none = return_none
self.maker = maker
self.profile = None # reassigned in FunctionMaker.create
# We will be popping stuff off this `containers` object. It is a copy.
containers = list(self.input_storage)
......
......@@ -4,7 +4,9 @@ import os, logging
import numpy, theano
from theano import gof
from theano.configparser import config, AddConfigVar, StrParam
import theano.gof.vm
from theano.configparser import config, AddConfigVar, StrParam, EnumStr
_logger = logging.getLogger('theano.compile.mode')
......@@ -55,7 +57,11 @@ predefined_linkers = {
'c' : gof.CLinker(),
'c|py' : gof.OpWiseCLinker(allow_gc=True),
'c|py_nogc' : gof.OpWiseCLinker(allow_gc=False),
'c&py' : gof.DualLinker(checker = check_equal)
'c&py' : gof.DualLinker(checker = check_equal),
'vm' : gof.vm.VM_Linker(allow_gc=True, use_cloop=False),
'cvm' : gof.vm.VM_Linker(allow_gc=True, use_cloop=True),
'vm_nogc' : gof.vm.VM_Linker(allow_gc=False, use_cloop=False),
'cvm_nogc': gof.vm.VM_Linker(allow_gc=False, use_cloop=True),
}
......@@ -249,6 +255,7 @@ class Mode(object):
self._optimizer = optimizer
self.call_time = 0
self.fn_time = 0
linker.mode = self #TODO: WHY IS THIS HERE?
self.optimizer_time = 0
self.linker_time = 0
......@@ -290,15 +297,27 @@ class Mode(object):
FAST_COMPILE = Mode('py', 'fast_compile')
FAST_RUN = Mode('c|py', 'fast_run')
FAST_RUN_NOGC = Mode("c|py_nogc", 'fast_run')
SANITY_CHECK = [Mode('c|py', None),
Mode('c|py', 'fast_run')]
STABILIZE = Mode("c|py", OPT_STABILIZE)
predefined_modes = {'FAST_COMPILE': FAST_COMPILE,
'FAST_RUN': FAST_RUN,
'FAST_RUN_NOGC':FAST_RUN_NOGC,
'SANITY_CHECK': SANITY_CHECK,
'STABILIZE': STABILIZE}
'STABILIZE': STABILIZE,
'VM':Mode('vm', 'fast_run'),
'VM_NOGC':Mode('vm_nogc', 'fast_run'),
'CVM':Mode('cvm', 'fast_run'),
'CVM_NOGC':Mode('cvm_nogc', 'fast_run'),
}
#Don't add FAST_RUN_NOGC to this list(as well as other ALL CAPS short cut)
#The way to get FAST_RUN_NOGC is with the flag 'linker=c|py_nogc'
#The old all capital letter way of working is deprecated as it is not scalable.
AddConfigVar('mode',
"Default compilation mode",
EnumStr(*(predefined_modes.keys() + [
'Mode','DEBUG_MODE', 'PROFILE_MODE'])),
in_c_key=False)
instanciated_default_mode=None
def get_mode(orig_string):
......@@ -329,7 +348,7 @@ def get_mode(orig_string):
ret = DebugMode(optimizer=config.optimizer)
else:
# The import is needed in case string is ProfileMode
from profilemode import ProfileMode
from profilemode import ProfileMode,prof_mode_instance_to_print
ret = eval(string+'(linker=config.linker, optimizer=config.optimizer)')
elif predefined_modes.has_key(string):
ret = predefined_modes[string]
......@@ -349,7 +368,6 @@ def get_mode(orig_string):
#must tell python to print the summary at the end.
if string == 'ProfileMode':
#need to import later to break circular dependency.
from profilemode import prof_mode_instance_to_print
prof_mode_instance_to_print.append(ret)
return ret
......@@ -365,3 +383,4 @@ def register_mode(name, mode):
if name in predefined_modes:
raise ValueError('Mode name already taken: %s' % name)
predefined_modes[name] = mode
......@@ -10,6 +10,8 @@ import random
import numpy.random
from theano.tests import unittest_tools as utt
import theano.tensor as T
class T_bunch_of_modes(unittest.TestCase):
......
......@@ -65,15 +65,6 @@ AddConfigVar('force_device',
BoolParam(False, allow_override=False),
in_c_key=False)
#Don't add FAST_RUN_NOGC to this list(as well as other ALL CAPS short cut)
#The way to get FAST_RUN_NOGC is with the flag 'linker=c|py_nogc'
#The old all capital letter way of working is deprecated as it is not scalable.
AddConfigVar('mode',
"Default compilation mode",
EnumStr('Mode', 'ProfileMode', 'DebugMode', 'FAST_RUN',
'FAST_COMPILE', 'PROFILE_MODE', 'DEBUG_MODE'),
in_c_key=False)
# Test whether or not gcc is present: disable C code if it is not.
# Using the dummy file descriptor below is a workaround for a crash experienced
# in an unusual Python 2.4.4 Windows environment with the default stdin=None.
......@@ -84,13 +75,15 @@ try:
# Keep the default linker the same as the one for the mode FAST_RUN
AddConfigVar('linker',
"Default linker used if the theano flags mode is Mode or ProfileMode",
EnumStr('c|py', 'py', 'c', 'c|py_nogc', 'c&py'),
EnumStr('c|py', 'py', 'c', 'c|py_nogc', 'c&py',
'vm', 'cvm', 'vm_nogc', 'cvm_nogc'),
in_c_key=False)
except OSError:
# gcc is not present, linker should default to python only
AddConfigVar('linker',
"Default linker used if the theano flags mode is Mode or ProfileMode",
EnumStr('py', 'c|py', 'c', 'c|py_nogc', 'c&py'),
EnumStr('c|py', 'py', 'c', 'c|py_nogc', 'c&py',
'vm', 'cvm', 'vm_nogc', 'cvm_nogc'),
in_c_key=False)
warning('GCC not detected ! Theano will be unable to execute optimized '+
'C-implementations (for both CPU and GPU) and will default to '+
......
#include <Python.h>
#include "structmember.h"
/**
TODO:
- Check max supported depth of recursion
- CLazyLinker should add context information to errors caught during evaluation. Say what node we were on, add the traceback attached to the node.
- Clear containers of fully-used intermediate results if allow_gc is 1
- Add timers for profiling
- Add support for profiling space used.
*/
#include <time.h>
/* Convert a timeval to seconds as a double.
   When tv is NULL, the current time (gettimeofday) is used instead. */
static double pytime(const struct timeval * tv)
{
  struct timeval now;
  if (tv == NULL)
    {
      gettimeofday(&now, NULL);
      tv = &now;
    }
  return (double) tv->tv_sec + (double) tv->tv_usec / 1000000.0;
}
/**
CLazyLinker
*/
/**
   State of the C lazy evaluator.  All of the malloc'd arrays below are owned
   by this object and released in CLazyLinker_dealloc; the PyObject pointers
   hold references acquired in CLazyLinker_init.
 */
typedef struct {
  PyObject_HEAD
  /* Type-specific fields go here. */
  PyObject * nodes; // the python list of nodes
  PyObject * thunks; // python list of thunks
  PyObject * pre_call_clear; //list of cells to clear on call.
  int allow_gc;
  Py_ssize_t n_applies;
  int n_vars;    // number of variables in the graph
  int * var_computed; // 1 or 0 for every variable
  PyObject ** var_computed_cells; // the compute-map cell (a list) for each variable
  Py_ssize_t n_output_vars;
  Py_ssize_t * output_vars; // variables that *must* be evaluated by call
  int * is_lazy; // 1 or 0 for every thunk
  Py_ssize_t * var_owner; // nodes[[var_owner[var_idx]]] is var[var_idx]->owner
  int * var_has_owner; //  1 or 0
  Py_ssize_t * node_n_inputs;
  Py_ssize_t * node_n_outputs;
  Py_ssize_t ** node_inputs;
  Py_ssize_t ** node_outputs;
  Py_ssize_t * node_inputs_outputs_base; // node_inputs and node_outputs point into this
  Py_ssize_t * node_n_prereqs;
  Py_ssize_t ** node_prereqs;
  void ** thunk_cptr_fn;   // C entry point of each thunk, NULL when Python-only
  void ** thunk_cptr_data; // data cookie passed to thunk_cptr_fn[i]
  PyObject * call_times;   // python list: cumulative seconds spent in each thunk
  PyObject * call_counts;  // python list: number of calls of each thunk
  int do_timing;           // nonzero: update call_times/call_counts on every call
  int position_of_error; // -1 for no error, otw the index into `thunks` that failed.
} CLazyLinker;
/**
   tp_dealloc: free the C-side arrays and drop the references taken in
   CLazyLinker_init.  free(NULL) and Py_XDECREF(NULL) are no-ops, so this is
   safe even when init failed part-way through.
 */
static void
CLazyLinker_dealloc(PyObject* _self)
{
  CLazyLinker* self = (CLazyLinker *) _self;
  free(self->thunk_cptr_fn);
  free(self->thunk_cptr_data);
  free(self->is_lazy);

  // each node_prereqs[i] is an individually malloc'd array
  if (self->node_n_prereqs)
    {
      for (int i = 0; i < self->n_applies; ++i)
        {
          free(self->node_prereqs[i]);
        }
    }
  free(self->node_n_prereqs);
  free(self->node_prereqs);

  // node_inputs[i] / node_outputs[i] point into node_inputs_outputs_base,
  // so only the base array and the pointer tables are freed
  free(self->node_inputs_outputs_base);
  free(self->node_n_inputs);
  free(self->node_n_outputs);
  free(self->node_inputs);
  free(self->node_outputs);

  free(self->var_owner);
  free(self->var_has_owner);
  free(self->var_computed);

  // the compute-map cells were INCREF'd in CLazyLinker_init
  if (self->var_computed_cells)
    {
      for (int i = 0; i < self->n_vars; ++i)
        {
          Py_DECREF(self->var_computed_cells[i]);
        }
    }
  free(self->var_computed_cells);
  free(self->output_vars);

  Py_XDECREF(self->nodes);
  Py_XDECREF(self->thunks);
  Py_XDECREF(self->call_times);
  Py_XDECREF(self->call_counts);
  Py_XDECREF(self->pre_call_clear);
  self->ob_type->tp_free((PyObject*)self);
}
/**
   tp_new: allocate a CLazyLinker and put every field into a known state.
   allow_gc defaults to 1 and position_of_error to -1 ("no error yet");
   everything else stays NULL/0 until CLazyLinker_init fills it in.
 */
static PyObject *
CLazyLinker_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
  CLazyLinker *self;

  self = (CLazyLinker *)type->tp_alloc(type, 0);
  if (self != NULL) {
    self->nodes = NULL;
    self->thunks = NULL;
    self->pre_call_clear = NULL;
    self->allow_gc = 1;
    self->n_applies = 0;
    self->n_vars = 0;
    self->var_computed = NULL;
    self->var_computed_cells = NULL;
    self->n_output_vars = 0;
    self->output_vars = NULL;
    self->is_lazy = NULL;
    self->var_owner = NULL;
    self->var_has_owner = NULL;
    self->node_n_inputs = NULL;
    self->node_n_outputs = NULL;
    self->node_inputs = NULL;
    self->node_outputs = NULL;
    self->node_inputs_outputs_base = NULL;
    self->node_prereqs = NULL;
    self->node_n_prereqs = NULL;
    self->thunk_cptr_data = NULL;
    self->thunk_cptr_fn = NULL;
    self->call_times = NULL;
    self->call_counts = NULL;
    self->do_timing = 0;
    self->position_of_error = -1;
  }
  return (PyObject *)self;
}
/**
   tp_init: unpack the Python-side program description into C arrays.

   The first five arguments (nodes, thunks, pre_call_clear, call_counts,
   call_times) are stored on self; "O" yields borrowed references, so they
   are explicitly INCREF'd below.  The remaining list arguments are copied
   into plain C arrays owned by the object.  Returns 0 on success, -1 on
   failure.  NOTE(review): the early size-mismatch checks return -1 without
   setting a Python exception; malloc results are checked only with assert,
   which disappears under NDEBUG.
 */
static int
CLazyLinker_init(CLazyLinker *self, PyObject *args, PyObject *kwds)
{
  // keyword names, in the exact order of the "OOOiOOOOOOOOOOOOO" format below
  static char *kwlist[] = {
    (char*)"nodes",
    (char*)"thunks",
    (char*)"pre_call_clear",
    (char*)"allow_gc",
    (char*)"call_counts",
    (char*)"call_times",
    (char*)"compute_map_list",
    (char*)"base_input_output_list",
    (char*)"node_n_inputs",
    (char*)"node_n_outputs",
    (char*)"node_input_offset",
    (char*)"node_output_offset",
    (char*)"var_owner",
    (char*)"is_lazy_list",
    (char*)"output_vars",
    (char*)"node_prereqs",
    (char*)"node_output_size",
    NULL};

  PyObject *compute_map_list=NULL,
           *base_input_output_list=NULL,
           *node_n_inputs=NULL,
           *node_n_outputs=NULL,
           *node_input_offset=NULL,
           *node_output_offset=NULL,
           *var_owner=NULL,
           *is_lazy=NULL,
           *output_vars=NULL,
           *node_prereqs=NULL,
           *node_output_size=NULL;

  assert(!self->nodes); // init must not run twice on the same instance
  if (! PyArg_ParseTupleAndKeywords(args, kwds, "OOOiOOOOOOOOOOOOO", kwlist,
                                    &self->nodes,
                                    &self->thunks,
                                    &self->pre_call_clear,
                                    &self->allow_gc,
                                    &self->call_counts,
                                    &self->call_times,
                                    &compute_map_list,
                                    &base_input_output_list,
                                    &node_n_inputs,
                                    &node_n_outputs,
                                    &node_input_offset,
                                    &node_output_offset,
                                    &var_owner,
                                    &is_lazy,
                                    &output_vars,
                                    &node_prereqs,
                                    &node_output_size
                                    ))
    return -1;
  // own the borrowed references that were stored on self by ParseTuple
  Py_INCREF(self->nodes);
  Py_INCREF(self->thunks);
  Py_INCREF(self->pre_call_clear);
  Py_INCREF(self->call_counts);
  Py_INCREF(self->call_times);

  Py_ssize_t n_applies = PyList_Size(self->nodes);

  self->n_applies = n_applies;
  self->n_vars = PyList_Size(var_owner);

  // the per-node lists must be parallel to `nodes`
  if (PyList_Size(self->thunks) != n_applies) return -1;
  if (PyList_Size(self->call_counts) != n_applies) return -1;
  if (PyList_Size(self->call_times) != n_applies) return -1;

  // allocated and initialize thunk_cptr_data and thunk_cptr_fn
  if (n_applies)
    {
      self->thunk_cptr_data = (void**)malloc(n_applies * sizeof(void*));
      self->thunk_cptr_fn = (void**)malloc(n_applies * sizeof(void*));
      self->is_lazy = (int*)malloc(n_applies * sizeof(int));
      self->node_prereqs = (Py_ssize_t**)malloc(n_applies*sizeof(Py_ssize_t*));
      self->node_n_prereqs = (Py_ssize_t*)malloc(n_applies*sizeof(Py_ssize_t));

      assert(self->node_prereqs);
      assert(self->node_n_prereqs);
      assert(self->is_lazy);
      assert(self->thunk_cptr_fn);
      assert(self->thunk_cptr_data);

      // init these basic arrays
      for (int i = 0; i < n_applies; ++i)
        {
          self->thunk_cptr_data[i] = NULL;
          self->thunk_cptr_fn[i] = NULL;
          self->is_lazy[i] = 1;
          self->node_prereqs[i] = NULL;
          self->node_n_prereqs[i] = 0;
        }

      for (int i = 0; i < n_applies; ++i)
        {
          PyObject * thunk = PyList_GetItem(self->thunks, i);
          //thunk is borrowed
          // a thunk with a `cthunk` attribute can be run directly in C
          if (PyObject_HasAttrString(thunk, "cthunk"))
            {
              PyObject * cthunk = PyObject_GetAttrString(thunk, "cthunk");
              //new reference
              assert (cthunk && PyCObject_Check(cthunk));
              self->thunk_cptr_fn[i] = PyCObject_AsVoidPtr(cthunk);
              self->thunk_cptr_data[i] = PyCObject_GetDesc(cthunk);
              Py_DECREF(cthunk);
              // cthunk is kept alive by membership in self->thunks
            }
          else
            {
              self->thunk_cptr_fn[i] = NULL;
              self->thunk_cptr_data[i] = NULL;
            }

          PyObject * el_i = PyList_GetItem(is_lazy, i);
          self->is_lazy[i] = PyNumber_AsSsize_t(el_i, NULL);

          /* now get the prereqs */
          el_i = PyList_GetItem(node_prereqs, i);
          assert (PyList_Check(el_i));
          self->node_n_prereqs[i] = PyList_Size(el_i);
          if (self->node_n_prereqs[i])
            {
              self->node_prereqs[i] = (Py_ssize_t*)malloc(
                            PyList_Size(el_i)*sizeof(Py_ssize_t));
              for (int j = 0; j < PyList_Size(el_i); ++j)
                {
                  PyObject * el_ij = PyList_GetItem(el_i, j);
                  Py_ssize_t N = PyNumber_AsSsize_t(el_ij, PyExc_IndexError);
                  if (PyErr_Occurred())
                    return -1;
                  // N < n. variables
                  assert(N < PyList_Size(var_owner));
                  self->node_prereqs[i][j] = N;
                }
            }
        }
    }

  // one flat array holds all input and output index lists; node_inputs[i]
  // and node_outputs[i] are set up below as pointers into it
  if (PyList_Check(base_input_output_list))
    {
      Py_ssize_t n_inputs_outputs_base = PyList_Size(base_input_output_list);
      self->node_inputs_outputs_base = (Py_ssize_t*)malloc(n_inputs_outputs_base*sizeof(Py_ssize_t));
      assert(self->node_inputs_outputs_base);

      for (int i = 0; i < n_inputs_outputs_base; ++i)
        {
          PyObject *el_i = PyList_GetItem(base_input_output_list, i);
          Py_ssize_t idx = PyNumber_AsSsize_t(el_i, PyExc_IndexError);
          if (PyErr_Occurred()) return -1;
          self->node_inputs_outputs_base[i] = idx;
        }

      self->node_n_inputs = (Py_ssize_t*)malloc(n_applies*sizeof(Py_ssize_t));
      assert(self->node_n_inputs);
      self->node_n_outputs = (Py_ssize_t*)malloc(n_applies*sizeof(Py_ssize_t));
      assert(self->node_n_outputs);
      self->node_inputs = (Py_ssize_t**)malloc(n_applies*sizeof(Py_ssize_t*));
      assert(self->node_inputs);
      self->node_outputs = (Py_ssize_t**)malloc(n_applies*sizeof(Py_ssize_t*));
      assert(self->node_outputs);

      for (int i = 0; i < n_applies; ++i)
        {
          Py_ssize_t N;
          N = PyNumber_AsSsize_t(PyList_GetItem(node_n_inputs, i),PyExc_IndexError);
          if (PyErr_Occurred()) return -1;
          assert (N <= n_inputs_outputs_base);
          self->node_n_inputs[i] = N;
          N = PyNumber_AsSsize_t(PyList_GetItem(node_n_outputs, i),PyExc_IndexError);
          if (PyErr_Occurred()) return -1;
          assert (N <= n_inputs_outputs_base);
          self->node_n_outputs[i] = N;
          // offsets locate node i's input/output index lists in the base array
          N = PyNumber_AsSsize_t(PyList_GetItem(node_input_offset, i),PyExc_IndexError);
          if (PyErr_Occurred()) return -1;
          assert (N <= n_inputs_outputs_base);
          self->node_inputs[i] = &self->node_inputs_outputs_base[N];
          N = PyNumber_AsSsize_t(PyList_GetItem(node_output_offset, i),PyExc_IndexError);
          if (PyErr_Occurred()) return -1;
          assert (N <= n_inputs_outputs_base);
          self->node_outputs[i] = &self->node_inputs_outputs_base[N];
        }
    }
  else
    {
      PyErr_SetString(PyExc_TypeError, "base_input_output_list must be list");
      return -1;
    }

  // allocation for var_owner
  if (PyList_Check(var_owner))
    {
      self->var_owner = (Py_ssize_t*)malloc(self->n_vars*sizeof(Py_ssize_t));
      self->var_has_owner = (int*)malloc(self->n_vars*sizeof(int));
      self->var_computed = (int*)malloc(self->n_vars*sizeof(int));
      self->var_computed_cells = (PyObject**)malloc(self->n_vars*sizeof(PyObject*));
      for (int i = 0; i < self->n_vars; ++i)
        {
          PyObject * el_i = PyList_GetItem(var_owner, i);
          if (el_i == Py_None)
            {
              // no owner: this variable is a graph input
              self->var_has_owner[i] = 0;
            }
          else
            {
              Py_ssize_t N = PyNumber_AsSsize_t(el_i, PyExc_IndexError);
              if (PyErr_Occurred()) return -1;
              assert (N <= n_applies);
              self->var_owner[i] = N;
              self->var_has_owner[i] = 1;
            }
          // keep a reference to each compute-map cell (released in dealloc)
          self->var_computed_cells[i] = PyList_GetItem(compute_map_list, i);
          Py_INCREF(self->var_computed_cells[i]);
        }
    }
  else
    {
      PyErr_SetString(PyExc_TypeError, "var_owner must be list");
      return -1;
    }

  //output vars
  if (PyList_Check(output_vars))
    {
      self->n_output_vars = PyList_Size(output_vars);
      self->output_vars = (Py_ssize_t*)malloc(self->n_output_vars*sizeof(Py_ssize_t));
      assert(self->output_vars);
      for (int i = 0; i < self->n_output_vars; ++i)
        {
          PyObject * el_i = PyList_GetItem(output_vars, i);
          Py_ssize_t N = PyNumber_AsSsize_t(el_i, PyExc_IndexError);
          if (PyErr_Occurred()) return -1;
          assert (N <= self->n_vars);
          self->output_vars[i] = N;
        }
    }
  else
    {
      PyErr_SetString(PyExc_TypeError, "output_vars must be list");
      return -1;
    }
  return 0;
}
/* Record the index of the first thunk that failed during this call.
   Later failures do not overwrite an already-recorded position. */
static void set_position_of_error(CLazyLinker * self, int owner_idx)
{
  if (-1 == self->position_of_error)
    self->position_of_error = owner_idx;
}
/**
   Call thunks[node_idx] through the Python calling convention.
   Returns a new reference to the thunk's return value, or NULL when the
   call raised.  When self->do_timing is set, wall-clock duration and call
   count are accumulated into call_times/call_counts.
 */
static PyObject * pycall(CLazyLinker * self, Py_ssize_t node_idx, int verbose)
{
  // call thunk to see which inputs it wants
  PyObject * thunk = PyList_GetItem(self->thunks, node_idx);
  // refcounting - thunk is borrowed
  PyObject * rval = NULL;
  if (self->do_timing)
    {
      double t0 = pytime(NULL);
      if (verbose) fprintf(stderr, "calling via Python (node %i)\n", (int)node_idx);
      rval = PyObject_CallObject(thunk, NULL);
      double t1 = pytime(NULL);
      // call_times[node_idx] += (t1 - t0); PyList_SetItem steals the new float
      double ti = PyFloat_AsDouble(PyList_GetItem(self->call_times, node_idx));
      PyList_SetItem(self->call_times, node_idx, PyFloat_FromDouble(t1 - t0 + ti));
      // call_counts[node_idx] += 1
      PyObject * count = PyList_GetItem(self->call_counts, node_idx);
      long icount = PyInt_AsLong(count);
      PyList_SetItem(self->call_counts, node_idx, PyInt_FromLong(icount+1));
    }
  else
    {
      if (verbose) fprintf(stderr, "calling via Python (node %i)\n", (int)node_idx);
      rval = PyObject_CallObject(thunk, NULL);
    }
  return rval;
}
/**
   Call thunks[node_idx] through its C entry point, bypassing Python.
   Returns the thunk's error code (0 on success).  On failure, the exception
   that the C thunk stashed in its __ERROR storage is re-raised via
   PyErr_Restore, and position_of_error records the failing node.
 */
static int c_call(CLazyLinker * self, Py_ssize_t node_idx, int verbose)
{
  void * ptr_addr = self->thunk_cptr_fn[node_idx];
  int (*fn)(void*) = (int (*)(void*))(ptr_addr);
  if (verbose) fprintf(stderr, "calling non-lazy shortcut (node %i)\n", (int)node_idx);
  int err = 0;
  if (self->do_timing)
    {
      double t0 = pytime(NULL);
      err = fn(self->thunk_cptr_data[node_idx]);
      double t1 = pytime(NULL);
      // accumulate elapsed time and call count for this thunk
      double ti = PyFloat_AsDouble(PyList_GetItem(self->call_times, node_idx));
      PyList_SetItem(self->call_times, node_idx, PyFloat_FromDouble(t1 - t0 + ti));
      PyObject * count = PyList_GetItem(self->call_counts, node_idx);
      long icount = PyInt_AsLong(count);
      PyList_SetItem(self->call_counts, node_idx, PyInt_FromLong(icount+1));
    }
  else
    {
      err = fn(self->thunk_cptr_data[node_idx]);
    }
  if (err)
    {
      // cast the argument to a PyList (as described near line 226 of cc.py)
      PyObject * __ERROR = ((PyObject**)self->thunk_cptr_data[node_idx])[0];
      assert (PyList_Check(__ERROR));
      assert (PyList_Size(__ERROR) == 3);
      PyObject * err_type = PyList_GetItem(__ERROR, 0); //stolen ref
      PyObject * err_msg = PyList_GetItem(__ERROR, 1); //stolen ref
      PyObject * err_trace = PyList_GetItem(__ERROR, 2); //stolen ref
      // overwrite the list slots with None so __ERROR no longer owns the
      // error objects; their references are transferred to PyErr_Restore
      PyList_SET_ITEM(__ERROR, 0, Py_None); Py_INCREF(Py_None); //clobbers old ref
      PyList_SET_ITEM(__ERROR, 1, Py_None); Py_INCREF(Py_None); //clobbers old ref
      PyList_SET_ITEM(__ERROR, 2, Py_None); Py_INCREF(Py_None); //clobbers old ref
      assert(!PyErr_Occurred()); // because CLinker hid the exception in __ERROR aka data
      PyErr_Restore(err_type, err_msg, err_trace); //steals refs to args
    }
  if (err) set_position_of_error(self, node_idx);
  return err;
}
/**
   Recursively compute variable `var_idx`.

   Inputs (no owner) and already-computed variables return immediately.
   Otherwise the owner node's prereqs are evaluated first, then the node
   itself: lazy thunks are called through Python and may return a list of
   input indices to compute before being re-run; non-lazy thunks get all
   their inputs evaluated and are then called through C when possible.
   `one` and `zero` are shared int objects written into compute-map cells.
   Returns 0 on success, nonzero on error (with a Python exception set and
   position_of_error updated).

   Fixes vs. previous revision: the "should return a list" error message was
   garbled; position_of_error was polluted on the lazy success path; rval
   leaked in the PyErr_SetObject branch.
 */
static
int lazy_rec_eval(CLazyLinker * self, Py_ssize_t var_idx, PyObject*one, PyObject*zero)
{
  int verbose = 0;
  if (verbose) fprintf(stderr, "lazy_rec computing %i\n", (int)var_idx);
  int err = 0;
  if (self->var_computed[var_idx] || !self->var_has_owner[var_idx])
    {
      // nothing to do: already computed, or a graph input
      return 0;
    }
  else
    {
      Py_ssize_t owner_idx = self->var_owner[var_idx];

      // STEP 1: compute the pre-requirements of the node
      for (int i = 0; i < self->node_n_prereqs[owner_idx]; ++i)
        {
          Py_ssize_t prereq_idx = self->node_prereqs[owner_idx][i];
          if (!self->var_computed[prereq_idx])
            {
              err = lazy_rec_eval(self, prereq_idx, one, zero);
              if (err) return err;
            }
          assert (self->var_computed[prereq_idx]);
        }

      // STEP 2: compute the node itself
      if (self->is_lazy[owner_idx])
        {
          // update the compute_map cells corresponding to the inputs of this thunk
          for (int i = 0; i < self->node_n_inputs[owner_idx] && (!err); ++i)
            {
              int in_idx = self->node_inputs[owner_idx][i];
              if (self->var_computed[in_idx])
                {
                  Py_INCREF(one);
                  err = PyList_SetItem(self->var_computed_cells[in_idx], 0, one);
                }
              else
                {
                  Py_INCREF(zero);
                  err = PyList_SetItem(self->var_computed_cells[in_idx], 0, zero);
                }
            }
          if (err)
            {
              set_position_of_error(self, owner_idx);
              return err;
            }
          PyObject * rval = pycall(self, owner_idx, verbose);
          // refcounting - rval is new ref
          //TODO: to prevent infinite loops
          // - consider check that a thunk does not ask for an input that is already computed
          if (rval) //call returned normally (no exception)
            {
              //update the computed-ness of any output cells
              for (int i = 0; i < self->node_n_outputs[owner_idx]; ++i)
                {
                  int out_idx = self->node_outputs[owner_idx][i];
                  PyObject * el_i = PyList_GetItem(self->var_computed_cells[out_idx], 0);
                  Py_ssize_t N = PyNumber_AsSsize_t(el_i, PyExc_IndexError);
                  if (PyErr_Occurred())
                    {
                      Py_DECREF(rval);
                      set_position_of_error(self, owner_idx);
                      return -1;
                    }
                  assert (N==0 || N==1);
                  self->var_computed[out_idx] = N;
                }
              if (!self->var_computed[var_idx])
                {
                  // the thunk did not compute our variable: it must have
                  // returned the list of input indices it needs first
                  if (PyList_Check(rval))
                    {
                      if (PyList_Size(rval))
                        {
                          for (int i = 0; i < PyList_Size(rval) && (!err); ++i)
                            {
                              PyObject * el_i = PyList_GetItem(rval, i);
                              Py_ssize_t N = PyNumber_AsSsize_t(el_i, PyExc_IndexError);
                              if (PyErr_Occurred())
                                {
                                  err = 1;
                                }
                              else
                                {
                                  assert (N <= self->node_n_inputs[owner_idx]);
                                  Py_ssize_t input_idx = self->node_inputs[owner_idx][N];
                                  err = lazy_rec_eval(self, input_idx, one, zero);
                                }
                            }
                          // requested inputs are now ready: retry this variable
                          if (!err)
                            err = lazy_rec_eval(self, var_idx, one, zero);
                        }
                      else
                        {
                          PyErr_SetString(PyExc_ValueError,
                                          "lazy thunk returned empty list without computing output");
                          err = 1;
                          set_position_of_error(self, owner_idx);
                        }
                      Py_DECREF(rval);
                      // BUGFIX: only record a position when there actually was
                      // an error; recording on success made position_of_error
                      // point at the wrong node for a later failure
                      if (err) set_position_of_error(self, owner_idx);
                      return err;
                    }
                  else // don't know what it returned, but it wasn't right.
                    {
                      //TODO: More helpful error to help find *which node* made this
                      // bad thunk
                      // BUGFIX: message used to read "lazy thunk should list"
                      PyErr_SetString(PyExc_TypeError,
                                      "lazy thunk should return a list");
                      Py_DECREF(rval);
                      set_position_of_error(self, owner_idx);
                      return 1;
                    }
                }
              Py_DECREF(rval);
            }
          else // pycall returned NULL (internal error)
            {
              assert (PyErr_Occurred());
              set_position_of_error(self, owner_idx);
              return 1;
            }
        }
      else //owner is not a lazy op. Ensure all intputs are evaluated.
        {
          // loop over inputs to owner
          // call lazy_rec_eval on each one that is not computed.
          // if there's an error, pass it up the stack
          for (int i = 0; i < self->node_n_inputs[owner_idx]; ++i)
            {
              Py_ssize_t input_idx = self->node_inputs[owner_idx][i];
              if (!self->var_computed[input_idx])
                {
                  err = lazy_rec_eval(self, input_idx, one, zero);
                  if (err) return err;
                }
              assert (self->var_computed[input_idx]);
            }

          // call the thunk for this owner, in C when a C entry point exists
          if (self->thunk_cptr_fn[owner_idx])
            {
              err = c_call(self, owner_idx, verbose);
            }
          else
            {
              PyObject * rval = pycall(self, owner_idx, verbose);
              //rval is new ref
              if (rval) //pycall returned normally (no exception)
                {
                  if (rval == Py_None)
                    {
                      Py_DECREF(rval); //ignore a return of None
                    }
                  else if (PyList_Check(rval))
                    {
                      PyErr_SetString(PyExc_TypeError,
                                      "non-lazy thunk should return None, not list");
                      err = 1;
                      set_position_of_error(self, owner_idx);
                      Py_DECREF(rval);
                    }
                  else // don't know what it returned, but it wasn't right.
                    {
                      PyErr_SetObject(PyExc_TypeError, rval);
                      err = 1;
                      set_position_of_error(self, owner_idx);
                      Py_DECREF(rval); // BUGFIX: rval was leaked on this path
                    }
                }
              else // pycall returned NULL (internal error)
                {
                  err = 1;
                  set_position_of_error(self, owner_idx);
                }
            }
        }
      // loop over all outputs and mark them as computed
      for (int i = 0; i < self->node_n_outputs[owner_idx] && (!err); ++i)
        {
          self->var_computed[self->node_outputs[owner_idx][i]] = 1;
        }
    }
  return err;
}
/**
   tp_call: run the program.

   Optional keyword `time_thunks` (int) toggles per-thunk timing for this
   and subsequent calls.  All pre_call_clear cells are emptied, every
   non-input variable is marked not-computed, and then each output variable
   is evaluated via lazy_rec_eval.  Returns None on success; on failure
   returns NULL with an exception set, and position_of_error holds the index
   of the failing thunk.
 */
PyObject *
CLazyLinker_call(PyObject *_self, PyObject *args, PyObject *kwds)
{
  CLazyLinker * self = (CLazyLinker*)_self;
  static char *kwlist[] = {(char*)"time_thunks", NULL};
  if (! PyArg_ParseTupleAndKeywords(args, kwds, "|i", kwlist,
                                    &self->do_timing))
    return NULL;
  int err = 0;
  self->position_of_error = -1;
  // shared int constants written into the compute-map cells
  PyObject * one = PyInt_FromLong(1);
  PyObject * zero = PyInt_FromLong(0);

  //clear storage of pre_call_clear elements
  // BUGFIX: validate before use -- the assert used to run only after
  // PyList_Size had already been called on the same object
  assert(PyList_Check(self->pre_call_clear));
  Py_ssize_t n_pre_call_clear = PyList_Size(self->pre_call_clear);
  for (int i = 0; i < n_pre_call_clear; ++i)
    {
      PyObject * el_i = PyList_GetItem(self->pre_call_clear, i);
      Py_INCREF(Py_None);
      PyList_SetItem(el_i, 0, Py_None);
    }
  //clear the computed flag out of all non-input vars
  for (int i = 0; i < self->n_vars; ++i)
    {
      // graph inputs (no owner) start out computed; all others must be evaluated
      self->var_computed[i] = !self->var_has_owner[i];
      if (self->var_computed[i])
        {
          Py_INCREF(one);
          PyList_SetItem(self->var_computed_cells[i], 0, one);
        }
      else
        {
          Py_INCREF(zero);
          PyList_SetItem(self->var_computed_cells[i], 0, zero);
        }
    }

  // evaluate each requested output, stopping at the first error
  for (int i = 0; i < self->n_output_vars && (!err); ++i)
    {
      err = lazy_rec_eval(self, self->output_vars[i], one, zero);
    }
  Py_DECREF(one);
  Py_DECREF(zero);
  if (err) return NULL;
  Py_INCREF(Py_None);
  return Py_None;
}
#if 0
static PyMethodDef CLazyLinker_methods[] = {
{
//"name", (PyCFunction)CLazyLinker_accept, METH_VARARGS, "Return the name, combining the first and last name"
},
{NULL} /* Sentinel */
};
#endif
// Attribute table: exposes the Python-object fields plus the two int flags
// (position_of_error, time_thunks) directly as instance attributes.
static PyMemberDef CLazyLinker_members[] = {
  {(char*)"nodes", T_OBJECT_EX, offsetof(CLazyLinker, nodes), 0,
   (char*)"list of nodes"},
  {(char*)"thunks", T_OBJECT_EX, offsetof(CLazyLinker, thunks), 0,
   (char*)"list of thunks in program"},
  {(char*)"call_counts", T_OBJECT_EX, offsetof(CLazyLinker, call_counts), 0,
   (char*)"number of calls of each thunk"},
  {(char*)"call_times", T_OBJECT_EX, offsetof(CLazyLinker, call_times), 0,
   (char*)"total runtime in each thunk"},
  {(char*)"position_of_error", T_INT, offsetof(CLazyLinker, position_of_error), 0,
   (char*)"position of failed thunk"},
  {(char*)"time_thunks", T_INT, offsetof(CLazyLinker, do_timing), 0,
   (char*)"bool: nonzero means call will time thunks"},
  {NULL}  /* Sentinel */
};
// Type object for CLazyLinker.  Calling an instance dispatches to
// CLazyLinker_call; construction goes through CLazyLinker_new/_init and
// destruction through CLazyLinker_dealloc.
static PyTypeObject lazylinker_ext_CLazyLinkerType = {
    PyObject_HEAD_INIT(NULL)
    0,                         /*ob_size*/
    "lazylinker_ext.CLazyLinker",             /*tp_name*/
    sizeof(CLazyLinker), /*tp_basicsize*/
    0,                         /*tp_itemsize*/
    CLazyLinker_dealloc,       /*tp_dealloc*/
    0,                         /*tp_print*/
    0,                         /*tp_getattr*/
    0,                         /*tp_setattr*/
    0,                         /*tp_compare*/
    0,                         /*tp_repr*/
    0,                         /*tp_as_number*/
    0,                         /*tp_as_sequence*/
    0,                         /*tp_as_mapping*/
    0,                         /*tp_hash */
    CLazyLinker_call,          /*tp_call*/
    0,                         /*tp_str*/
    0,                         /*tp_getattro*/
    0,                         /*tp_setattro*/
    0,                         /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE, /*tp_flags*/
    "CLazyLinker object",      /* tp_doc */
    0,                         /* tp_traverse */
    0,                         /* tp_clear */
    0,                         /* tp_richcompare */
    0,                         /* tp_weaklistoffset */
    0,                         /* tp_iter */
    0,                         /* tp_iternext */
    0,//CLazyLinker_methods,   /* tp_methods */
    CLazyLinker_members,       /* tp_members */
    0,                         /* tp_getset */
    0,                         /* tp_base */
    0,                         /* tp_dict */
    0,                         /* tp_descr_get */
    0,                         /* tp_descr_set */
    0,                         /* tp_dictoffset */
    (initproc)CLazyLinker_init,/* tp_init */
    0,                         /* tp_alloc */
    CLazyLinker_new,           /* tp_new */
};
// Module-level functions: none -- the module only exports the type.
static PyMethodDef lazylinker_ext_methods[] = {
  {NULL}  /* Sentinel */
};
#ifndef PyMODINIT_FUNC /* declarations for DLL import/export */
#define PyMODINIT_FUNC void
#endif
PyMODINIT_FUNC
initlazylinker_ext(void)
{
PyObject* m;
lazylinker_ext_CLazyLinkerType.tp_new = PyType_GenericNew;
if (PyType_Ready(&lazylinker_ext_CLazyLinkerType) < 0)
return;
m = Py_InitModule3("lazylinker_ext", lazylinker_ext_methods,
"Example module that creates an extension type.");
Py_INCREF(&lazylinker_ext_CLazyLinkerType);
PyModule_AddObject(m, "CLazyLinker", (PyObject *)&lazylinker_ext_CLazyLinkerType);
}
import os
import theano
from theano import config
from theano.gof.compilelock import get_lock, release_lock
from theano.gof import cmodule

# Compile the lazylinker C extension into the theano compiledir and import
# it.  The compile-dir lock serializes concurrent theano processes.
get_lock()
try:
    dirname = 'lazylinker_ext'
    cfile = os.path.join(theano.__path__[0], 'gof', 'lazylinker_c.c')
    # BUGFIX: the file handle returned by open() was never closed
    cfile_fd = open(cfile)
    try:
        code = cfile_fd.read()
    finally:
        cfile_fd.close()
    loc = os.path.join(config.compiledir, dirname)
    if not os.path.exists(loc):
        os.mkdir(loc)
    cmodule.gcc_module_compile_str(dirname, code, location=loc)
    from lazylinker_ext.lazylinker_ext import *
finally:
    # Release lock on compilation directory.
    release_lock()
import gc
import sys
import time
import line_profiler
import numpy
from theano import function
from theano.gof import vm,link, OpWiseCLinker
from theano.compile import Mode
from theano import tensor
from theano.lazycond import cond
import theano
def test_speed():
    # Benchmark several linkers (and raw numpy) on a deep chain of
    # elementwise additions.  Not a correctness test: it prints a
    # seconds-per-thousand-ops figure for each backend to compare by eye.
    # NOTE(review): indentation reconstructed -- the extracted source had
    # leading whitespace stripped; confirm nesting against the repository.

    def build_graph(x, depth=5):
        # symbolic chain of `depth` additions
        z = x
        for d in range(depth):
            z = (z + z)
        return z

    def numpy_version(x, depth):
        # same computation, executed eagerly with numpy
        z = x
        for d in xrange(depth):
            z = (z+z)
        return z

    def time_numpy():
        steps_a = 5
        steps_b = 100
        x = numpy.asarray([2.0, 3.0], dtype=theano.config.floatX)
        # untimed warm-up call
        numpy_version(x, steps_a)
        t0 = time.time()
        print numpy_version(x, steps_a)
        t1 = time.time()
        t2 = time.time()
        print numpy_version(x, steps_b)
        t3 = time.time()
        t_a = t1 - t0
        t_b = t3 - t2
        # subtracting the shallow-graph time cancels fixed per-call overhead
        print "%s takes %f s/Kop" % (
            'numpy',
            (1000*(t_b-t_a) / (steps_b - steps_a)))

    def time_linker(name, linker):
        steps_a = 5
        steps_b = 100
        x = tensor.vector()
        a = build_graph(x,steps_a)
        b = build_graph(x,steps_b)
        f_a = function([x], a,
                mode=Mode(optimizer=None, linker=linker()),
                #profile='f_a speed test %s'%name,
                )
        f_b = function([x], b,
                mode=Mode(optimizer=None, linker=linker()),
                #profile='f_b speed test %s'%name,
                )
        # the first call of each function is untimed warm-up
        print f_a([2.0, 3.0])
        t0 = time.time()
        print f_a([2.0, 3.0])
        t1 = time.time()
        print f_b([2.0, 3.0])
        t2 = time.time()
        print f_b([2.0, 3.0])
        t3 = time.time()
        t_a = t1 - t0
        t_b = t3 - t2
        # report time per 1000 ops, overhead-corrected as in time_numpy
        print "%s takes %f s/Kop" % (
            name,
            (1000*(t_b-t_a) / (steps_b - steps_a)))

    time_linker('c|py', OpWiseCLinker)
    time_linker('vmLinker', vm.VM_Linker)
    time_linker('vmLinker_nogc', lambda : vm.VM_Linker(allow_gc=False))
    time_linker('vmLinker_CLOOP', lambda : vm.VM_Linker(allow_gc=False,
        use_cloop=True))
    time_numpy()
def test_speed_lazy():
    # Benchmark the VM linkers on a chain of lazy conditionals (`cond` from
    # theano.lazycond).  Prints seconds-per-thousand-ops; not a correctness
    # test.  NOTE(review): indentation reconstructed -- the extracted source
    # had leading whitespace stripped; confirm nesting against the repository.

    def build_graph(x, depth=5):
        # symbolic chain of `depth` conditionals
        z = x
        for d in range(depth):
            z = cond(z> 0, -z, z)
        return z

    def time_linker(name, linker):
        steps_a = 10
        steps_b = 100
        x = tensor.vector()
        a = build_graph(x, steps_a)
        b = build_graph(x, steps_b)
        f_a = function([x], a,
                mode=Mode(optimizer=None,
                    linker=linker()),
                #profile='f_a lazy cond %s'%name,
                )
        f_b = function([x], b,
                mode=Mode(optimizer=None,
                    linker=linker()),
                #profile='f_b lazy cond %s'%name,
                )
        # the first call of each function is untimed warm-up
        print f_a([2.0])
        t0 = time.time()
        print f_a([2.0])
        t1 = time.time()
        print f_b([2.0])
        t2 = time.time()
        print f_b([2.0])
        t3 = time.time()
        t_a = t1 - t0
        t_b = t3 - t2
        # subtracting the small-graph time cancels fixed per-call overhead
        print "%s takes %f s/Kop" % (
            name,
            (1000*(t_b-t_a) / (steps_b - steps_a)))

    time_linker('vmLinker', vm.VM_Linker)
    time_linker('vmLinker_nogc', lambda : vm.VM_Linker(allow_gc=False))
    time_linker('vmLinker_C', lambda : vm.VM_Linker(allow_gc=False,
        use_cloop=True))
# Manual memory-usage checks, disabled by default: flip the flag, run, and
# watch resident memory in `top`.  NOTE(review): indentation reconstructed --
# the extracted source had leading whitespace stripped; confirm nesting
# against the repository.
run_memory_usage_tests = False
if run_memory_usage_tests:
    # these are not normal unit tests, do not run them as part of standard
    # suite. I ran them while looking at top, and stopped when memory usage was
    # stable.

    def test_leak2():
        # Repeatedly build CudaNdarrays; the refcount of the source numpy
        # array must be unchanged by each construction.
        import theano.sandbox.cuda as cuda
        for i in xrange(1000000):
            n = numpy.asarray([2.3, 4.5], dtype='f')
            c = sys.getrefcount(n)
            a = cuda.CudaNdarray(n)
            assert c == sys.getrefcount(n)
            # progress marker + forced collection every 1000 iterations
            if not i % 1000:
                print '.',
                print gc.collect(),
                print gc.collect()
            sys.stdout.flush()

    def test_no_leak_many_graphs():
        # Verify no memory leaks when creating and deleting a lot of functions
        # This isn't really a unit test, you have to run it and look at top to see
        # if there's a leak
        for i in xrange(10000):
            x = tensor.vector()
            z = x
            for d in range(10):
                z = tensor.sin(-z+ 1)
            f = function([x], z, mode=Mode(optimizer=None, linker='cvm'))
            if not i % 100:
                print gc.collect()
            sys.stdout.flush()
            gc.collect()
            if 1:
                f([2.0])
                f([3.0])
                f([4.0])
                f([5.0])

    def test_no_leak_many_call_lazy():
        # Verify no memory leaks when calling a function a lot of times
        # This isn't really a unit test, you have to run it and look at top to see
        # if there's a leak
        def build_graph(x, depth=5):
            z = x
            for d in range(depth):
                z = cond(z> 0, -z, z)
            return z

        def time_linker(name, linker):
            steps_a = 10
            x = tensor.vector()
            a = build_graph(x, steps_a)
            f_a = function([x], a,
                    mode=Mode(optimizer=None,
                        linker=linker()))
            for i in xrange(100000):
                f_a([2.0])
            if 0: # this doesn't seem to work, prints 0 for everything
                import resource
                pre = resource.getrusage(resource.RUSAGE_SELF)
                post = resource.getrusage(resource.RUSAGE_SELF)
                print pre.ru_ixrss, post.ru_ixrss
                print pre.ru_idrss, post.ru_idrss
                print pre.ru_maxrss, post.ru_maxrss
        time_linker('vmLinker_C', lambda : vm.VM_Linker(allow_gc=False, use_cloop=True))

    def test_no_leak_many_call_nonlazy():
        # Verify no memory leaks when calling a function a lot of times
        # This isn't really a unit test, you have to run it and look at top to see
        # if there's a leak
        def build_graph(x, depth=5):
            z = x
            for d in range(depth):
                z = tensor.sin(-z+1)
            return z

        def time_linker(name, linker):
            steps_a = 10
            x = tensor.vector()
            a = build_graph(x,steps_a)
            f_a = function([x], a,
                    mode=Mode(optimizer=None,
                        linker=linker()))
            for i in xrange(500000):
                f_a([2.0])
        time_linker('vmLinker_C', lambda : vm.VM_Linker(allow_gc=False, use_cloop=True))
"""
VMs that run Theano graph computations.
"""
import sys
import time
import link
import traceback
from theano.gof.python25 import all
import theano
config = theano.config
from theano.configparser import config, AddConfigVar, BoolParam
from theano import config
AddConfigVar('profile',
"If VM should collect profile information",
BoolParam(False))
def raise_with_op(op, exc_info = None):
    """Re-raise an exception, annotated with the Apply node whose thunk
    raised it.

    op - the Apply node that was being executed when the exception occurred.
    exc_info - an (exc_type, exc_value, exc_trace) triple; defaults to the
        exception currently being handled (sys.exc_info()).
    """
    if exc_info is None:
        exc_info = sys.exc_info()
    exc_type, exc_value, exc_trace = exc_info
    if exc_type == KeyboardInterrupt:
        # print a simple traceback from KeyboardInterrupt
        raise exc_type, exc_value, exc_trace
    # The graph-construction traceback, if the node carries one.
    try:
        trace = op.tag.trace
    except AttributeError:
        trace = ()
    # Attach debugging context to the exception object itself so callers can
    # report where in the graph the failure happened.
    exc_value.__thunk_trace__ = trace
    exc_value.args += (op, )
    if op in op.env.toposort():
        exc_value.args += ('Sequence id of Apply node='+str(op.env.toposort().index(op)),)
    # Re-raise with the original traceback (Python 2 three-expression raise).
    raise exc_type, exc_value, exc_trace
class VM(object):
    """
    A VM object evaluates a Theano program with its __call__ method.

    Attributes:

    call_counts - list of integers, one for each thunk. call_counts[i] is the
        number of times thunks[i] was called in the course of computations
        performed by call_with_timers().

    call_times - list of floats, one for each thunk. call_times[i] is the
        amount of runtime spent on thunks[i] in the course of computations
        performed by call_with_timers().
    """
    def __init__(self, nodes, thunks, pre_call_clear):
        """
        Allocate a virtual machine.

        nodes - a list of nodes in toposort order
        thunks - a list of thunks to execute those nodes, in toposort order
        pre_call_clear - a list of containers to empty at the beginning of
            each call.
        """
        if len(nodes) != len(thunks):
            raise ValueError()
        self.nodes = nodes
        self.thunks = thunks
        self.pre_call_clear = pre_call_clear
        self.call_counts = [0] * len(nodes)
        self.call_times = [0] * len(nodes)
        # When True, subclasses time each thunk call and update
        # call_counts / call_times.
        self.time_thunks = False

    def __call__(self):
        """
        Run the machine.

        Postcondition - all output variables have been computed.  VMs vary in
        what exactly this means and how it is done.
        """
        raise NotImplementedError('override me')

    def clear_storage(self):
        """
        Free any internal references to temporary variables.

        Free internal variables and outputs.  Essentially, free as much
        memory as possible without intefering with the ability to evaluate
        subsequent calls.
        """
        raise NotImplementedError('override me')

    def update_profile(self, profile):
        """Accumulate this VM's per-node counters into `profile`, then reset
        the local counters to zero."""
        for node, thunk, t, c in zip(self.nodes, self.thunks,
                                     self.call_times, self.call_counts):
            profile.apply_time.setdefault(node, 0.0)
            profile.apply_time[node] += t
            profile.apply_callcount.setdefault(node, 0)
            # BUG FIX: was `= c`, which discarded counts accumulated by
            # previous update_profile calls; accumulate like apply_time.
            profile.apply_callcount[node] += c
            profile.apply_cimpl[node] = hasattr(thunk, 'cthunk')

        # clear the timer info out of the buffers
        for i in range(len(self.call_times)):
            self.call_times[i] = 0.0
            self.call_counts[i] = 0
class Loop(VM):
    """
    Unconditional start-to-finish program execution in Python.

    No garbage collection is allowed on intermediate results.
    """
    def __call__(self):
        # Empty the no-recycling containers before running the program.
        for container in self.pre_call_clear:
            container[0] = None
        if self.time_thunks:
            try:
                for pos, (run_thunk, node) in enumerate(
                        zip(self.thunks, self.nodes)):
                    start = time.time()
                    run_thunk()
                    stop = time.time()
                    self.call_counts[pos] += 1
                    self.call_times[pos] += stop - start
            except:
                # Annotate the exception with the node that raised it.
                raise_with_op(node)
        else:
            try:
                for run_thunk, node in zip(self.thunks, self.nodes):
                    run_thunk()
            except:
                raise_with_op(node)
class LoopGC(VM):
    """
    Unconditional start-to-finish program execution in Python.

    Garbage collection is possible on intermediate results.
    """
    def __init__(self, nodes, thunks, pre_call_clear, post_thunk_clear):
        super(LoopGC, self).__init__(nodes, thunks, pre_call_clear)
        # post_thunk_clear[i] - containers to empty right after thunks[i]
        # runs (the storage of variables whose last consumer was thunks[i]).
        self.post_thunk_clear = post_thunk_clear
        if not (len(nodes) == len(thunks) == len(post_thunk_clear)):
            raise ValueError()

    def __call__(self):
        # Empty the no-recycling containers before running the program.
        for container in self.pre_call_clear:
            container[0] = None
        if self.time_thunks:
            try:
                for pos, (run_thunk, node, stale) in enumerate(
                        zip(self.thunks, self.nodes, self.post_thunk_clear)):
                    start = time.time()
                    run_thunk()
                    stop = time.time()
                    self.call_counts[pos] += 1
                    self.call_times[pos] += stop - start
                    # Free intermediate storage as soon as it is dead.
                    for container in stale:
                        container[0] = None
            except:
                # Annotate the exception with the node that raised it.
                raise_with_op(node)
        else:
            try:
                for run_thunk, node, stale in zip(
                        self.thunks, self.nodes, self.post_thunk_clear):
                    run_thunk()
                    for container in stale:
                        container[0] = None
            except:
                raise_with_op(node)
class Stack(VM):
    """
    Finish-to-start evalution order of thunks.

    This supports lazy evaluation of subtrees and partial
    computations of graphs when only some inputs have changed.
    """
    def __init__(self, nodes, thunks, pre_call_clear,
                 storage_map, compute_map,
                 env, allow_gc):
        """
        storage_map - dict: variable -> one-element list holding its value
        compute_map - dict: variable -> one-element list holding a
            "has been computed" flag
        env - the Env (graph) being executed
        allow_gc - free intermediate storage as soon as it is dead
        """
        super(Stack, self).__init__(nodes, thunks, pre_call_clear)
        self.allow_gc = allow_gc
        self.message = ""
        # The Apply nodes producing the graph outputs; evaluation starts
        # from these and walks backwards over un-computed inputs.
        self.base_apply_stack = [o.owner for o in env.outputs if o.owner]
        self.outputs = env.outputs
        self.storage_map = storage_map
        self.apply_time = {}
        self.outputs_size = {}
        self.compute_map = compute_map
        self.node_idx = node_idx = {}
        ords = env.orderings()

        for i, node in enumerate(self.nodes):
            node_idx[node] = i
            self.apply_time[node] = 0
            self.outputs_size[node] = []
            # destroy_dependencies: outputs of the prerequisite nodes (from
            # env.orderings(), e.g. the destroy handler) that must be
            # computed before this node may run.
            node.destroy_dependencies = []
            if node in ords:
                for prereq in ords[node]:
                    node.destroy_dependencies += prereq.outputs

        # dependencies[v] lists the variables whose computation consumes v;
        # once all of them are computed, v's storage may be freed.
        dependencies = self.dependencies = {}
        for k in storage_map:
            dependencies[k] = []
            if k.owner and k.clients:
                ls = []
                for cl in k.clients:
                    # BUG FIX: was `cl[0] is not 'output'` -- an identity
                    # comparison with a string literal, which only worked
                    # because CPython interns short strings.
                    if cl[0] != 'output':
                        ls += cl[0].outputs
                dependencies[k] += ls

        if config.profile:
            import atexit  # BUG FIX: atexit was used but never imported
            # Bytes per element, keyed by the last 3 characters of the dtype
            # name: "int8"->"nt8", "float64"->"t64", "complex128"->"128", ...
            self.memory_size_map = {"nt8": 1, "t16": 2, "t32": 4,
                                    "t64": 8, "128": 16}
            # NOTE(review): atexit_print_all is not defined on Stack or VM
            # in this file -- confirm it is provided elsewhere before
            # relying on config.profile with this VM.
            atexit.register(self.atexit_print_all)

    def __call__(self):
        """Run the program, evaluating lazily from the outputs backwards."""
        storage_map = self.storage_map
        compute_map = self.compute_map
        thunks = self.thunks
        dependencies = self.dependencies
        for k in self.storage_map:
            compute_map[k][0] = (k.owner is None)

        # apply_stack contains nodes
        apply_stack = list(self.base_apply_stack)
        last_apply_stack_len = -1
        while apply_stack:
            # Make sure something happened last time round.  This is just a
            # safety check to make sure the op is written correctly;
            # apply_stack should either decrease in length by one (a thunk
            # successfully applied), or increase in length (added
            # dependencies over and above the original).
            # NB: this doesn't catch cycles (would be too expensive/slow),
            # just stalls.
            apply_stack_len = len(apply_stack)
            assert apply_stack_len != last_apply_stack_len
            last_apply_stack_len = apply_stack_len

            current_apply = apply_stack.pop()

            # Use these for loops + breaks to short circuit evaluation.
            # This is a significant performance point.
            computed_ins = True
            for i in current_apply.inputs:
                if not compute_map[i][0]:
                    computed_ins = False
                    break
            computed_outs = True
            for o in current_apply.outputs:
                if not compute_map[o][0]:
                    computed_outs = False
                    break
            if computed_ins:
                for d in current_apply.destroy_dependencies:
                    if not compute_map[d][0]:
                        computed_ins = False
                        break

            if not thunks[self.node_idx[current_apply]].lazy:
                # Check if all inputs are in place.
                # If so compute thunk and remove it from the apply_stack.
                # If not leave it in, and add to the apply_stack those that
                # will produce you those inputs.
                if computed_ins and not computed_outs:
                    try:
                        t0 = time.time()
                        thunks[self.node_idx[current_apply]]()
                        if config.profile:
                            dt = time.time() - t0
                            self.apply_time[current_apply] += dt
                            ## Computing the memory footprint of the the op
                            # ?? What about inplace .. if the op is inplace
                            # you don't actually ask for more memory!
                            size = []
                            for (idx, o) in enumerate(
                                    thunks[self.node_idx[current_apply]].outputs):
                                if not hasattr(o[0], 'size'):
                                    size.append(-1)
                                    continue
                                s = o[0].size
                                dtype = str(o[0].dtype)
                                dtype2 = dtype[-3:]
                                # BUG FIX: was the bare name
                                # `memory_size_map` (a NameError); the table
                                # lives on self.
                                s *= self.memory_size_map[dtype2]
                                size.append(s)
                            self.outputs_size[current_apply] = size
                    except Exception:
                        raise_with_op(current_apply)
                    for o in current_apply.outputs:
                        compute_map[o][0] = 1
                    # Garbage Collection -> check if anybody else uses this
                    # input
                    if self.allow_gc:
                        for i in current_apply.inputs:
                            if (dependencies[i] and i.owner
                                    and i not in self.outputs):
                                empty_storage_map = True
                                for x in dependencies[i]:
                                    if not compute_map[x][0]:
                                        empty_storage_map = False
                                        break
                                if empty_storage_map:
                                    storage_map[i][0] = None
                elif not computed_ins:
                    # Schedule the producers of the missing inputs, then
                    # revisit this node.
                    apply_stack.append(current_apply)
                    apply_stack.extend(inp.owner
                            for inp in current_apply.inputs if inp.owner)
                    apply_stack.extend(inp.owner
                            for inp in current_apply.destroy_dependencies
                            if inp.owner)
            elif not computed_outs:
                # Lazy thunk: try and run it to see if it works.  It may
                # return the indices of inputs it still needs.
                try:
                    t0 = time.time()
                    requires = thunks[self.node_idx[current_apply]]()
                    dt = time.time() - t0
                    self.apply_time[current_apply] += dt
                except Exception:
                    raise_with_op(current_apply)

                if requires:
                    for r in requires:
                        # We are not done with this op: put it back and
                        # schedule the producers of the inputs we are
                        # missing.
                        apply_stack.append(current_apply)
                        if current_apply.inputs[r].owner:
                            apply_stack.append(current_apply.inputs[r].owner)
                else:
                    if config.profile:
                        size = []
                        for (idx, o) in enumerate(
                                thunks[self.node_idx[current_apply]].outputs):
                            if not hasattr(o[0], 'size'):
                                size.append(-1)
                                continue
                            s = o[0].size
                            dtype = str(o[0].dtype)
                            # BUG FIX: was dtype[-2:], which can never match
                            # the 3-character keys of memory_size_map (and
                            # the non-lazy branch above uses [-3:]).
                            dtype2 = dtype[-3:]
                            # BUG FIX: was the bare name `memory_size_map`.
                            s *= self.memory_size_map[dtype2]
                            size.append(s)
                        self.outputs_size[current_apply] = size
                    if self.allow_gc:
                        for i in current_apply.inputs:
                            if (dependencies[i] and i.owner and
                                    i not in self.outputs):
                                empty_storage_map = True
                                for x in dependencies[i]:
                                    if not compute_map[x][0]:
                                        empty_storage_map = False
                                        break
                                if empty_storage_map:
                                    storage_map[i][0] = None
try:
    # Optional compiled extension implementing the lazy-evaluation VM in C.
    import lazylinker_c

    class CVM(lazylinker_c.CLazyLinker, VM):
        # Expose the C implementation with the VM interface.
        def __init__(self, *args, **kwargs):
            lazylinker_c.CLazyLinker.__init__(self, *args, **kwargs)
            # skip VM.__init__
except ImportError:
    # Without the extension, CVM is simply undefined; VM_Linker with
    # use_cloop=True will then fail with a NameError on CVM.
    pass
class VM_Linker(link.LocalLinker):
    """
    Class that satisfies the Linker interface by acting as a VM factory.
    """
    def __init__(self, allow_gc=True, use_cloop=False):
        """
        allow_gc - free intermediate results during a call as soon as they
            are no longer needed.
        use_cloop - build a CVM (compiled C loop) instead of a pure-Python
            VM; requires the lazylinker_c extension.
        """
        self.env = None
        self.allow_gc = allow_gc
        self.use_cloop = use_cloop

    def accept(self, env, no_recycling=None):
        """
        :param env: a PerformLinker can have accepted one Env instance
            at a time.

        :param no_recycling: WRITEME

        :returns: self (TODO: WHY? Who calls this function?)
        """
        # BUG FIX: the default used to be the mutable literal `[]`, shared
        # across every call of accept(); use a None sentinel instead.
        if no_recycling is None:
            no_recycling = []
        if self.env is not None and self.env is not env:
            # Already bound to a different Env: delegate to a fresh linker.
            # BUG FIX: was `type(self)()`, which silently dropped the
            # configured allow_gc / use_cloop settings.
            return type(self)(
                    allow_gc=self.allow_gc,
                    use_cloop=self.use_cloop).accept(env, no_recycling)
        self.env = env
        self.no_recycling = no_recycling
        return self

    def make_vm(self, nodes, thunks,
                input_storage, output_storage, storage_map,
                post_thunk_clear,
                computed,
                compute_map
                ):
        """
        Build a VM (CVM, Loop, LoopGC or Stack) to execute `thunks`, one per
        node in `nodes` (toposort order).
        """
        pre_call_clear = [storage_map[v] for v in self.no_recycling]

        if self.use_cloop:
            # The CVM works on flat, integer-indexed arrays: create a map
            # from nodes to ints and vars to ints.
            nodes_idx = {}
            vars_idx = {}
            for i, node in enumerate(nodes):
                nodes_idx[node] = i
                for v in node.inputs + node.outputs:
                    vars_idx.setdefault(v, len(vars_idx))
            for v in self.env.inputs + self.env.outputs:
                vars_idx.setdefault(v, len(vars_idx))

            nodes_idx_inv = {}
            vars_idx_inv = {}
            for (node, i) in nodes_idx.items():
                nodes_idx_inv[i] = node
            for (var, i) in vars_idx.items():
                vars_idx_inv[i] = var

            # put storage_map and compute_map into a int-based scheme
            n_applies = len(nodes)
            storage_map_list = [storage_map[vars_idx_inv[i]]
                    for i in range(len(vars_idx_inv))]
            compute_map_list = [compute_map[vars_idx_inv[i]]
                    for i in range(len(vars_idx_inv))]
            if nodes:
                assert type(storage_map_list[0]) is list
                assert type(compute_map_list[0]) is list
            # NOTE(review): storage_map_list is built but never passed to
            # CVM below -- confirm against CLazyLinker's signature.

            # build the pointers to node inputs and offsets
            base_input_output_list = []
            node_n_inputs = []
            node_n_outputs = []
            node_input_offset = []
            node_output_offset = []
            for node in nodes:
                inputs_idx = [vars_idx[v] for v in node.inputs]
                outputs_idx = [vars_idx[v] for v in node.outputs]
                node_n_inputs.append(len(inputs_idx))
                node_n_outputs.append(len(outputs_idx))
                node_input_offset.append(len(base_input_output_list))
                base_input_output_list.extend(inputs_idx)
                node_output_offset.append(len(base_input_output_list))
                base_input_output_list.extend(outputs_idx)

            # build the var owner array
            var_owner = [None] * len(vars_idx)
            for (var, i) in vars_idx.items():
                if var.owner:
                    var_owner[i] = nodes_idx[var.owner]

            is_lazy_list = [int(th.lazy) for th in thunks]
            output_vars = [vars_idx[v] for v in self.env.outputs]

            # builds the list of prereqs induced by e.g. destroy_handler
            ords = self.env.orderings()
            node_prereqs = []
            node_output_size = []
            for i, node in enumerate(nodes):
                node_output_size.append(0)
                prereq_var_idxs = []
                for prereq_node in ords.get(node, []):
                    prereq_var_idxs.extend(
                            [vars_idx[v] for v in prereq_node.outputs])
                prereq_var_idxs = list(set(prereq_var_idxs))
                prereq_var_idxs.sort()  # TODO: why sort?
                node_prereqs.append(prereq_var_idxs)

            # Guard against refcount leaks in the C constructor: building
            # the CVM must not retain extra references to its arguments.
            c0 = sys.getrefcount(node_n_inputs)
            vm = CVM(
                    nodes,
                    thunks,
                    pre_call_clear,
                    allow_gc=self.allow_gc,
                    call_counts=[0] * len(nodes),
                    call_times=[0.0] * len(nodes),
                    compute_map_list=compute_map_list,
                    base_input_output_list=base_input_output_list,
                    node_n_inputs=node_n_inputs,
                    node_n_outputs=node_n_outputs,
                    node_input_offset=node_input_offset,
                    node_output_offset=node_output_offset,
                    var_owner=var_owner,
                    is_lazy_list=is_lazy_list,
                    output_vars=output_vars,
                    node_prereqs=node_prereqs,
                    node_output_size=node_output_size,
                    )
            assert c0 == sys.getrefcount(node_n_inputs)
        else:
            if all([(not th.lazy) for th in thunks]):
                # there is no conditional in the graph
                if self.allow_gc:
                    vm = LoopGC(
                            nodes,
                            thunks,
                            pre_call_clear,
                            post_thunk_clear)
                else:
                    vm = Loop(
                            nodes,
                            thunks,
                            pre_call_clear)
            else:
                # Some thunks are lazy: use the stack-based evaluator, which
                # supports partial evaluation.
                vm = Stack(
                        nodes, thunks, pre_call_clear,
                        storage_map, compute_map,
                        self.env, self.allow_gc
                        )
        return vm

    def make_all(self, profiler = None, input_storage = None, output_storage = None):
        """
        Allocate storage, build one thunk per Apply node, and wrap them in a
        VM.  Returns (vm, input_containers, output_containers, thunks,
        order), as required by the LocalLinker interface.
        """
        env = self.env
        order = list(env.toposort())
        no_recycling = self.no_recycling

        input_storage, output_storage, storage_map = link.map_storage(
                env, order, input_storage, output_storage)
        # compute_map[v][0] is True once v's value is available; graph
        # inputs (no owner) start out computed.
        compute_map = {}
        for k in storage_map:
            compute_map[k] = [k.owner is None]

        thunks = [node.op.make_thunk(node,
                    storage_map,
                    compute_map,
                    no_recycling)
                        for node in order]

        computed, last_user = link.gc_helper(order)
        if self.allow_gc:
            # post_thunk_clear[i] lists the storage cells that become dead
            # right after thunk i runs (last use of a non-output variable).
            post_thunk_clear = []
            for node in order:
                clear_after_this_thunk = []
                for input in node.inputs:
                    if ((input in computed)
                            and (input not in env.outputs)
                            and (node == last_user[input])):
                        clear_after_this_thunk.append(storage_map[input])
                post_thunk_clear.append(clear_after_this_thunk)
        else:
            post_thunk_clear = None

        vm = self.make_vm(order, thunks,
                input_storage, output_storage, storage_map,
                post_thunk_clear,
                computed,
                compute_map
                )

        return (vm,
                [link.Container(input, storage)
                    for input, storage in zip(env.inputs, input_storage)],
                [link.Container(output, storage, True)
                    for output, storage in zip(env.outputs, output_storage)],
                thunks,
                order)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论