Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
9433d5d2
提交
9433d5d2
authored
7月 06, 2011
作者:
James Bergstra
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
vm linker
上级
2677e15f
隐藏空白字符变更
内嵌
并排
正在显示
8 个修改的文件
包含
1651 行增加
和
19 行删除
+1651
-19
function_module.py
theano/compile/function_module.py
+1
-0
mode.py
theano/compile/mode.py
+27
-8
test_modes.py
theano/compile/tests/test_modes.py
+2
-0
configdefaults.py
theano/configdefaults.py
+4
-11
lazylinker_c.c
theano/gof/lazylinker_c.c
+799
-0
lazylinker_c.py
theano/gof/lazylinker_c.py
+20
-0
test_vm.py
theano/gof/tests/test_vm.py
+239
-0
vm.py
theano/gof/vm.py
+559
-0
没有找到文件。
theano/compile/function_module.py
浏览文件 @
9433d5d2
...
...
@@ -331,6 +331,7 @@ class Function(object):
self
.
unpack_single
=
unpack_single
self
.
return_none
=
return_none
self
.
maker
=
maker
self
.
profile
=
None
# reassigned in FunctionMaker.create
# We will be popping stuff off this `containers` object. It is a copy.
containers
=
list
(
self
.
input_storage
)
...
...
theano/compile/mode.py
浏览文件 @
9433d5d2
...
...
@@ -4,7 +4,9 @@ import os, logging
import
numpy
,
theano
from
theano
import
gof
from
theano.configparser
import
config
,
AddConfigVar
,
StrParam
import
theano.gof.vm
from
theano.configparser
import
config
,
AddConfigVar
,
StrParam
,
EnumStr
_logger
=
logging
.
getLogger
(
'theano.compile.mode'
)
...
...
@@ -55,7 +57,11 @@ predefined_linkers = {
'c'
:
gof
.
CLinker
(),
'c|py'
:
gof
.
OpWiseCLinker
(
allow_gc
=
True
),
'c|py_nogc'
:
gof
.
OpWiseCLinker
(
allow_gc
=
False
),
'c&py'
:
gof
.
DualLinker
(
checker
=
check_equal
)
'c&py'
:
gof
.
DualLinker
(
checker
=
check_equal
),
'vm'
:
gof
.
vm
.
VM_Linker
(
allow_gc
=
True
,
use_cloop
=
False
),
'cvm'
:
gof
.
vm
.
VM_Linker
(
allow_gc
=
True
,
use_cloop
=
True
),
'vm_nogc'
:
gof
.
vm
.
VM_Linker
(
allow_gc
=
False
,
use_cloop
=
False
),
'cvm_nogc'
:
gof
.
vm
.
VM_Linker
(
allow_gc
=
False
,
use_cloop
=
True
),
}
...
...
@@ -249,6 +255,7 @@ class Mode(object):
self
.
_optimizer
=
optimizer
self
.
call_time
=
0
self
.
fn_time
=
0
linker
.
mode
=
self
#TODO: WHY IS THIS HERE?
self
.
optimizer_time
=
0
self
.
linker_time
=
0
...
...
@@ -290,15 +297,27 @@ class Mode(object):
FAST_COMPILE
=
Mode
(
'py'
,
'fast_compile'
)
FAST_RUN
=
Mode
(
'c|py'
,
'fast_run'
)
FAST_RUN_NOGC
=
Mode
(
"c|py_nogc"
,
'fast_run'
)
SANITY_CHECK
=
[
Mode
(
'c|py'
,
None
),
Mode
(
'c|py'
,
'fast_run'
)]
STABILIZE
=
Mode
(
"c|py"
,
OPT_STABILIZE
)
predefined_modes
=
{
'FAST_COMPILE'
:
FAST_COMPILE
,
'FAST_RUN'
:
FAST_RUN
,
'FAST_RUN_NOGC'
:
FAST_RUN_NOGC
,
'SANITY_CHECK'
:
SANITY_CHECK
,
'STABILIZE'
:
STABILIZE
}
'STABILIZE'
:
STABILIZE
,
'VM'
:
Mode
(
'vm'
,
'fast_run'
),
'VM_NOGC'
:
Mode
(
'vm_nogc'
,
'fast_run'
),
'CVM'
:
Mode
(
'cvm'
,
'fast_run'
),
'CVM_NOGC'
:
Mode
(
'cvm_nogc'
,
'fast_run'
),
}
#Don't add FAST_RUN_NOGC to this list(as well as other ALL CAPS short cut)
#The way to get FAST_RUN_NOGC is with the flag 'linker=c|py_nogc'
#The old all capital letter way of working is deprecated as it is not scalable.
AddConfigVar
(
'mode'
,
"Default compilation mode"
,
EnumStr
(
*
(
predefined_modes
.
keys
()
+
[
'Mode'
,
'DEBUG_MODE'
,
'PROFILE_MODE'
])),
in_c_key
=
False
)
instanciated_default_mode
=
None
def
get_mode
(
orig_string
):
...
...
@@ -329,7 +348,7 @@ def get_mode(orig_string):
ret
=
DebugMode
(
optimizer
=
config
.
optimizer
)
else
:
# The import is needed in case string is ProfileMode
from
profilemode
import
ProfileMode
from
profilemode
import
ProfileMode
,
prof_mode_instance_to_print
ret
=
eval
(
string
+
'(linker=config.linker, optimizer=config.optimizer)'
)
elif
predefined_modes
.
has_key
(
string
):
ret
=
predefined_modes
[
string
]
...
...
@@ -349,7 +368,6 @@ def get_mode(orig_string):
#must tell python to print the summary at the end.
if
string
==
'ProfileMode'
:
#need to import later to break circular dependency.
from
profilemode
import
prof_mode_instance_to_print
prof_mode_instance_to_print
.
append
(
ret
)
return
ret
...
...
@@ -365,3 +383,4 @@ def register_mode(name, mode):
if
name
in
predefined_modes
:
raise
ValueError
(
'Mode name already taken:
%
s'
%
name
)
predefined_modes
[
name
]
=
mode
theano/compile/tests/test_modes.py
浏览文件 @
9433d5d2
...
...
@@ -10,6 +10,8 @@ import random
import
numpy.random
from
theano.tests
import
unittest_tools
as
utt
import
theano.tensor
as
T
class
T_bunch_of_modes
(
unittest
.
TestCase
):
...
...
theano/configdefaults.py
浏览文件 @
9433d5d2
...
...
@@ -65,15 +65,6 @@ AddConfigVar('force_device',
BoolParam
(
False
,
allow_override
=
False
),
in_c_key
=
False
)
#Don't add FAST_RUN_NOGC to this list(as well as other ALL CAPS short cut)
#The way to get FAST_RUN_NOGC is with the flag 'linker=c|py_nogc'
#The old all capital letter way of working is deprecated as it is not scalable.
AddConfigVar
(
'mode'
,
"Default compilation mode"
,
EnumStr
(
'Mode'
,
'ProfileMode'
,
'DebugMode'
,
'FAST_RUN'
,
'FAST_COMPILE'
,
'PROFILE_MODE'
,
'DEBUG_MODE'
),
in_c_key
=
False
)
# Test whether or not gcc is present: disable C code if it is not.
# Using the dummy file descriptor below is a workaround for a crash experienced
# in an unusual Python 2.4.4 Windows environment with the default stdin=None.
...
...
@@ -84,13 +75,15 @@ try:
# Keep the default linker the same as the one for the mode FAST_RUN
AddConfigVar
(
'linker'
,
"Default linker used if the theano flags mode is Mode or ProfileMode"
,
EnumStr
(
'c|py'
,
'py'
,
'c'
,
'c|py_nogc'
,
'c&py'
),
EnumStr
(
'c|py'
,
'py'
,
'c'
,
'c|py_nogc'
,
'c&py'
,
'vm'
,
'cvm'
,
'vm_nogc'
,
'cvm_nogc'
),
in_c_key
=
False
)
except
OSError
:
# gcc is not present, linker should default to python only
AddConfigVar
(
'linker'
,
"Default linker used if the theano flags mode is Mode or ProfileMode"
,
EnumStr
(
'py'
,
'c|py'
,
'c'
,
'c|py_nogc'
,
'c&py'
),
EnumStr
(
'c|py'
,
'py'
,
'c'
,
'c|py_nogc'
,
'c&py'
,
'vm'
,
'cvm'
,
'vm_nogc'
,
'cvm_nogc'
),
in_c_key
=
False
)
warning
(
'GCC not detected ! Theano will be unable to execute optimized '
+
'C-implementations (for both CPU and GPU) and will default to '
+
...
...
theano/gof/lazylinker_c.c
0 → 100644
浏览文件 @
9433d5d2
#include <Python.h>
#include "structmember.h"
/**
TODO:
- Check max supported depth of recursion
- CLazyLinker should add context information to errors caught during evaluation. Say what node we were on, add the traceback attached to the node.
- Clear containers of fully-useed intermediate results if allow_gc is 1
- Add timers for profiling
- Add support for profiling space used.
*/
#include <time.h>
static
double
pytime
(
const
struct
timeval
*
tv
)
{
struct
timeval
t
;
if
(
!
tv
)
{
tv
=
&
t
;
gettimeofday
(
&
t
,
NULL
);
}
return
(
double
)
tv
->
tv_sec
+
(
double
)
tv
->
tv_usec
/
1000000
.
0
;
}
/**
CLazyLinker
*/
typedef
struct
{
PyObject_HEAD
/* Type-specific fields go here. */
PyObject
*
nodes
;
// the python list of nodes
PyObject
*
thunks
;
// python list of thunks
PyObject
*
pre_call_clear
;
//list of cells to clear on call.
int
allow_gc
;
Py_ssize_t
n_applies
;
int
n_vars
;
// number of variables in the graph
int
*
var_computed
;
// 1 or 0 for every variable
PyObject
**
var_computed_cells
;
Py_ssize_t
n_output_vars
;
Py_ssize_t
*
output_vars
;
// variables that *must* be evaluated by call
int
*
is_lazy
;
// 1 or 0 for every thunk
Py_ssize_t
*
var_owner
;
// nodes[[var_owner[var_idx]]] is var[var_idx]->owner
int
*
var_has_owner
;
// 1 or 0
Py_ssize_t
*
node_n_inputs
;
Py_ssize_t
*
node_n_outputs
;
Py_ssize_t
**
node_inputs
;
Py_ssize_t
**
node_outputs
;
Py_ssize_t
*
node_inputs_outputs_base
;
// node_inputs and node_outputs point into this
Py_ssize_t
*
node_n_prereqs
;
Py_ssize_t
**
node_prereqs
;
void
**
thunk_cptr_fn
;
void
**
thunk_cptr_data
;
PyObject
*
call_times
;
PyObject
*
call_counts
;
int
do_timing
;
int
position_of_error
;
// -1 for no error, otw the index into `thunks` that failed.
}
CLazyLinker
;
static
void
CLazyLinker_dealloc
(
PyObject
*
_self
)
{
CLazyLinker
*
self
=
(
CLazyLinker
*
)
_self
;
free
(
self
->
thunk_cptr_fn
);
free
(
self
->
thunk_cptr_data
);
free
(
self
->
is_lazy
);
if
(
self
->
node_n_prereqs
)
{
for
(
int
i
=
0
;
i
<
self
->
n_applies
;
++
i
)
{
free
(
self
->
node_prereqs
[
i
]);
}
}
free
(
self
->
node_n_prereqs
);
free
(
self
->
node_prereqs
);
free
(
self
->
node_inputs_outputs_base
);
free
(
self
->
node_n_inputs
);
free
(
self
->
node_n_outputs
);
free
(
self
->
node_inputs
);
free
(
self
->
node_outputs
);
free
(
self
->
var_owner
);
free
(
self
->
var_has_owner
);
free
(
self
->
var_computed
);
if
(
self
->
var_computed_cells
)
{
for
(
int
i
=
0
;
i
<
self
->
n_vars
;
++
i
)
{
Py_DECREF
(
self
->
var_computed_cells
[
i
]);
}
}
free
(
self
->
var_computed_cells
);
free
(
self
->
output_vars
);
Py_XDECREF
(
self
->
nodes
);
Py_XDECREF
(
self
->
thunks
);
Py_XDECREF
(
self
->
call_times
);
Py_XDECREF
(
self
->
call_counts
);
Py_XDECREF
(
self
->
pre_call_clear
);
self
->
ob_type
->
tp_free
((
PyObject
*
)
self
);
}
static
PyObject
*
CLazyLinker_new
(
PyTypeObject
*
type
,
PyObject
*
args
,
PyObject
*
kwds
)
{
CLazyLinker
*
self
;
self
=
(
CLazyLinker
*
)
type
->
tp_alloc
(
type
,
0
);
if
(
self
!=
NULL
)
{
self
->
nodes
=
NULL
;
self
->
thunks
=
NULL
;
self
->
pre_call_clear
=
NULL
;
self
->
allow_gc
=
1
;
self
->
n_applies
=
0
;
self
->
n_vars
=
0
;
self
->
var_computed
=
NULL
;
self
->
var_computed_cells
=
NULL
;
self
->
n_output_vars
=
0
;
self
->
output_vars
=
NULL
;
self
->
is_lazy
=
NULL
;
self
->
var_owner
=
NULL
;
self
->
var_has_owner
=
NULL
;
self
->
node_n_inputs
=
NULL
;
self
->
node_n_outputs
=
NULL
;
self
->
node_inputs
=
NULL
;
self
->
node_outputs
=
NULL
;
self
->
node_inputs_outputs_base
=
NULL
;
self
->
node_prereqs
=
NULL
;
self
->
node_n_prereqs
=
NULL
;
self
->
thunk_cptr_data
=
NULL
;
self
->
thunk_cptr_fn
=
NULL
;
self
->
call_times
=
NULL
;
self
->
call_counts
=
NULL
;
self
->
do_timing
=
0
;
self
->
position_of_error
=
-
1
;
}
return
(
PyObject
*
)
self
;
}
static
int
CLazyLinker_init
(
CLazyLinker
*
self
,
PyObject
*
args
,
PyObject
*
kwds
)
{
static
char
*
kwlist
[]
=
{
(
char
*
)
"nodes"
,
(
char
*
)
"thunks"
,
(
char
*
)
"pre_call_clear"
,
(
char
*
)
"allow_gc"
,
(
char
*
)
"call_counts"
,
(
char
*
)
"call_times"
,
(
char
*
)
"compute_map_list"
,
(
char
*
)
"base_input_output_list"
,
(
char
*
)
"node_n_inputs"
,
(
char
*
)
"node_n_outputs"
,
(
char
*
)
"node_input_offset"
,
(
char
*
)
"node_output_offset"
,
(
char
*
)
"var_owner"
,
(
char
*
)
"is_lazy_list"
,
(
char
*
)
"output_vars"
,
(
char
*
)
"node_prereqs"
,
(
char
*
)
"node_output_size"
,
NULL
};
PyObject
*
compute_map_list
=
NULL
,
*
base_input_output_list
=
NULL
,
*
node_n_inputs
=
NULL
,
*
node_n_outputs
=
NULL
,
*
node_input_offset
=
NULL
,
*
node_output_offset
=
NULL
,
*
var_owner
=
NULL
,
*
is_lazy
=
NULL
,
*
output_vars
=
NULL
,
*
node_prereqs
=
NULL
,
*
node_output_size
=
NULL
;
assert
(
!
self
->
nodes
);
if
(
!
PyArg_ParseTupleAndKeywords
(
args
,
kwds
,
"OOOiOOOOOOOOOOOOO"
,
kwlist
,
&
self
->
nodes
,
&
self
->
thunks
,
&
self
->
pre_call_clear
,
&
self
->
allow_gc
,
&
self
->
call_counts
,
&
self
->
call_times
,
&
compute_map_list
,
&
base_input_output_list
,
&
node_n_inputs
,
&
node_n_outputs
,
&
node_input_offset
,
&
node_output_offset
,
&
var_owner
,
&
is_lazy
,
&
output_vars
,
&
node_prereqs
,
&
node_output_size
))
return
-
1
;
Py_INCREF
(
self
->
nodes
);
Py_INCREF
(
self
->
thunks
);
Py_INCREF
(
self
->
pre_call_clear
);
Py_INCREF
(
self
->
call_counts
);
Py_INCREF
(
self
->
call_times
);
Py_ssize_t
n_applies
=
PyList_Size
(
self
->
nodes
);
self
->
n_applies
=
n_applies
;
self
->
n_vars
=
PyList_Size
(
var_owner
);
if
(
PyList_Size
(
self
->
thunks
)
!=
n_applies
)
return
-
1
;
if
(
PyList_Size
(
self
->
call_counts
)
!=
n_applies
)
return
-
1
;
if
(
PyList_Size
(
self
->
call_times
)
!=
n_applies
)
return
-
1
;
// allocated and initialize thunk_cptr_data and thunk_cptr_fn
if
(
n_applies
)
{
self
->
thunk_cptr_data
=
(
void
**
)
malloc
(
n_applies
*
sizeof
(
void
*
));
self
->
thunk_cptr_fn
=
(
void
**
)
malloc
(
n_applies
*
sizeof
(
void
*
));
self
->
is_lazy
=
(
int
*
)
malloc
(
n_applies
*
sizeof
(
int
));
self
->
node_prereqs
=
(
Py_ssize_t
**
)
malloc
(
n_applies
*
sizeof
(
Py_ssize_t
*
));
self
->
node_n_prereqs
=
(
Py_ssize_t
*
)
malloc
(
n_applies
*
sizeof
(
Py_ssize_t
));
assert
(
self
->
node_prereqs
);
assert
(
self
->
node_n_prereqs
);
assert
(
self
->
is_lazy
);
assert
(
self
->
thunk_cptr_fn
);
assert
(
self
->
thunk_cptr_data
);
// init these basic arrays
for
(
int
i
=
0
;
i
<
n_applies
;
++
i
)
{
self
->
thunk_cptr_data
[
i
]
=
NULL
;
self
->
thunk_cptr_fn
[
i
]
=
NULL
;
self
->
is_lazy
[
i
]
=
1
;
self
->
node_prereqs
[
i
]
=
NULL
;
self
->
node_n_prereqs
[
i
]
=
0
;
}
for
(
int
i
=
0
;
i
<
n_applies
;
++
i
)
{
PyObject
*
thunk
=
PyList_GetItem
(
self
->
thunks
,
i
);
//thunk is borrowed
if
(
PyObject_HasAttrString
(
thunk
,
"cthunk"
))
{
PyObject
*
cthunk
=
PyObject_GetAttrString
(
thunk
,
"cthunk"
);
//new reference
assert
(
cthunk
&&
PyCObject_Check
(
cthunk
));
self
->
thunk_cptr_fn
[
i
]
=
PyCObject_AsVoidPtr
(
cthunk
);
self
->
thunk_cptr_data
[
i
]
=
PyCObject_GetDesc
(
cthunk
);
Py_DECREF
(
cthunk
);
// cthunk is kept alive by membership in self->thunks
}
else
{
self
->
thunk_cptr_fn
[
i
]
=
NULL
;
self
->
thunk_cptr_data
[
i
]
=
NULL
;
}
PyObject
*
el_i
=
PyList_GetItem
(
is_lazy
,
i
);
self
->
is_lazy
[
i
]
=
PyNumber_AsSsize_t
(
el_i
,
NULL
);
/* now get the prereqs */
el_i
=
PyList_GetItem
(
node_prereqs
,
i
);
assert
(
PyList_Check
(
el_i
));
self
->
node_n_prereqs
[
i
]
=
PyList_Size
(
el_i
);
if
(
self
->
node_n_prereqs
[
i
])
{
self
->
node_prereqs
[
i
]
=
(
Py_ssize_t
*
)
malloc
(
PyList_Size
(
el_i
)
*
sizeof
(
Py_ssize_t
));
for
(
int
j
=
0
;
j
<
PyList_Size
(
el_i
);
++
j
)
{
PyObject
*
el_ij
=
PyList_GetItem
(
el_i
,
j
);
Py_ssize_t
N
=
PyNumber_AsSsize_t
(
el_ij
,
PyExc_IndexError
);
if
(
PyErr_Occurred
())
return
-
1
;
// N < n. variables
assert
(
N
<
PyList_Size
(
var_owner
));
self
->
node_prereqs
[
i
][
j
]
=
N
;
}
}
}
}
if
(
PyList_Check
(
base_input_output_list
))
{
Py_ssize_t
n_inputs_outputs_base
=
PyList_Size
(
base_input_output_list
);
self
->
node_inputs_outputs_base
=
(
Py_ssize_t
*
)
malloc
(
n_inputs_outputs_base
*
sizeof
(
Py_ssize_t
));
assert
(
self
->
node_inputs_outputs_base
);
for
(
int
i
=
0
;
i
<
n_inputs_outputs_base
;
++
i
)
{
PyObject
*
el_i
=
PyList_GetItem
(
base_input_output_list
,
i
);
Py_ssize_t
idx
=
PyNumber_AsSsize_t
(
el_i
,
PyExc_IndexError
);
if
(
PyErr_Occurred
())
return
-
1
;
self
->
node_inputs_outputs_base
[
i
]
=
idx
;
}
self
->
node_n_inputs
=
(
Py_ssize_t
*
)
malloc
(
n_applies
*
sizeof
(
Py_ssize_t
));
assert
(
self
->
node_n_inputs
);
self
->
node_n_outputs
=
(
Py_ssize_t
*
)
malloc
(
n_applies
*
sizeof
(
Py_ssize_t
));
assert
(
self
->
node_n_outputs
);
self
->
node_inputs
=
(
Py_ssize_t
**
)
malloc
(
n_applies
*
sizeof
(
Py_ssize_t
*
));
assert
(
self
->
node_inputs
);
self
->
node_outputs
=
(
Py_ssize_t
**
)
malloc
(
n_applies
*
sizeof
(
Py_ssize_t
*
));
assert
(
self
->
node_outputs
);
for
(
int
i
=
0
;
i
<
n_applies
;
++
i
)
{
Py_ssize_t
N
;
N
=
PyNumber_AsSsize_t
(
PyList_GetItem
(
node_n_inputs
,
i
),
PyExc_IndexError
);
if
(
PyErr_Occurred
())
return
-
1
;
assert
(
N
<=
n_inputs_outputs_base
);
self
->
node_n_inputs
[
i
]
=
N
;
N
=
PyNumber_AsSsize_t
(
PyList_GetItem
(
node_n_outputs
,
i
),
PyExc_IndexError
);
if
(
PyErr_Occurred
())
return
-
1
;
assert
(
N
<=
n_inputs_outputs_base
);
self
->
node_n_outputs
[
i
]
=
N
;
N
=
PyNumber_AsSsize_t
(
PyList_GetItem
(
node_input_offset
,
i
),
PyExc_IndexError
);
if
(
PyErr_Occurred
())
return
-
1
;
assert
(
N
<=
n_inputs_outputs_base
);
self
->
node_inputs
[
i
]
=
&
self
->
node_inputs_outputs_base
[
N
];
N
=
PyNumber_AsSsize_t
(
PyList_GetItem
(
node_output_offset
,
i
),
PyExc_IndexError
);
if
(
PyErr_Occurred
())
return
-
1
;
assert
(
N
<=
n_inputs_outputs_base
);
self
->
node_outputs
[
i
]
=
&
self
->
node_inputs_outputs_base
[
N
];
}
}
else
{
PyErr_SetString
(
PyExc_TypeError
,
"base_input_output_list must be list"
);
return
-
1
;
}
// allocation for var_owner
if
(
PyList_Check
(
var_owner
))
{
self
->
var_owner
=
(
Py_ssize_t
*
)
malloc
(
self
->
n_vars
*
sizeof
(
Py_ssize_t
));
self
->
var_has_owner
=
(
int
*
)
malloc
(
self
->
n_vars
*
sizeof
(
int
));
self
->
var_computed
=
(
int
*
)
malloc
(
self
->
n_vars
*
sizeof
(
int
));
self
->
var_computed_cells
=
(
PyObject
**
)
malloc
(
self
->
n_vars
*
sizeof
(
PyObject
*
));
for
(
int
i
=
0
;
i
<
self
->
n_vars
;
++
i
)
{
PyObject
*
el_i
=
PyList_GetItem
(
var_owner
,
i
);
if
(
el_i
==
Py_None
)
{
self
->
var_has_owner
[
i
]
=
0
;
}
else
{
Py_ssize_t
N
=
PyNumber_AsSsize_t
(
el_i
,
PyExc_IndexError
);
if
(
PyErr_Occurred
())
return
-
1
;
assert
(
N
<=
n_applies
);
self
->
var_owner
[
i
]
=
N
;
self
->
var_has_owner
[
i
]
=
1
;
}
self
->
var_computed_cells
[
i
]
=
PyList_GetItem
(
compute_map_list
,
i
);
Py_INCREF
(
self
->
var_computed_cells
[
i
]);
}
}
else
{
PyErr_SetString
(
PyExc_TypeError
,
"var_owner must be list"
);
return
-
1
;
}
//output vars
if
(
PyList_Check
(
output_vars
))
{
self
->
n_output_vars
=
PyList_Size
(
output_vars
);
self
->
output_vars
=
(
Py_ssize_t
*
)
malloc
(
self
->
n_output_vars
*
sizeof
(
Py_ssize_t
));
assert
(
self
->
output_vars
);
for
(
int
i
=
0
;
i
<
self
->
n_output_vars
;
++
i
)
{
PyObject
*
el_i
=
PyList_GetItem
(
output_vars
,
i
);
Py_ssize_t
N
=
PyNumber_AsSsize_t
(
el_i
,
PyExc_IndexError
);
if
(
PyErr_Occurred
())
return
-
1
;
assert
(
N
<=
self
->
n_vars
);
self
->
output_vars
[
i
]
=
N
;
}
}
else
{
PyErr_SetString
(
PyExc_TypeError
,
"output_vars must be list"
);
return
-
1
;
}
return
0
;
}
static
void
set_position_of_error
(
CLazyLinker
*
self
,
int
owner_idx
)
{
if
(
self
->
position_of_error
==
-
1
)
{
self
->
position_of_error
=
owner_idx
;
}
}
static
PyObject
*
pycall
(
CLazyLinker
*
self
,
Py_ssize_t
node_idx
,
int
verbose
)
{
// call thunk to see which inputs it wants
PyObject
*
thunk
=
PyList_GetItem
(
self
->
thunks
,
node_idx
);
// refcounting - thunk is borrowed
PyObject
*
rval
=
NULL
;
if
(
self
->
do_timing
)
{
double
t0
=
pytime
(
NULL
);
if
(
verbose
)
fprintf
(
stderr
,
"calling via Python (node %i)
\n
"
,
(
int
)
node_idx
);
rval
=
PyObject_CallObject
(
thunk
,
NULL
);
double
t1
=
pytime
(
NULL
);
double
ti
=
PyFloat_AsDouble
(
PyList_GetItem
(
self
->
call_times
,
node_idx
));
PyList_SetItem
(
self
->
call_times
,
node_idx
,
PyFloat_FromDouble
(
t1
-
t0
+
ti
));
PyObject
*
count
=
PyList_GetItem
(
self
->
call_counts
,
node_idx
);
long
icount
=
PyInt_AsLong
(
count
);
PyList_SetItem
(
self
->
call_counts
,
node_idx
,
PyInt_FromLong
(
icount
+
1
));
}
else
{
if
(
verbose
)
fprintf
(
stderr
,
"calling via Python (node %i)
\n
"
,
(
int
)
node_idx
);
rval
=
PyObject_CallObject
(
thunk
,
NULL
);
}
return
rval
;
}
static
int
c_call
(
CLazyLinker
*
self
,
Py_ssize_t
node_idx
,
int
verbose
)
{
void
*
ptr_addr
=
self
->
thunk_cptr_fn
[
node_idx
];
int
(
*
fn
)(
void
*
)
=
(
int
(
*
)(
void
*
))(
ptr_addr
);
if
(
verbose
)
fprintf
(
stderr
,
"calling non-lazy shortcut (node %i)
\n
"
,
(
int
)
node_idx
);
int
err
=
0
;
if
(
self
->
do_timing
)
{
double
t0
=
pytime
(
NULL
);
err
=
fn
(
self
->
thunk_cptr_data
[
node_idx
]);
double
t1
=
pytime
(
NULL
);
double
ti
=
PyFloat_AsDouble
(
PyList_GetItem
(
self
->
call_times
,
node_idx
));
PyList_SetItem
(
self
->
call_times
,
node_idx
,
PyFloat_FromDouble
(
t1
-
t0
+
ti
));
PyObject
*
count
=
PyList_GetItem
(
self
->
call_counts
,
node_idx
);
long
icount
=
PyInt_AsLong
(
count
);
PyList_SetItem
(
self
->
call_counts
,
node_idx
,
PyInt_FromLong
(
icount
+
1
));
}
else
{
err
=
fn
(
self
->
thunk_cptr_data
[
node_idx
]);
}
if
(
err
)
{
// cast the argument to a PyList (as described near line 226 of cc.py)
PyObject
*
__ERROR
=
((
PyObject
**
)
self
->
thunk_cptr_data
[
node_idx
])[
0
];
assert
(
PyList_Check
(
__ERROR
));
assert
(
PyList_Size
(
__ERROR
)
==
3
);
PyObject
*
err_type
=
PyList_GetItem
(
__ERROR
,
0
);
//stolen ref
PyObject
*
err_msg
=
PyList_GetItem
(
__ERROR
,
1
);
//stolen ref
PyObject
*
err_trace
=
PyList_GetItem
(
__ERROR
,
2
);
//stolen ref
PyList_SET_ITEM
(
__ERROR
,
0
,
Py_None
);
Py_INCREF
(
Py_None
);
//clobbers old ref
PyList_SET_ITEM
(
__ERROR
,
1
,
Py_None
);
Py_INCREF
(
Py_None
);
//clobbers old ref
PyList_SET_ITEM
(
__ERROR
,
2
,
Py_None
);
Py_INCREF
(
Py_None
);
//clobbers old ref
assert
(
!
PyErr_Occurred
());
// because CLinker hid the exception in __ERROR aka data
PyErr_Restore
(
err_type
,
err_msg
,
err_trace
);
//steals refs to args
}
if
(
err
)
set_position_of_error
(
self
,
node_idx
);
return
err
;
}
static
int
lazy_rec_eval
(
CLazyLinker
*
self
,
Py_ssize_t
var_idx
,
PyObject
*
one
,
PyObject
*
zero
)
{
int
verbose
=
0
;
if
(
verbose
)
fprintf
(
stderr
,
"lazy_rec computing %i
\n
"
,
(
int
)
var_idx
);
int
err
=
0
;
if
(
self
->
var_computed
[
var_idx
]
||
!
self
->
var_has_owner
[
var_idx
])
{
return
0
;
}
else
{
Py_ssize_t
owner_idx
=
self
->
var_owner
[
var_idx
];
// STEP 1: compute the pre-requirements of the node
for
(
int
i
=
0
;
i
<
self
->
node_n_prereqs
[
owner_idx
];
++
i
)
{
Py_ssize_t
prereq_idx
=
self
->
node_prereqs
[
owner_idx
][
i
];
if
(
!
self
->
var_computed
[
prereq_idx
])
{
err
=
lazy_rec_eval
(
self
,
prereq_idx
,
one
,
zero
);
if
(
err
)
return
err
;
}
assert
(
self
->
var_computed
[
prereq_idx
]);
}
// STEP 2: compute the node itself
if
(
self
->
is_lazy
[
owner_idx
])
{
// update the compute_map cells corresponding to the inputs of this thunk
for
(
int
i
=
0
;
i
<
self
->
node_n_inputs
[
owner_idx
]
&&
(
!
err
);
++
i
)
{
int
in_idx
=
self
->
node_inputs
[
owner_idx
][
i
];
if
(
self
->
var_computed
[
in_idx
])
{
Py_INCREF
(
one
);
err
=
PyList_SetItem
(
self
->
var_computed_cells
[
in_idx
],
0
,
one
);
}
else
{
Py_INCREF
(
zero
);
err
=
PyList_SetItem
(
self
->
var_computed_cells
[
in_idx
],
0
,
zero
);
}
}
if
(
err
)
{
set_position_of_error
(
self
,
owner_idx
);
return
err
;
}
PyObject
*
rval
=
pycall
(
self
,
owner_idx
,
verbose
);
// refcounting - rval is new ref
//TODO: to prevent infinite loops
// - consider check that a thunk does not ask for an input that is already computed
if
(
rval
)
//call returned normally (no exception)
{
//update the computed-ness of any output cells
for
(
int
i
=
0
;
i
<
self
->
node_n_outputs
[
owner_idx
];
++
i
)
{
int
out_idx
=
self
->
node_outputs
[
owner_idx
][
i
];
PyObject
*
el_i
=
PyList_GetItem
(
self
->
var_computed_cells
[
out_idx
],
0
);
Py_ssize_t
N
=
PyNumber_AsSsize_t
(
el_i
,
PyExc_IndexError
);
if
(
PyErr_Occurred
())
{
Py_DECREF
(
rval
);
set_position_of_error
(
self
,
owner_idx
);
return
-
1
;
}
assert
(
N
==
0
||
N
==
1
);
self
->
var_computed
[
out_idx
]
=
N
;
}
if
(
!
self
->
var_computed
[
var_idx
])
{
if
(
PyList_Check
(
rval
))
{
if
(
PyList_Size
(
rval
))
{
for
(
int
i
=
0
;
i
<
PyList_Size
(
rval
)
&&
(
!
err
);
++
i
)
{
PyObject
*
el_i
=
PyList_GetItem
(
rval
,
i
);
Py_ssize_t
N
=
PyNumber_AsSsize_t
(
el_i
,
PyExc_IndexError
);
if
(
PyErr_Occurred
())
{
err
=
1
;
}
else
{
assert
(
N
<=
self
->
node_n_inputs
[
owner_idx
]);
Py_ssize_t
input_idx
=
self
->
node_inputs
[
owner_idx
][
N
];
err
=
lazy_rec_eval
(
self
,
input_idx
,
one
,
zero
);
}
}
if
(
!
err
)
err
=
lazy_rec_eval
(
self
,
var_idx
,
one
,
zero
);
}
else
{
PyErr_SetString
(
PyExc_ValueError
,
"lazy thunk returned empty list without computing output"
);
err
=
1
;
set_position_of_error
(
self
,
owner_idx
);
}
Py_DECREF
(
rval
);
set_position_of_error
(
self
,
owner_idx
);
return
err
;
}
else
// don't know what it returned, but it wasn't right.
{
//TODO: More helpful error to help find *which node* made this
// bad thunk
PyErr_SetString
(
PyExc_TypeError
,
"lazy thunk should list"
);
Py_DECREF
(
rval
);
set_position_of_error
(
self
,
owner_idx
);
return
1
;
}
}
Py_DECREF
(
rval
);
}
else
// pycall returned NULL (internal error)
{
assert
(
PyErr_Occurred
());
set_position_of_error
(
self
,
owner_idx
);
return
1
;
}
}
else
//owner is not a lazy op. Ensure all intputs are evaluated.
{
// loop over inputs to owner
// call lazy_rec_eval on each one that is not computed.
// if there's an error, pass it up the stack
for
(
int
i
=
0
;
i
<
self
->
node_n_inputs
[
owner_idx
];
++
i
)
{
Py_ssize_t
input_idx
=
self
->
node_inputs
[
owner_idx
][
i
];
if
(
!
self
->
var_computed
[
input_idx
])
{
err
=
lazy_rec_eval
(
self
,
input_idx
,
one
,
zero
);
if
(
err
)
return
err
;
}
assert
(
self
->
var_computed
[
input_idx
]);
}
// call the thunk for this owner.
if
(
self
->
thunk_cptr_fn
[
owner_idx
])
{
err
=
c_call
(
self
,
owner_idx
,
verbose
);
}
else
{
PyObject
*
rval
=
pycall
(
self
,
owner_idx
,
verbose
);
//rval is new ref
if
(
rval
)
//pycall returned normally (no exception)
{
if
(
rval
==
Py_None
)
{
Py_DECREF
(
rval
);
//ignore a return of None
}
else
if
(
PyList_Check
(
rval
))
{
PyErr_SetString
(
PyExc_TypeError
,
"non-lazy thunk should return None, not list"
);
err
=
1
;
set_position_of_error
(
self
,
owner_idx
);
Py_DECREF
(
rval
);
}
else
// don't know what it returned, but it wasn't right.
{
PyErr_SetObject
(
PyExc_TypeError
,
rval
);
err
=
1
;
set_position_of_error
(
self
,
owner_idx
);
}
}
else
// pycall returned NULL (internal error)
{
err
=
1
;
set_position_of_error
(
self
,
owner_idx
);
}
}
}
// loop over all outputs and mark them as computed
for
(
int
i
=
0
;
i
<
self
->
node_n_outputs
[
owner_idx
]
&&
(
!
err
);
++
i
)
{
self
->
var_computed
[
self
->
node_outputs
[
owner_idx
][
i
]]
=
1
;
}
}
return
err
;
}
PyObject
*
CLazyLinker_call
(
PyObject
*
_self
,
PyObject
*
args
,
PyObject
*
kwds
)
{
CLazyLinker
*
self
=
(
CLazyLinker
*
)
_self
;
static
char
*
kwlist
[]
=
{(
char
*
)
"time_thunks"
,
NULL
};
if
(
!
PyArg_ParseTupleAndKeywords
(
args
,
kwds
,
"|i"
,
kwlist
,
&
self
->
do_timing
))
return
NULL
;
int
err
=
0
;
self
->
position_of_error
=
-
1
;
PyObject
*
one
=
PyInt_FromLong
(
1
);
PyObject
*
zero
=
PyInt_FromLong
(
0
);
//clear storage of pre_call_clear elements
Py_ssize_t
n_pre_call_clear
=
PyList_Size
(
self
->
pre_call_clear
);
assert
(
PyList_Check
(
self
->
pre_call_clear
));
for
(
int
i
=
0
;
i
<
n_pre_call_clear
;
++
i
)
{
PyObject
*
el_i
=
PyList_GetItem
(
self
->
pre_call_clear
,
i
);
Py_INCREF
(
Py_None
);
PyList_SetItem
(
el_i
,
0
,
Py_None
);
}
//clear the computed flag out of all non-input vars
for
(
int
i
=
0
;
i
<
self
->
n_vars
;
++
i
)
{
self
->
var_computed
[
i
]
=
!
self
->
var_has_owner
[
i
];
if
(
self
->
var_computed
[
i
])
{
Py_INCREF
(
one
);
PyList_SetItem
(
self
->
var_computed_cells
[
i
],
0
,
one
);
}
else
{
Py_INCREF
(
zero
);
PyList_SetItem
(
self
->
var_computed_cells
[
i
],
0
,
zero
);
}
}
for
(
int
i
=
0
;
i
<
self
->
n_output_vars
&&
(
!
err
);
++
i
)
{
err
=
lazy_rec_eval
(
self
,
self
->
output_vars
[
i
],
one
,
zero
);
}
Py_DECREF
(
one
);
Py_DECREF
(
zero
);
if
(
err
)
return
NULL
;
Py_INCREF
(
Py_None
);
return
Py_None
;
}
#if 0
static PyMethodDef CLazyLinker_methods[] = {
{
//"name", (PyCFunction)CLazyLinker_accept, METH_VARARGS, "Return the name, combining the first and last name"
},
{NULL} /* Sentinel */
};
#endif
static
PyMemberDef
CLazyLinker_members
[]
=
{
{(
char
*
)
"nodes"
,
T_OBJECT_EX
,
offsetof
(
CLazyLinker
,
nodes
),
0
,
(
char
*
)
"list of nodes"
},
{(
char
*
)
"thunks"
,
T_OBJECT_EX
,
offsetof
(
CLazyLinker
,
thunks
),
0
,
(
char
*
)
"list of thunks in program"
},
{(
char
*
)
"call_counts"
,
T_OBJECT_EX
,
offsetof
(
CLazyLinker
,
call_counts
),
0
,
(
char
*
)
"number of calls of each thunk"
},
{(
char
*
)
"call_times"
,
T_OBJECT_EX
,
offsetof
(
CLazyLinker
,
call_times
),
0
,
(
char
*
)
"total runtime in each thunk"
},
{(
char
*
)
"position_of_error"
,
T_INT
,
offsetof
(
CLazyLinker
,
position_of_error
),
0
,
(
char
*
)
"position of failed thunk"
},
{(
char
*
)
"time_thunks"
,
T_INT
,
offsetof
(
CLazyLinker
,
do_timing
),
0
,
(
char
*
)
"bool: nonzero means call will time thunks"
},
{
NULL
}
/* Sentinel */
};
static
PyTypeObject
lazylinker_ext_CLazyLinkerType
=
{
PyObject_HEAD_INIT
(
NULL
)
0
,
/*ob_size*/
"lazylinker_ext.CLazyLinker"
,
/*tp_name*/
sizeof
(
CLazyLinker
),
/*tp_basicsize*/
0
,
/*tp_itemsize*/
CLazyLinker_dealloc
,
/*tp_dealloc*/
0
,
/*tp_print*/
0
,
/*tp_getattr*/
0
,
/*tp_setattr*/
0
,
/*tp_compare*/
0
,
/*tp_repr*/
0
,
/*tp_as_number*/
0
,
/*tp_as_sequence*/
0
,
/*tp_as_mapping*/
0
,
/*tp_hash */
CLazyLinker_call
,
/*tp_call*/
0
,
/*tp_str*/
0
,
/*tp_getattro*/
0
,
/*tp_setattro*/
0
,
/*tp_as_buffer*/
Py_TPFLAGS_DEFAULT
|
Py_TPFLAGS_BASETYPE
,
/*tp_flags*/
"CLazyLinker object"
,
/* tp_doc */
0
,
/* tp_traverse */
0
,
/* tp_clear */
0
,
/* tp_richcompare */
0
,
/* tp_weaklistoffset */
0
,
/* tp_iter */
0
,
/* tp_iternext */
0
,
//CLazyLinker_methods, /* tp_methods */
CLazyLinker_members
,
/* tp_members */
0
,
/* tp_getset */
0
,
/* tp_base */
0
,
/* tp_dict */
0
,
/* tp_descr_get */
0
,
/* tp_descr_set */
0
,
/* tp_dictoffset */
(
initproc
)
CLazyLinker_init
,
/* tp_init */
0
,
/* tp_alloc */
CLazyLinker_new
,
/* tp_new */
};
static
PyMethodDef
lazylinker_ext_methods
[]
=
{
{
NULL
}
/* Sentinel */
};
#ifndef PyMODINIT_FUNC
/* declarations for DLL import/export */
#define PyMODINIT_FUNC void
#endif
PyMODINIT_FUNC
initlazylinker_ext
(
void
)
{
PyObject
*
m
;
lazylinker_ext_CLazyLinkerType
.
tp_new
=
PyType_GenericNew
;
if
(
PyType_Ready
(
&
lazylinker_ext_CLazyLinkerType
)
<
0
)
return
;
m
=
Py_InitModule3
(
"lazylinker_ext"
,
lazylinker_ext_methods
,
"Example module that creates an extension type."
);
Py_INCREF
(
&
lazylinker_ext_CLazyLinkerType
);
PyModule_AddObject
(
m
,
"CLazyLinker"
,
(
PyObject
*
)
&
lazylinker_ext_CLazyLinkerType
);
}
theano/gof/lazylinker_c.py
0 → 100644
浏览文件 @
9433d5d2
import
os
import
theano
from
theano
import
config
from
theano.gof.compilelock
import
get_lock
,
release_lock
from
theano.gof
import
cmodule
get_lock
()
try
:
dirname
=
'lazylinker_ext'
cfile
=
os
.
path
.
join
(
theano
.
__path__
[
0
],
'gof'
,
'lazylinker_c.c'
)
code
=
open
(
cfile
)
.
read
()
loc
=
os
.
path
.
join
(
config
.
compiledir
,
dirname
)
if
not
os
.
path
.
exists
(
loc
):
os
.
mkdir
(
loc
)
cmodule
.
gcc_module_compile_str
(
dirname
,
code
,
location
=
loc
)
from
lazylinker_ext.lazylinker_ext
import
*
finally
:
# Release lock on compilation directory.
release_lock
()
theano/gof/tests/test_vm.py
0 → 100644
浏览文件 @
9433d5d2
import
gc
import
sys
import
time
import
line_profiler
import
numpy
from
theano
import
function
from
theano.gof
import
vm
,
link
,
OpWiseCLinker
from
theano.compile
import
Mode
from
theano
import
tensor
from
theano.lazycond
import
cond
import
theano
def test_speed():
    # Benchmark harness (not a pass/fail unit test): compares the per-op
    # overhead of several linkers against raw numpy on a chain of
    # elementwise additions.

    def build_graph(x, depth=5):
        # Build a symbolic chain of `depth` additions: x -> x+x -> ...
        z = x
        for d in range(depth):
            z = (z + z)
        return z

    def numpy_version(x, depth):
        # Same chain evaluated eagerly with numpy, as the reference timing.
        z = x
        for d in xrange(depth):
            z = (z + z)
        return z

    def time_numpy():
        steps_a = 5
        steps_b = 100
        x = numpy.asarray([2.0, 3.0], dtype=theano.config.floatX)

        numpy_version(x, steps_a)  # warm-up call, excluded from timing
        t0 = time.time()
        print numpy_version(x, steps_a)
        t1 = time.time()
        t2 = time.time()
        print numpy_version(x, steps_b)
        t3 = time.time()
        t_a = t1 - t0
        t_b = t3 - t2

        # Subtracting the short run from the long run cancels fixed
        # per-call overhead; the difference is scaled to seconds per
        # 1000 operations ("s/Kop").
        print "%s takes %f s/Kop" % (
                'numpy',
                (1000 * (t_b - t_a) / (steps_b - steps_a)))

    def time_linker(name, linker):
        # Time the same graphs compiled with `linker` (a zero-argument
        # factory returning a fresh linker instance).
        steps_a = 5
        steps_b = 100
        x = tensor.vector()
        a = build_graph(x, steps_a)
        b = build_graph(x, steps_b)

        f_a = function([x], a,
                mode=Mode(optimizer=None, linker=linker()),
                #profile='f_a speed test %s'%name,
                )
        f_b = function([x], b,
                mode=Mode(optimizer=None, linker=linker()),
                #profile='f_b speed test %s'%name,
                )

        # First call of each function is the warm-up (may trigger
        # compilation); only the second call is timed.
        print f_a([2.0, 3.0])
        t0 = time.time()
        print f_a([2.0, 3.0])
        t1 = time.time()

        print f_b([2.0, 3.0])
        t2 = time.time()
        print f_b([2.0, 3.0])
        t3 = time.time()

        t_a = t1 - t0
        t_b = t3 - t2

        print "%s takes %f s/Kop" % (
                name,
                (1000 * (t_b - t_a) / (steps_b - steps_a)))

    time_linker('c|py', OpWiseCLinker)
    time_linker('vmLinker', vm.VM_Linker)
    time_linker('vmLinker_nogc', lambda: vm.VM_Linker(allow_gc=False))
    time_linker('vmLinker_CLOOP',
            lambda: vm.VM_Linker(allow_gc=False, use_cloop=True))
    time_numpy()
def test_speed_lazy():
    # Benchmark harness for lazy evaluation: same shape as test_speed, but
    # the graph is a chain of `cond` (lazy if-then-else) nodes, which only
    # the VM-based linkers can evaluate lazily.

    def build_graph(x, depth=5):
        # Chain of `depth` conditionals; each flips the sign when z > 0.
        z = x
        for d in range(depth):
            z = cond(z > 0, -z, z)
        return z

    def time_linker(name, linker):
        steps_a = 10
        steps_b = 100
        x = tensor.vector()
        a = build_graph(x, steps_a)
        b = build_graph(x, steps_b)

        f_a = function([x], a,
                mode=Mode(optimizer=None, linker=linker()),
                #profile='f_a lazy cond %s'%name,
                )
        f_b = function([x], b,
                mode=Mode(optimizer=None, linker=linker()),
                #profile='f_b lazy cond %s'%name,
                )

        # First call is the warm-up; second call of each function is timed.
        print f_a([2.0])
        t0 = time.time()
        print f_a([2.0])
        t1 = time.time()

        print f_b([2.0])
        t2 = time.time()
        print f_b([2.0])
        t3 = time.time()

        t_a = t1 - t0
        t_b = t3 - t2

        # Difference of long and short runs, scaled to seconds per 1000 ops.
        print "%s takes %f s/Kop" % (
                name,
                (1000 * (t_b - t_a) / (steps_b - steps_a)))

    time_linker('vmLinker', vm.VM_Linker)
    time_linker('vmLinker_nogc', lambda: vm.VM_Linker(allow_gc=False))
    time_linker('vmLinker_C',
            lambda: vm.VM_Linker(allow_gc=False, use_cloop=True))
# Manual memory-usage checks: flip this flag to expose them to the test
# collector. They loop for a long time and are judged by watching RSS in
# `top`, not by assertions.
run_memory_usage_tests = False
if run_memory_usage_tests:
    # these are not normal unit tests, do not run them as part of standard
    # suite. I ran them while looking at top, and stopped when memory usage was
    # stable.
    def test_leak2():
        # Repeatedly wrap a numpy array in a CudaNdarray and check that the
        # source array's refcount is unchanged (i.e. no reference is leaked).
        import theano.sandbox.cuda as cuda
        for i in xrange(1000000):
            n = numpy.asarray([2.3, 4.5], dtype='f')
            c = sys.getrefcount(n)
            a = cuda.CudaNdarray(n)
            assert c == sys.getrefcount(n)
            if not i % 1000:
                print '.',
                print gc.collect(),
                print gc.collect()
            sys.stdout.flush()

    def test_no_leak_many_graphs():
        # Verify no memory leaks when creating and deleting a lot of functions
        # This isn't really a unit test, you have to run it and look at top to see
        # if there's a leak
        for i in xrange(10000):
            x = tensor.vector()
            z = x
            for d in range(10):
                z = tensor.sin(-z + 1)
            f = function([x], z, mode=Mode(optimizer=None, linker='cvm'))
            if not i % 100:
                print gc.collect()
            sys.stdout.flush()
            gc.collect()
            if 1:
                f([2.0])
                f([3.0])
                f([4.0])
                f([5.0])

    def test_no_leak_many_call_lazy():
        # Verify no memory leaks when calling a function a lot of times
        # This isn't really a unit test, you have to run it and look at top to see
        # if there's a leak
        def build_graph(x, depth=5):
            # Chain of lazy conditionals, as in test_speed_lazy.
            z = x
            for d in range(depth):
                z = cond(z > 0, -z, z)
            return z

        def time_linker(name, linker):
            steps_a = 10
            x = tensor.vector()
            a = build_graph(x, steps_a)
            f_a = function([x], a,
                    mode=Mode(optimizer=None, linker=linker()))
            for i in xrange(100000):
                f_a([2.0])
            if 0:
                # this doesn't seem to work, prints 0 for everything
                import resource
                pre = resource.getrusage(resource.RUSAGE_SELF)
                post = resource.getrusage(resource.RUSAGE_SELF)
                print pre.ru_ixrss, post.ru_ixrss
                print pre.ru_idrss, post.ru_idrss
                print pre.ru_maxrss, post.ru_maxrss
        time_linker('vmLinker_C',
                lambda: vm.VM_Linker(allow_gc=False, use_cloop=True))

    def test_no_leak_many_call_nonlazy():
        # Verify no memory leaks when calling a function a lot of times
        # This isn't really a unit test, you have to run it and look at top to see
        # if there's a leak
        def build_graph(x, depth=5):
            # Purely strict (non-lazy) chain of sin nodes.
            z = x
            for d in range(depth):
                z = tensor.sin(-z + 1)
            return z

        def time_linker(name, linker):
            steps_a = 10
            x = tensor.vector()
            a = build_graph(x, steps_a)
            f_a = function([x], a,
                    mode=Mode(optimizer=None, linker=linker()))
            for i in xrange(500000):
                f_a([2.0])
        time_linker('vmLinker_C',
                lambda: vm.VM_Linker(allow_gc=False, use_cloop=True))
theano/gof/vm.py
0 → 100644
浏览文件 @
9433d5d2
"""
VMs that run Theano graph computations.
"""
import
sys
import
time
import
link
import
traceback
from
theano.gof.python25
import
all
import
theano
config
=
theano
.
config
from
theano.configparser
import
config
,
AddConfigVar
,
BoolParam
from
theano
import
config
AddConfigVar
(
'profile'
,
"If VM should collect profile information"
,
BoolParam
(
False
))
def raise_with_op(op, exc_info=None):
    """Re-raise the current (or given) exception, annotated with `op`.

    The Apply node `op` that was executing when the exception occurred is
    appended to the exception's args (along with its position in the
    toposort, when available), and the op's creation traceback -- if the
    op carries one in op.tag.trace -- is attached as __thunk_trace__, so
    post-mortem debugging can locate the failing part of the graph.
    """
    if exc_info is None:
        exc_info = sys.exc_info()
    exc_type, exc_value, exc_trace = exc_info
    if exc_type == KeyboardInterrupt:
        # print a simple traceback from KeyboardInterrupt
        # (no annotation: the user interrupted, the op is not at fault)
        raise exc_type, exc_value, exc_trace
    try:
        trace = op.tag.trace
    except AttributeError:
        # Not every op records a creation traceback.
        trace = ()
    exc_value.__thunk_trace__ = trace
    exc_value.args += (op,)
    if op in op.env.toposort():
        # Include the op's position in the execution order, to make the
        # failing node easier to find in a printed graph.
        exc_value.args += (
                'Sequence id of Apply node=' +
                str(op.env.toposort().index(op)),)
    raise exc_type, exc_value, exc_trace
class VM(object):
    """
    A VM object evaluates a Theano program with its __call__ method.

    Attributes:

    call_counts - list of integers, one for each thunk. call_count[i] is the
        number of times thunks[i] was called in the course of computations
        performed by call_with_timers().

    call_times - list of floats, one for each thunk. call_times[i] is the
        amount of runtime spent on thunks[i] in the course of computations
        performed by call_with_timers().
    """

    def __init__(self, nodes, thunks, pre_call_clear):
        """
        Allocate a virtual machine.

        nodes - a list of nodes in toposort order
        thunks - a list of thunks to execute those nodes, in toposort order
        pre_call_clear - a list of containers to empty at the beginning of
            each call.
        """
        if len(nodes) != len(thunks):
            raise ValueError()
        self.nodes = nodes
        self.thunks = thunks
        self.pre_call_clear = pre_call_clear
        # Per-thunk profiling buffers, reset by update_profile().
        self.call_counts = [0] * len(nodes)
        self.call_times = [0] * len(nodes)
        # When True, subclasses time each thunk and fill the buffers above.
        self.time_thunks = False

    def __call__(self):
        """
        Run the machine.

        Postcondition - all output variables have been computed. VMs vary
        in what exactly this means and how it is done.
        """
        raise NotImplementedError('override me')

    def clear_storage(self):
        """
        Free any internal references to temporary variables.

        Free internal variables and outputs. Essentially, free as much
        memory as possible without interfering with the ability to evaluate
        subsequent calls.
        """
        raise NotImplementedError('override me')

    def update_profile(self, profile):
        """Flush the per-thunk timing buffers into `profile`.

        `profile` must expose dict attributes apply_time, apply_callcount
        and apply_cimpl, keyed by apply node.
        """
        # accumulate into the profile object
        for node, thunk, t, c in zip(self.nodes, self.thunks,
                self.call_times, self.call_counts):
            profile.apply_time.setdefault(node, 0.0)
            profile.apply_time[node] += t
            profile.apply_callcount.setdefault(node, 0)
            # BUG FIX: accumulate the call count (+=) instead of
            # overwriting it (=); the original assignment discarded counts
            # from previous flushes, which contradicts both the preceding
            # setdefault and the += used for apply_time above.
            profile.apply_callcount[node] += c
            profile.apply_cimpl[node] = hasattr(thunk, 'cthunk')

        # clear the timer info out of the buffers
        for i in range(len(self.call_times)):
            self.call_times[i] = 0.0
            self.call_counts[i] = 0
class Loop(VM):
    """
    Unconditional start-to-finish program execution in Python.
    No garbage collection is allowed on intermediate results.
    """

    def __call__(self):
        # Empty the containers that must not recycle their previous value.
        for container in self.pre_call_clear:
            container[0] = None

        if self.time_thunks:
            try:
                for pos, (fn, node) in enumerate(
                        zip(self.thunks, self.nodes)):
                    start = time.time()
                    fn()
                    stop = time.time()
                    self.call_counts[pos] += 1
                    self.call_times[pos] += stop - start
            except:
                # Annotate and re-raise with the node that failed.
                raise_with_op(node)
        else:
            try:
                for fn, node in zip(self.thunks, self.nodes):
                    fn()
            except:
                raise_with_op(node)
class LoopGC(VM):
    """
    Unconditional start-to-finish program execution in Python.
    Garbage collection is possible on intermediate results.
    """

    def __init__(self, nodes, thunks, pre_call_clear, post_thunk_clear):
        # post_thunk_clear[i] lists the storage cells whose contents may be
        # dropped as soon as thunks[i] has run.
        super(LoopGC, self).__init__(nodes, thunks, pre_call_clear)
        self.post_thunk_clear = post_thunk_clear
        if not (len(nodes) == len(thunks) == len(post_thunk_clear)):
            raise ValueError()

    def __call__(self):
        # Empty the containers that must not recycle their previous value.
        for container in self.pre_call_clear:
            container[0] = None

        if self.time_thunks:
            try:
                for pos, (fn, node, freeable) in enumerate(
                        zip(self.thunks, self.nodes,
                            self.post_thunk_clear)):
                    start = time.time()
                    fn()
                    stop = time.time()
                    self.call_counts[pos] += 1
                    self.call_times[pos] += stop - start
                    # Free intermediates nobody will read again.
                    for cell in freeable:
                        cell[0] = None
            except:
                # Annotate and re-raise with the node that failed.
                raise_with_op(node)
        else:
            try:
                for fn, node, freeable in zip(self.thunks, self.nodes,
                        self.post_thunk_clear):
                    fn()
                    for cell in freeable:
                        cell[0] = None
            except:
                raise_with_op(node)
class Stack(VM):
    """
    Finish-to-start evaluation order of thunks.

    This supports lazy evaluation of subtrees and partial
    computations of graphs when only some inputs have changed.
    """

    def __init__(self, nodes, thunks, pre_call_clear,
            storage_map, compute_map, env, allow_gc):
        super(Stack, self).__init__(nodes, thunks, pre_call_clear)

        self.allow_gc = allow_gc
        self.message = ""
        # The apply nodes that produce the graph outputs: the roots of the
        # finish-to-start traversal.
        self.base_apply_stack = [o.owner for o in env.outputs if o.owner]
        self.outputs = env.outputs
        self.storage_map = storage_map
        self.apply_time = {}
        self.outputs_size = {}
        self.compute_map = compute_map
        self.node_idx = node_idx = {}

        # env.orderings() yields extra prerequisites (e.g. imposed by the
        # destroy handler): a node must wait until its prereq nodes'
        # outputs are computed.
        ords = env.orderings()

        for i, node in enumerate(self.nodes):
            node_idx[node] = i
            self.apply_time[node] = 0
            self.outputs_size[node] = []
            node.destroy_dependencies = []
            if node in ords:
                for prereq in ords[node]:
                    node.destroy_dependencies += prereq.outputs

        # dependencies[var] = the variables computed from `var`; used to
        # decide when a stored intermediate can be garbage-collected.
        dependencies = self.dependencies = {}
        for k in storage_map:
            dependencies[k] = []
            if k.owner and k.clients:
                ls = []
                for cl in k.clients:
                    # BUG FIX: use != instead of `is not`; identity
                    # comparison against a string literal only worked by
                    # accident of CPython interning.
                    if cl[0] != 'output':
                        ls += cl[0].outputs
                dependencies[k] += ls

        if config.profile:
            # BUG FIX: atexit was used below without ever being imported
            # in this module.
            import atexit
            # Bytes per element, keyed by the last 3 characters of the
            # dtype name: 'int8'[-3:] == 'nt8', 'float32'[-3:] == 't32',
            # 'complex128'[-3:] == '128', etc.
            self.memory_size_map = {"nt8": 1, "t16": 2, "t32": 4,
                    "t64": 8, "128": 16}
            # NOTE(review): atexit_print_all is not defined on this class;
            # this register() call will raise AttributeError unless a
            # subclass or monkey-patch provides it -- TODO confirm intent.
            atexit.register(self.atexit_print_all)

    def __call__(self):
        storage_map = self.storage_map
        compute_map = self.compute_map
        thunks = self.thunks
        dependencies = self.dependencies

        # Graph inputs are already "computed"; everything else is not.
        for k in self.storage_map:
            compute_map[k][0] = (k.owner is None)

        # apply_stack contains nodes
        apply_stack = list(self.base_apply_stack)
        last_apply_stack_len = -1

        while apply_stack:
            # Make sure something happened last time round.
            # This is just a safety check to make sure the op is written
            # correctly: apply_stack should either decrease in length by one
            # (a thunk successfully applied), or increase in length (added
            # dependencies over and above the original).
            # NB: this doesn't catch cycles (would be too expensive/slow),
            # just stalls.
            apply_stack_len = len(apply_stack)
            assert apply_stack_len != last_apply_stack_len
            last_apply_stack_len = apply_stack_len

            current_apply = apply_stack.pop()

            # Use these for loops + breaks to short circuit evaluation.
            # This is a significant performance point.
            computed_ins = True
            for i in current_apply.inputs:
                if not compute_map[i][0]:
                    computed_ins = False
                    break
            computed_outs = True
            for o in current_apply.outputs:
                if not compute_map[o][0]:
                    computed_outs = False
                    break
            if computed_ins:
                # Destroy-order prerequisites count as inputs too.
                for d in current_apply.destroy_dependencies:
                    if not compute_map[d][0]:
                        computed_ins = False
                        break

            if not thunks[self.node_idx[current_apply]].lazy:
                # Check if all inputs are in place.
                # If so compute thunk and remove it from the apply_stack.
                # If not leave it in, and add to the apply_stack those that
                # will produce you those inputs.
                if computed_ins and not computed_outs:
                    try:
                        t0 = time.time()
                        thunks[self.node_idx[current_apply]]()
                        if config.profile:
                            dt = time.time() - t0
                            self.apply_time[current_apply] += dt
                            ## Computing the memory footprint of the the op
                            # ?? What about inplace .. if the op is inplace
                            # you don't actually ask for more memory!
                            size = []
                            for (idx, o) in enumerate(
                                    thunks[self.node_idx[
                                        current_apply]].outputs):
                                if not hasattr(o[0], 'size'):
                                    size.append(-1)
                                    continue
                                s = o[0].size
                                dtype = str(o[0].dtype)
                                dtype2 = dtype[-3:]
                                # BUG FIX: the map lives on self; the bare
                                # name raised NameError under profiling.
                                # KeyError here: couldn't determine the
                                # dtype memory size.
                                s *= self.memory_size_map[dtype2]
                                size.append(s)
                            self.outputs_size[current_apply] = size
                    except Exception:
                        raise_with_op(current_apply)
                    for o in current_apply.outputs:
                        compute_map[o][0] = 1
                    # Garbage Collection -> check if anybody else uses
                    # this input.
                    if self.allow_gc:
                        for i in current_apply.inputs:
                            if (dependencies[i] and i.owner
                                    and i not in self.outputs):
                                empty_storage_map = True
                                for x in dependencies[i]:
                                    if not compute_map[x][0]:
                                        empty_storage_map = False
                                        break
                                if empty_storage_map:
                                    storage_map[i][0] = None
                elif not computed_ins:
                    # Re-schedule this node after the producers of its
                    # missing inputs.
                    apply_stack.append(current_apply)
                    apply_stack.extend(inp.owner
                            for inp in current_apply.inputs if inp.owner)
                    apply_stack.extend(inp.owner
                            for inp in current_apply.destroy_dependencies
                            if inp.owner)
            elif not computed_outs:
                # Lazy thunk: try and run it to see if it works.
                try:
                    t0 = time.time()
                    requires = thunks[self.node_idx[current_apply]]()
                    dt = time.time() - t0
                    self.apply_time[current_apply] += dt
                except Exception:
                    raise_with_op(current_apply)

                if requires:
                    for r in requires:
                        # We are not done with this op, so we add it back
                        # and see to get the inputs we are missing.
                        apply_stack.append(current_apply)
                        if current_apply.inputs[r].owner:
                            apply_stack.append(
                                    current_apply.inputs[r].owner)
                else:
                    if config.profile:
                        size = []
                        for (idx, o) in enumerate(
                                thunks[self.node_idx[
                                    current_apply]].outputs):
                            if not hasattr(o[0], 'size'):
                                size.append(-1)
                                continue
                            s = o[0].size
                            dtype = str(o[0].dtype)
                            # BUG FIX: the original sliced dtype[-2:] here
                            # (vs [-3:] in the strict branch above), which
                            # can never match the 3-character keys of
                            # memory_size_map; also use self.
                            dtype2 = dtype[-3:]
                            # KeyError here: couldn't determine the dtype
                            # memory size.
                            s *= self.memory_size_map[dtype2]
                            size.append(s)
                        self.outputs_size[current_apply] = size
                    if self.allow_gc:
                        for i in current_apply.inputs:
                            if (dependencies[i] and i.owner
                                    and i not in self.outputs):
                                empty_storage_map = True
                                for x in dependencies[i]:
                                    if not compute_map[x][0]:
                                        empty_storage_map = False
                                        break
                                if empty_storage_map:
                                    storage_map[i][0] = None
# The C implementation of the lazy-evaluation loop is optional: when the
# extension cannot be compiled/imported, only the Python VMs are available.
try:
    import lazylinker_c

    class CVM(lazylinker_c.CLazyLinker, VM):
        # Runs the whole graph in C; the VM base class is mixed in only
        # for its Python-side interface (e.g. update_profile).
        def __init__(self, *args, **kwargs):
            lazylinker_c.CLazyLinker.__init__(self, *args, **kwargs)
            # skip VM.__init__
except ImportError:
    pass
class VM_Linker(link.LocalLinker):
    """
    Class that satisfies the Linker interface by acting as a VM factory.
    """

    def __init__(self, allow_gc=True, use_cloop=False):
        # allow_gc - free intermediate results as soon as they are no
        #     longer needed.
        # use_cloop - run the evaluation loop in C (CVM) when available.
        self.env = None
        self.allow_gc = allow_gc
        self.use_cloop = use_cloop

    # NOTE(review): the mutable default `no_recycling=[]` is shared across
    # calls; it is only read here, but callers should not mutate it.
    def accept(self, env, no_recycling=[]):
        """
        :param env: a PerformLinker can have accepted one Env instance
            at a time.

        :param no_recycling: WRITEME

        :returns: self (TODO: WHY? Who calls this function?)
        """
        if self.env is not None and self.env is not env:
            # This linker is already bound to another Env; delegate to a
            # fresh instance of the same class.
            return type(self)().accept(env, no_recycling)
        self.env = env
        self.no_recycling = no_recycling
        return self

    def make_vm(self, nodes, thunks,
            input_storage, output_storage, storage_map,
            post_thunk_clear,
            computed,
            compute_map):
        # Choose and build the VM implementation: CVM when use_cloop,
        # otherwise Loop/LoopGC for strict graphs or Stack when any thunk
        # is lazy.

        pre_call_clear = [storage_map[v] for v in self.no_recycling]

        if self.use_cloop:
            # The C loop works on flat integer-indexed arrays, so first
            # create a map from nodes to ints and vars to ints.
            nodes_idx = {}
            vars_idx = {}
            for i, node in enumerate(nodes):
                nodes_idx[node] = i
                for v in node.inputs + node.outputs:
                    vars_idx.setdefault(v, len(vars_idx))
            for v in self.env.inputs + self.env.outputs:
                vars_idx.setdefault(v, len(vars_idx))

            nodes_idx_inv = {}
            vars_idx_inv = {}
            for (node, i) in nodes_idx.items():
                nodes_idx_inv[i] = node
            for (var, i) in vars_idx.items():
                vars_idx_inv[i] = var

            # put storage_map and compute_map into a int-based scheme
            n_applies = len(nodes)
            storage_map_list = [storage_map[vars_idx_inv[i]]
                    for i in range(len(vars_idx_inv))]
            compute_map_list = [compute_map[vars_idx_inv[i]]
                    for i in range(len(vars_idx_inv))]
            if nodes:
                assert type(storage_map_list[0]) is list
                assert type(compute_map_list[0]) is list

            # build the pointers to node inputs and offsets
            # base_input_output_list is one flat array; each node's input
            # (resp. output) var indices start at node_input_offset[i]
            # (resp. node_output_offset[i]).
            base_input_output_list = []
            node_n_inputs = []
            node_n_outputs = []
            node_input_offset = []
            node_output_offset = []
            for node in nodes:
                inputs_idx = [vars_idx[v] for v in node.inputs]
                outputs_idx = [vars_idx[v] for v in node.outputs]
                node_n_inputs.append(len(inputs_idx))
                node_n_outputs.append(len(outputs_idx))
                node_input_offset.append(len(base_input_output_list))
                base_input_output_list.extend(inputs_idx)
                node_output_offset.append(len(base_input_output_list))
                base_input_output_list.extend(outputs_idx)

            # build the var owner array
            # var_owner[i] is the index of the node that computes var i,
            # or None for graph inputs/constants.
            var_owner = [None] * len(vars_idx)
            for (var, i) in vars_idx.items():
                if var.owner:
                    var_owner[i] = nodes_idx[var.owner]

            is_lazy_list = [int(th.lazy) for th in thunks]
            output_vars = [vars_idx[v] for v in self.env.outputs]

            # builds the list of prereqs induced by e.g. destroy_handler
            ords = self.env.orderings()
            node_prereqs = []
            node_output_size = []
            for i, node in enumerate(nodes):
                node_output_size.append(0)
                prereq_var_idxs = []
                for prereq_node in ords.get(node, []):
                    prereq_var_idxs.extend([vars_idx[v]
                            for v in prereq_node.outputs])
                prereq_var_idxs = list(set(prereq_var_idxs))
                prereq_var_idxs.sort()  # TODO: why sort?
                node_prereqs.append(prereq_var_idxs)

            # Refcount sanity check: constructing the CVM must not leak a
            # reference to the argument lists.
            c0 = sys.getrefcount(node_n_inputs)
            vm = CVM(
                    nodes,
                    thunks,
                    pre_call_clear,
                    allow_gc=self.allow_gc,
                    call_counts=[0] * len(nodes),
                    call_times=[0.0] * len(nodes),
                    compute_map_list=compute_map_list,
                    base_input_output_list=base_input_output_list,
                    node_n_inputs=node_n_inputs,
                    node_n_outputs=node_n_outputs,
                    node_input_offset=node_input_offset,
                    node_output_offset=node_output_offset,
                    var_owner=var_owner,
                    is_lazy_list=is_lazy_list,
                    output_vars=output_vars,
                    node_prereqs=node_prereqs,
                    node_output_size=node_output_size,
                    )
            assert c0 == sys.getrefcount(node_n_inputs)
        else:
            if all([(not th.lazy) for th in thunks]):
                # there is no conditional in the graph
                if self.allow_gc:
                    vm = LoopGC(
                            nodes,
                            thunks,
                            pre_call_clear,
                            post_thunk_clear)
                else:
                    vm = Loop(
                            nodes,
                            thunks,
                            pre_call_clear)
            else:
                # At least one lazy thunk: only the Stack VM can evaluate
                # conditionals lazily.
                vm = Stack(
                        nodes, thunks, pre_call_clear,
                        storage_map, compute_map,
                        self.env, self.allow_gc)
        return vm

    def make_all(self, profiler=None, input_storage=None,
            output_storage=None):
        # Build storage, thunks and a VM for self.env; returns the tuple
        # (vm, input containers, output containers, thunks, order)
        # expected by the LocalLinker interface.
        env = self.env
        order = list(env.toposort())
        no_recycling = self.no_recycling

        input_storage, output_storage, storage_map = link.map_storage(
                env, order, input_storage, output_storage)
        # compute_map[var] is a one-element list: the flag telling whether
        # var currently holds a computed value (inputs start computed).
        compute_map = {}
        for k in storage_map:
            compute_map[k] = [k.owner is None]

        thunks = [node.op.make_thunk(node, storage_map, compute_map,
                no_recycling)
                for node in order]

        computed, last_user = link.gc_helper(order)
        if self.allow_gc:
            # For each node, collect the storage cells of intermediates
            # whose last consumer is that node, so the VM can free them
            # right after the node runs.
            post_thunk_clear = []
            for node in order:
                clear_after_this_thunk = []
                for input in node.inputs:
                    if ((input in computed)
                            and (input not in env.outputs)
                            and (node == last_user[input])):
                        clear_after_this_thunk.append(storage_map[input])
                post_thunk_clear.append(clear_after_this_thunk)
        else:
            post_thunk_clear = None

        vm = self.make_vm(order, thunks,
                input_storage, output_storage, storage_map,
                post_thunk_clear,
                computed,
                compute_map)

        return (vm,
                [link.Container(input, storage) for input, storage in
                    zip(env.inputs, input_storage)],
                [link.Container(output, storage, True) for output, storage
                    in zip(env.outputs, output_storage)],
                thunks,
                order)
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论