Commit f9a67241 authored by Olivier Delalleau

Merged

 #!/usr/bin/env python
-from theano.gof import cc
 import sys
-compiledir = cc.get_compiledir()
+from theano.gof.cc import get_compiledir, get_module_cache
 if len(sys.argv) == 1:
-    print compiledir
+    print get_compiledir()
 elif sys.argv[1] in ('clear'):
-    cc.clear_compiledir()
+    get_module_cache().clear()
 else:
     print 'command "%s" not recognized' % sys.argv[1]
+    print 'Type "theano-cache" to print the cache location'
+    print 'Type "theano-cache clear" to erase the cache'
     sys.exit(1)
"""Provides `DebugMode`, an evaluation mode for debugging theano internals.""" """Provides `DebugMode`, an evaluation mode for debugging theano internals."""
__docformat__ = "restructuredtext en" __docformat__ = "restructuredtext en"
import time, copy, sys, copy_reg, gc import time, copy, sys, copy_reg, gc, os
from StringIO import StringIO from StringIO import StringIO
import numpy import numpy
...@@ -298,7 +298,7 @@ class InvalidValueError(DebugModeError): ...@@ -298,7 +298,7 @@ class InvalidValueError(DebugModeError):
def _debugprint(r, prefix='', depth=-1, done=None, file=sys.stdout): def debugprint(r, prefix='', depth=-1, done=None, file=sys.stdout):
"""Print the graph leading to `r` to given depth. """Print the graph leading to `r` to given depth.
:param r: Variable instance :param r: Variable instance
...@@ -322,7 +322,7 @@ def _debugprint(r, prefix='', depth=-1, done=None, file=sys.stdout): ...@@ -322,7 +322,7 @@ def _debugprint(r, prefix='', depth=-1, done=None, file=sys.stdout):
if id(a) not in done: if id(a) not in done:
done.add(id(a)) done.add(id(a))
for i in a.inputs: for i in a.inputs:
_debugprint(i, prefix+' ', depth=depth-1, done=done, file=file) debugprint(i, prefix+' ', depth=depth-1, done=done, file=file)
else: else:
#this is a variable #this is a variable
print >> file, prefix, r, id(r) print >> file, prefix, r, id(r)
...@@ -772,12 +772,12 @@ class _VariableEquivalenceTracker(object): ...@@ -772,12 +772,12 @@ class _VariableEquivalenceTracker(object):
append_reason = False append_reason = False
if append_reason: if append_reason:
# N.B. compute the _debugprint now, because future optimizations will change the # N.B. compute the debugprint now, because future optimizations will change the
# graph # graph
self.reasons[new_r].append((reason self.reasons[new_r].append((reason
, r , r
, _debugprint(r, prefix=' ', depth=6, file=StringIO()).getvalue() , debugprint(r, prefix=' ', depth=6, file=StringIO()).getvalue()
, _debugprint(new_r, prefix=' ', depth=6, file=StringIO()).getvalue())) , debugprint(new_r, prefix=' ', depth=6, file=StringIO()).getvalue()))
self.replaced_by[r].append((reason, new_r)) self.replaced_by[r].append((reason, new_r))
if r in self.equiv: if r in self.equiv:
...@@ -857,28 +857,9 @@ class _Linker(gof.link.LocalLinker): ...@@ -857,28 +857,9 @@ class _Linker(gof.link.LocalLinker):
raise utils.MethodNotDefined() raise utils.MethodNotDefined()
e = Env(*graph.clone(node.inputs, node.outputs)) e = Env(*graph.clone(node.inputs, node.outputs))
e.toposort = lambda: e.nodes #WARNING: STOCHASTIC ORDER e.toposort = lambda: e.nodes #WARNING: STOCHASTIC ORDER
# Specifically... e.nodes is a set, but of only 1 element
if any(isinstance(input, graph.Value) for input in node.inputs):
desc = None
else:
desc = (node.op,
tuple(input.type for input in node.inputs),
tuple(input.type for input in node.inputs),
tuple(output in no_recycling for output in node.outputs),
tuple(node.inputs.count(input) for input in node.inputs))
try:
cl = self.__cache__.get(desc)
except Exception, exc:
#print >> sys.stderr, "INFO: failed to hash %s: %s. Node will not be cached." % (node, exc)
cl = None
if cl is None:
cl = CLinker().accept(e, [r for r, r2 in zip(e.outputs, node.outputs) if r2 in no_recycling]) cl = CLinker().accept(e, [r for r, r2 in zip(e.outputs, node.outputs) if r2 in no_recycling])
if desc is not None:
try:
self.__cache__[desc] = cl
except:
pass
thunk, node_input_filters, node_output_filters = cl.make_thunk( thunk, node_input_filters, node_output_filters = cl.make_thunk(
input_storage = node_input_storage, input_storage = node_input_storage,
...@@ -1371,27 +1352,27 @@ class DebugMode(Mode): ...@@ -1371,27 +1352,27 @@ class DebugMode(Mode):
""" """
stability_patience = 10 stability_patience = int(os.getenv('THEANO_DEBUGMODE_PATIENCE', 10))
""" """
When checking for the stability of optimization, recompile the graph this many times. When checking for the stability of optimization, recompile the graph this many times.
""" """
check_c_code = True check_c_code = bool(int(os.getenv('THEANO_DEBUGMODE_CHECK_C', 1)))
""" """
Should we evaluate (and check) the `c_code` implementations? Should we evaluate (and check) the `c_code` implementations?
""" """
check_py_code = True check_py_code = bool(int(os.getenv('THEANO_DEBUGMODE_CHECK_PY', 1)))
""" """
Should we evaluate (and check) the `perform` implementations? Should we evaluate (and check) the `perform` implementations?
""" """
check_isfinite = True check_isfinite = bool(int(os.getenv('THEANO_DEBUGMODE_CHECK_FINITE', 1)))
""" """
Should we check for (and complain about) NaN/Inf ndarray elements? Should we check for (and complain about) NaN/Inf ndarray elements?
""" """
require_matching_strides = False require_matching_strides = bool(int(os.getenv('THEANO_DEBUGMODE_CHECK_STRIDES', 0)))
""" """
Should we check for (and complain about) Ops whose python and C outputs are ndarrays with Should we check for (and complain about) Ops whose python and C outputs are ndarrays with
different strides? (This can catch bugs, but is generally overly strict.) different strides? (This can catch bugs, but is generally overly strict.)
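These class attributes are evaluated when the module is imported, so the environment variables must be set before theano is loaded. A minimal sketch of driving them from user code (assuming the import path of this file, theano.compile.debugmode):

    import os
    # set these before importing theano: DebugMode reads them at class-definition time
    os.environ['THEANO_DEBUGMODE_PATIENCE'] = '3'       # recompile only 3 times
    os.environ['THEANO_DEBUGMODE_CHECK_STRIDES'] = '1'  # opt in to the strict stride check

    import theano
    from theano.compile.debugmode import DebugMode
    print DebugMode.stability_patience   # -> 3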
......
@@ -102,7 +102,7 @@ class ProfileMode(Mode):
     def print_diff_summary(self, other, n_apply_to_print=15, n_ops_to_print=20):
-        """ As print_summary, but print the absolute difference on two different profile mode.
+        """ As print_summary, but print the difference on two different profile mode.
        TODO: Also we don't print the Apply-wise summary as it don't work for now.
        TODO: make comparaison with gpu code.
@@ -119,7 +119,7 @@ class ProfileMode(Mode):
             for a,ta in a_time.items():
                 r.setdefault(a,0)
                 tb = b_time.pop(a,0)
-                r[a]+=abs(ta-tb)
+                r[a]+=ta-tb
             #they are missing in a
             for a,t in b_time.items():
@@ -133,7 +133,7 @@ class ProfileMode(Mode):
             for a,ta in a_time.items():
                 tb = b_time.pop(a,0)
                 if hasattr(a,'flops'):
-                    flops[a]=abs(a.flops*a_call[a]/ta - a.flops*b_call[a]/tb)/1e6
+                    flops[a]=a.flops*a_call[a]/ta - a.flops*b_call[a]/tb/1e6
             #they are missing in a
             for b,tb in b_time.items():
@@ -142,8 +142,8 @@ class ProfileMode(Mode):
             return flops
-        local_time = abs(self.local_time[0]-other.local_time[0])
-        compile_time = abs(self.compile_time-other.compile_time)
+        local_time = self.local_time[0]-other.local_time[0]
+        compile_time = self.compile_time-other.compile_time
         apply_time = diff_dict(self.apply_time, other.apply_time)
         apply_call = diff_dict(self.apply_call, other.apply_call)
         op_time = diff_dict(self.op_time, other.op_time)
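A hedged usage sketch of the signed comparison (the ProfileMode import path and the function-building API here are assumptions, not taken from this diff): after timing two compiled functions, a negative entry now means `self` spent less time than `other`, where abs() used to hide the sign.

    import numpy
    import theano
    import theano.tensor as T
    from theano.compile.profilemode import ProfileMode  # assumed import path

    x = T.dvector()
    mode_a, mode_b = ProfileMode(), ProfileMode()
    fa = theano.function([x], 2 * x, mode=mode_a)
    fb = theano.function([x], 2 * x + 1, mode=mode_b)
    data = numpy.arange(100000.)
    fa(data); fb(data)
    mode_a.print_diff_summary(mode_b)  # signed differences; negative means self was faster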
......
-from .sharedvalue import shared
+from .sharedvalue import shared, shared_constructor
 from .pfunc import pfunc
@@ -36,16 +36,12 @@ import cmodule
 import logging
 _logger=logging.getLogger("theano.gof.cc")
 def info(*args):
-    #sys.stderr.write('INFO:'+ ' '.join(str(a) for a in args)+'\n')
     _logger.info(' '.join(str(a) for a in args))
 def debug(*args):
-    #sys.stderr.write('DEBUG:'+ ' '.join(str(a) for a in args)+'\n')
     _logger.debug(' '.join(str(a) for a in args))
 def warning(*args):
-    sys.stderr.write('WARNING:'+ ' '.join(str(a) for a in args)+'\n')
     _logger.warning(' '.join(str(a) for a in args))
 def error(*args):
-    sys.stderr.write('ERROR:'+ ' '.join(str(a) for a in args)+'\n')
     _logger.error(' '.join(str(a) for a in args))
 from theano.gof.callcache import CallCache
@@ -526,6 +522,8 @@ class CLinker(link.Linker):
         # List of arg names for use in struct_gen. Note the call to uniq: duplicate inputs
         # must only be passed once because they are mapped to the same name.
+        # Duplicates are defined by (a is b), rather than (a==b) since Constant instances can
+        # compare equal to equivalent Constant instances.
         args = []
         args += ["storage_%s" % symbol[variable] for variable in utils.uniq(self.inputs + self.outputs + self.orphans)]
@@ -783,15 +781,21 @@ class CLinker(link.Linker):
        ``a`` is the topological position of the input's owner (-1 for graph inputs),
        ``b`` is the index of the variable in the owner's output list.
-        The graph position of a Constant is defined as its signature.
+        The graph position of a Constant instance is defined as its signature, together with
+        two integers: the topological position of the first Apply using that Constant instance,
+        and the lowest index into that Apply's inputs that refers to that Constant. (These two
+        integers are a surrogate for the id() of the Constant. The integers are important
+        because merge-able constants have the same signature, but require separate containers
+        in C code.)
        If the Op of any Apply in the Env does not have c_code_cache_ok()==True, then this
        function raises a KeyError exception.
        """
         order = list(self.env.toposort())
-        env_inputs_set = dict((i, (-1, pos)) for pos, i in enumerate(self.env.inputs))
+        env_inputs_dict = dict((i, (-1, pos)) for pos, i in enumerate(self.env.inputs))
         env_computed_set = set()
+        constant_ids = dict()
         op_pos = {} # Apply -> topological position
         rval = ['CLinker.cmodule_key'] # will be cast to tuple on return
         rval.append(tuple(self.compile_args()))
@@ -802,11 +806,15 @@ class CLinker(link.Linker):
        # - an env input
        # - an output from a node in the Env
        # - a Constant
-        def graphpos(i):
+        def graphpos(i, topological_pos, i_idx):
             if isinstance(i, graph.Constant):
-                return i.signature()
+                if id(i) not in constant_ids:
+                    constant_ids[id(i)] = (i.signature(), topological_pos, i_idx)
+                return constant_ids[id(i)]
+                #print 'SIGNATURE', i.signature()
+                #return i.signature()
-            elif i in env_inputs_set:
-                return env_inputs_set[i]
+            elif i in env_inputs_dict:
+                return env_inputs_dict[i]
             else:
                 if i.owner is None:
                     assert all( all(out is not None for out in o.outputs) for o in order)
@@ -814,15 +822,16 @@ class CLinker(link.Linker):
                     raise Exception('what is this?', (i, type(i), i.clients, self.env))
                 return (op_pos[i.owner], i.owner.outputs.index(i))
-        for opos, o in enumerate(order):
-            version.append(o.op.c_code_cache_version_apply(o))
-            for i in o.inputs:
-                version.append(i.type.c_code_cache_version())
-            for i in o.outputs:
-                version.append(i.type.c_code_cache_version())
-            rval.append((o.op, tuple((i.type, graphpos(i)) for i in o.inputs)))
-            op_pos[o] = opos
-            env_computed_set.update(o.outputs)
+        for node_pos, node in enumerate(order):
+            version.append(node.op.c_code_cache_version_apply(node))
+            for i in node.inputs:
+                version.append(i.type.c_code_cache_version())
+            for o in node.outputs:
+                version.append(o.type.c_code_cache_version())
+            rval.append((node.op, tuple((i.type, graphpos(i, node_pos, ipos))
+                                        for ipos,i in enumerate(node.inputs))))
+            op_pos[node] = node_pos
+            env_computed_set.update(node.outputs)
         for v in version:
             if not v: #one of the ops or types here is unversioned
@@ -855,12 +864,12 @@ class CLinker(link.Linker):
     def build_dynamic_module(self):
-        """Generate the code for this module, compile it, return the imported dynamic module.
+        """Return a cmodule.DynamicModule instance full of the code for our env.
        """
         self.code_gen()
         module_name = self.hash
-        cthunk = object() # dummy so weave can get the type
+        cthunk = object() # dummy so weave can get the type ##TODO: REMOVE ME
         mod = cmodule.DynamicModule(module_name)
        # The code of instantiate
@@ -998,6 +1007,11 @@ class OpWiseCLinker(link.LocalLinker):
        no_recycling can contain a list of Variables that belong to the env.
        If a Variable is in no_recycling, CLinker will clear the output storage
        associated to it prior to computation (to avoid reusing it).
+        :note: This is in a sense the 'default' linker for Theano. The overhead of using the
+        OpWiseCLinker as compared with the CLinker is only noticeable for graphs of very small
+        tensors (such as 20 elements or less)
        """
     __cache__ = {}
@@ -1044,6 +1058,8 @@ class OpWiseCLinker(link.LocalLinker):
             try:
                 e = Env(*graph.clone(node.inputs, node.outputs))
+                # TODO: 20090926 Replace this code with the cl = CLinker().... line. Trust
+                # ModuleCache for cache mechanism.
                 if any(isinstance(input, graph.Value) for input in node.inputs):
                     desc = None
                 else:
......
@@ -10,16 +10,12 @@ _logger=logging.getLogger("theano.gof.cmodule")
 _logger.setLevel(logging.WARN)
 def error(*args):
-    #sys.stderr.write('ERROR:'+ ' '.join(str(a) for a in args)+'\n')
     _logger.error("ERROR: "+' '.join(str(a) for a in args))
 def warning(*args):
-    #sys.stderr.write('WARNING:'+ ' '.join(str(a) for a in args)+'\n')
     _logger.warning("WARNING: "+' '.join(str(a) for a in args))
 def info(*args):
-    #sys.stderr.write('INFO:'+ ' '.join(str(a) for a in args)+'\n')
     _logger.info("INFO: "+' '.join(str(a) for a in args))
 def debug(*args):
-    #sys.stderr.write('DEBUG:'+ ' '.join(str(a) for a in args)+'\n')
     _logger.debug("DEBUG: "+' '.join(str(a) for a in args))
 METH_VARARGS="METH_VARARGS"
@@ -470,7 +466,8 @@ class ModuleCache(object):
        """
        Clear all the elements of the cache
        """
-        return self.clear_old(-1.0)
+        self.clear_old(-1.0)
+        self.clear_unversioned()
     def clear_unversioned(self):
        """Delete unversioned dynamic modules from the internal dictionaries and from the
@@ -493,6 +490,17 @@ class ModuleCache(object):
                 info("clear_unversioned removing cache dir", parent)
                 _rmtree(parent)
+        for filename in os.listdir(self.dirname):
+            if filename.startswith('tmp'):
+                try:
+                    open(os.path.join(self.dirname, filename, 'key.pkl')).close()
+                    has_key = True
+                except IOError:
+                    has_key = False
+                if not has_key:
+                    info("clear_unversioned removing cache dir", filename)
+                    _rmtree(os.path.join(self.dirname, filename))
     def _on_atexit(self):
         self.refresh()
         self.clear_old()
@@ -514,6 +522,8 @@ def get_module_cache(dirname, force_fresh=None):
     if _module_cache is None:
         _module_cache = ModuleCache(dirname, force_fresh=force_fresh)
         atexit.register(_module_cache._on_atexit)
+    if _module_cache.dirname != dirname:
+        warning("Returning module cache instance with different dirname than you requested")
     return _module_cache
 def get_lib_extension():
......
@@ -187,28 +187,28 @@ class Scalar(Type):
        };
        """
        operator_eq = """
-        template <> %(mytype)s & %(mytype)s::operator =(const npy_int8 & y)
+        template <> %(mytype)s & %(mytype)s::operator=<npy_int8>(const npy_int8 & y)
        { this->real=y; this->imag=0; return *this; }
-        template <> %(mytype)s & %(mytype)s::operator =(const npy_int16 & y)
+        template <> %(mytype)s & %(mytype)s::operator=<npy_int16>(const npy_int16 & y)
        { this->real=y; this->imag=0; return *this; }
-        template <> %(mytype)s & %(mytype)s::operator =(const npy_int32 & y)
+        template <> %(mytype)s & %(mytype)s::operator=<npy_int32>(const npy_int32 & y)
        { this->real=y; this->imag=0; return *this; }
-        template <> %(mytype)s & %(mytype)s::operator =(const npy_int64 & y)
+        template <> %(mytype)s & %(mytype)s::operator=<npy_int64>(const npy_int64 & y)
        { this->real=y; this->imag=0; return *this; }
-        template <> %(mytype)s & %(mytype)s::operator =(const npy_float32 & y)
+        template <> %(mytype)s & %(mytype)s::operator=<npy_float32>(const npy_float32 & y)
        { this->real=y; this->imag=0; return *this; }
-        template <> %(mytype)s & %(mytype)s::operator =(const npy_float64 & y)
+        template <> %(mytype)s & %(mytype)s::operator=<npy_float64>(const npy_float64 & y)
        { this->real=y; this->imag=0; return *this; }
-        template <> %(mytype)s & %(mytype)s::operator =(const theano_complex128 & y)
+        template <> %(mytype)s & %(mytype)s::operator=<theano_complex128>(const theano_complex128 & y)
        { this->real=y.real; this->imag=y.imag; return *this; }
-        template <> %(mytype)s & %(mytype)s::operator =(const theano_complex64 & y)
+        template <> %(mytype)s & %(mytype)s::operator=<theano_complex64>(const theano_complex64 & y)
        { this->real=y.real; this->imag=y.imag; return *this; }
        """
@@ -219,7 +219,8 @@ class Scalar(Type):
            + operator_eq % dict(mytype='theano_complex64')
     def c_code_cache_version(self):
-        return (2,)
+        return (3,) #explicit T given in specialization of operator= lines. This makes it compile with open64
+        #2,
 int8 = Scalar('int8')
@@ -666,7 +667,7 @@ class Mul(ScalarOp):
         retval = []
         for input in inputs:
             if input.type in grad_types:
-                retval += [mul(*([gz] + utils.difference(inputs, [input])))]
+                retval += [cast(mul(*([gz] + utils.difference(inputs, [input]))), input.type.dtype)]
             else:
                 retval += [None]
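The added cast matters because elementwise products follow numpy's upcasting rules, so a float64 gz would silently promote the gradient of a float32 input. A quick numpy illustration:

    import numpy
    x32 = numpy.ones(3, dtype='float32')
    gz64 = numpy.ones(3, dtype='float64')
    print (x32 * gz64).dtype                    # float64: the product upcasts
    print (x32 * gz64).astype('float32').dtype  # float32: what the explicit cast restores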
@@ -849,6 +850,8 @@ class Cast(UnaryScalarOp):
         super(Cast, self).__init__(specific_out(o_type), name=name)
         self.o_type = o_type
         self.ctor = getattr(numpy, o_type.dtype)
+    def __str__(self):
+        return '%s{%s}' % (self.__class__.__name__, self.o_type.dtype)
     def impl(self, input):
         return self.ctor(input)
     def c_code(self, node, name, (x, ), (z, ), sub):
......
@@ -4,6 +4,7 @@ __docformat__ = "restructuredtext en"
 import __builtin__
 import sys # for sys.maxint
+import os # for getenv THEANO_CMP_SLOPPY
 import traceback #for overriding Op.__call__
 if sys.version_info >= (2,5):
     import functools
@@ -199,20 +200,33 @@ def _wrap_tensor_into_member(x):
     return compile.module.Member(constant(x))
 compile.module.register_wrapper(_obj_is_wrappable_as_tensor, _wrap_tensor_into_member)
-#If you change those values in a test, don't forget to put them back when the test ends.
-#Don't forget the case when the test fails.
-float_atol = 1e-5
-float_rtol = 1e-3 # Sensible??
+if int(os.getenv('THEANO_CMP_SLOPPY', 0)):
+    # This environment variable is a quick-and-dirty way to get low-precision comparisons.
+    # For a more precise setting of these tolerances set them explicitly in your user code by
+    # assigning, for example, "theano.tensor.basic.float32_atol = ..."
+    float32_atol = 1e-4
+    float32_rtol = 1e-3
+    float64_rtol = 1e-4
+    float64_atol = 1e-3
+else:
+    #If you change those values in a test, don't forget to put them back when the test ends.
+    #Don't forget the case when the test fails.
+    float32_atol = 1e-5
+    float32_rtol = 1e-3
+    # defaults in numpy.allclose
+    float64_rtol = 1.0000000000000001e-05
+    float64_atol = 1e-8
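For reference, numpy.allclose passes when |a - b| <= atol + rtol * |b|, so the looser float32 settings accept differences that the float64 defaults reject:

    import numpy
    a, b = 1.0, 1.00002                               # differ by 2e-5
    print numpy.allclose(a, b)                        # False with the float64 defaults
    print numpy.allclose(a, b, atol=1e-5, rtol=1e-3)  # True with the float32 tolerances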
 def _allclose(a, b):
     narrow = 'float32', 'complex64'
     if (str(a.dtype) in narrow) or (str(b.dtype) in narrow):
-        atol = float_atol
-        rtol = float_rtol
-        return numpy.allclose(a,b, atol=atol, rtol=rtol)
+        atol = float32_atol
+        rtol = float32_rtol
     else:
-        # keep defaults of in numpy.allclose
-        return numpy.allclose(a,b)
+        atol = float64_atol
+        rtol = float64_rtol
+    return numpy.allclose(a,b, atol=atol, rtol=rtol)
 class TensorType(Type):
    """Symbolic `Type` representing a numpy.ndarray value."""
@@ -756,13 +770,29 @@ class TensorVariable(Variable, _tensor_py_operators):
    """Subclass to add the tensor operators to the basic `Variable` class."""
 class TensorConstantSignature(tuple):
+    """A Signature object for comparing TensorConstant instances.
+
+    An instance is a pair: (Type instance, ndarray).
+    """
     def __eq__(self, other):
-        (a, b), (x,y) = self, other
+        try:
+            (t0, d0), (t1,d1) = self, other
+        except:
+            return False
         #N.B. compare shape to ensure no broadcasting in ==
-        return (x == a) and (b.shape == y.shape) and (numpy.all(b == y))
+        return (t0 == t1) and (d0.shape == d1.shape) \
+                and (self.sum == other.sum) and (numpy.all(d0 == d1))
     def __hash__(self):
-        a, b = self
-        return hashtype(self) ^ hash(a) ^ hash(b.shape)
+        t, d = self
+        return hashtype(self) ^ hash(t) ^ hash(d.shape) ^ hash(self.sum)
+    def _get_sum(self):
+        try:
+            return self._sum
+        except:
+            self._sum = self[1].sum()
+            return self._sum
+    sum = property(_get_sum)
 class TensorConstant(Constant, _tensor_py_operators):
    """Subclass to add the tensor operators to the basic `Constant` class.
@@ -943,6 +973,9 @@ _cast_mapping = {'int8': _convert_to_int8,
 @constructor
 def cast(x, dtype):
    """Symbolically cast `x` to a Tensor of type `dtype`."""
+    _x = as_tensor_variable(x)
+    if _x.type.dtype == dtype:
+        return _x
     if x.type.dtype.startswith('complex') and not dtype.startswith('complex'):
         raise TypeError('Casting from complex to real is ambiguous: consider real(), imag(), angle() or abs()')
     return _cast_mapping[dtype](x)
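A small sketch of the new short-circuit (assuming the usual theano.tensor entry point): casting a variable to its own dtype now returns the variable itself instead of inserting a Cast node.

    import theano.tensor as T
    x = T.fmatrix()
    assert T.cast(x, 'float32') is x   # no-op cast: the same variable comes back
    y = T.cast(x, 'float64')           # a genuine cast still builds a new variable
    assert y.type.dtype == 'float64'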
@@ -1417,15 +1450,19 @@ def mean(input, axis = None):
     if str(input.dtype).startswith('int'):
         # we need to cast eventually anyway, and this helps
         # to prevent overflow
-        input = convert_to_float64(input)
+        input = cast(input, 'float64')
     s = sum(input, axis)
     shp = shape(input)
+    if input.dtype == 'float32':
+        shp = cast(shp, 'float32')
     if axis is None:
         axis = range(input.type.ndim)
     elif isinstance(axis, int):
         axis = [axis]
     for i in axis:
         s = s / shp[i]
+    if input.dtype.startswith('float'):
+        assert input.dtype == s.dtype
     return s
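Why the upcast helps: summing in a small integer dtype can wrap around. A numpy check (forcing an int8 accumulator, since numpy's default sum would already widen):

    import numpy
    a = numpy.asarray([100, 100], dtype='int8')
    print a.sum(dtype='int8')          # -56: the int8 accumulator wraps around
    print a.astype('float64').mean()   # 100.0: what mean() computes after the cast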
 @constructor
@@ -1587,6 +1624,12 @@ class Subtensor(Op):
    inputs array is the tensor x, followed by scalar integer variables.
    @todo: add support for advanced tensor indexing (in Subtensor_dx too).
+    The idx_list is a tuple similar in structure to the sort of key you might expect in numpy's
+    basic indexing mode. It has one element for each explicitly named dimension. In numpy, the
+    elements can be either integers or slices containing integers and None. In Subtensor, each
+    element can additionally be a Scalar instance, and slice components can also be Scalar
+    instances too.
    """
     e_invalid = 'The index list is longer than the number of dimensions of the tensor.'
     e_subslice = 'nested slicing is not supported'
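For reference, the numpy-style key that idx_list mirrors looks like this (plain numpy, no Theano):

    import numpy
    x = numpy.arange(24).reshape(2, 3, 4)
    key = (0, slice(1, 3, None), slice(None, None, None))  # one entry per named dimension
    assert (x[key] == x[0, 1:3, :]).all()
    print x[key].shape   # (2, 4)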
@@ -1707,7 +1750,7 @@ class Subtensor(Op):
     def grad(self, inputs, (gz,)):
         x = inputs[0]
         rest = inputs[1:]
-        return [SetSubtensor(self.idx_list)(zeros_like(x), gz, *rest)] + [None] * len(rest)
+        return [IncSubtensor(self.idx_list)(zeros_like(x), gz, *rest)] + [None] * len(rest)
     def __eq__(self, other):
         return type(self) == type(other) and self.idx_list == other.idx_list
@@ -1794,13 +1837,14 @@ pprint.assign(lambda pstate, r: r.owner and isinstance(r.owner.op, Subtensor), S
-class SetSubtensor(Op):
-    """Set just some elements of a larger TensorType.
+class IncSubtensor(Op):
+    """Increment a subtensor.
    This is like numpy's
-        z[i,j,k] = <something>
+        z[i,j,k] += <something>
+    It is used internally to implement the gradient on SubTensor.
    """
     def __init__(self, idx_list, inplace=False):
@@ -1858,7 +1902,7 @@ class SetSubtensor(Op):
         broadcastable = [bc for p, bc in zip(padded, x.type.broadcastable) if isinstance(p, slice)]
         if y.type.broadcastable != tuple(broadcastable):
-            raise TypeError("Invalid broadcastable pattern for y in SetSubtensor.make_node")
+            raise TypeError("Invalid broadcastable pattern for y in IncSubtensor.make_node")
         input_types = Subtensor.collapse(idx_list, lambda entry: isinstance(entry, gof.Type))
         if len(inputs) != len(input_types):
@@ -1890,7 +1934,13 @@ class SetSubtensor(Op):
             cdata = cdata[0]
         if not self.inplace:
             x = x.copy()
-        x.__setitem__(cdata, y)
+        sub_x = x.__getitem__(cdata)
+        if sub_x.shape:
+            # we've sliced out an N-D tensor with N > 0
+            sub_x += y
+        else:
+            # scalar case
+            x.__setitem__(cdata, sub_x + y)
         out[0] = x
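The two branches mirror numpy's own behavior: a non-empty basic slice is a view that can be incremented in place, while a 0-d lookup returns a scalar copy that has to be written back.

    import numpy
    x = numpy.zeros((3, 3))
    sub = x[1:3, 0:2]         # non-empty slice: a view into x
    sub += 1.0                # incrementing the view updates x itself
    x[0, 0] = x[0, 0] + 5.0   # 0-d case: x[0, 0] is a copy, so assign the sum back
    print x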
 def split(x, splits_size, n_splits, axis=0):
@@ -2543,12 +2593,15 @@ class Dot(Op):
     def grad(self, (x, y), (gz,)):
         if gz.type.ndim == 0:
-            return gz * y, gz * x
-        if x.type.ndim == 1 and y.type.ndim > 1:
-            return dot(gz, y.T), outer(x.T, gz)
-        if x.type.ndim > 1 and y.type.ndim == 1:
-            return outer(gz, y.T), dot(x.T, gz)
-        return dot(gz, y.T), dot(x.T, gz)
+            rval = gz * y, gz * x
+        elif x.type.ndim == 1 and y.type.ndim > 1:
+            rval = dot(gz, y.T), outer(x.T, gz)
+        elif x.type.ndim > 1 and y.type.ndim == 1:
+            rval = outer(gz, y.T), dot(x.T, gz)
+        else:
+            rval = dot(gz, y.T), dot(x.T, gz)
+        return cast(rval[0], x.dtype), cast(rval[1], y.dtype)
     def __str__(self):
         return "dot"
 dot = Dot()
......
"""Ops and optimizations for using BLAS function calls to evaluate linear algebra expressions""" """Ops and optimizations for using BLAS function calls to evaluate linear algebra expressions"""
import os, sys, traceback import os, sys, traceback, logging
import numpy import numpy
from theano.gof import (utils, Op, Apply, view_roots, PatternSub, DestroyHandler, from theano.gof import (utils, Op, Apply, view_roots, PatternSub, DestroyHandler,
...@@ -17,6 +17,13 @@ from theano import compile #to register the optimizer built by this file ...@@ -17,6 +17,13 @@ from theano import compile #to register the optimizer built by this file
from theano.tensor.blas_headers import cblas_header_text, blas_header_text from theano.tensor.blas_headers import cblas_header_text, blas_header_text
_logger = logging.getLogger('theano.tensor.blas')
def debug(*msg): _logger.debug(' '.join(str(m) for m in msg))
def info(*msg): _logger.info(' '.join(str(m) for m in msg))
def warn(*msg): _logger.warn(' '.join(str(m) for m in msg))
def warning(*msg): _logger.warning(' '.join(str(m) for m in msg))
def error(*msg): _logger.error(' '.join(str(m) for m in msg))
@utils.memoize @utils.memoize
def ldflags(libs=True, flags=False): def ldflags(libs=True, flags=False):
"""Return a list of libraries against which an Op's object file should be """Return a list of libraries against which an Op's object file should be
...@@ -655,6 +662,8 @@ def local_dot_to_dot22(node): ...@@ -655,6 +662,8 @@ def local_dot_to_dot22(node):
x,y = node.inputs x,y = node.inputs
if _is_real_matrix(x) and y.type == x.type: if _is_real_matrix(x) and y.type == x.type:
return [_dot22(*node.inputs)] return [_dot22(*node.inputs)]
else:
info('Not optimizing dot with inputs', x, y)
else: else:
return False return False
register_specialize(local_dot_to_dot22) register_specialize(local_dot_to_dot22)
......
@@ -16,7 +16,6 @@ import itertools
 import sys
 from theano import compile #to register the optimizer built by this file
-from theano.compile.debugmode import _debugprint
 from theano.gof.python25 import any, all
 # Utilities
@@ -292,13 +291,73 @@ def local_subtensor_make_vector(node):
 register_canonicalize(local_subtensor_make_vector)
+@register_canonicalize
+@gof.local_optimizer([None])
+def local_IncSubtensor_serialize(node):
+    """
+    When using Subtensor, gradient graphs can be ugly.
+
+    If we ask for grad(f(a[0]), a), we are going to get something like
+
+        IncSubtensor(Elemwise{second}(a, 0), g(f(a[0])), [0])
+
+    This might be ugly, but at least it's as fast as you could want. If we ask for
+    grad(f(a[0], a[1], a[2]), a), it's much worse...
+
+        Elemwise{Add}
+            IncSubtensor(Elemwise{second}(a, 0), g(f(a[0])), [0])
+            IncSubtensor(Elemwise{second}(a, 0), g(f(a[1])), [1])
+            IncSubtensor(Elemwise{second}(a, 0), g(f(a[2])), [2])
+
+    This is much worse because this time we have to produce 3 matrices the size of 'a', just so
+    we can add them together.
+
+    This optimization rearranges IncSubtensors that all work on the same initial argument
+    (here, Elemwise{second}(a,0)) into a chain. The advantage of the chain structure is that
+    each one can be optimized later in the pipeline to operate inplace.
+
+    Ideally, the op will do something like this:
+
+    # add(x, incsubtensor(b, c), incsubtensor(b, d))
+    # -> incsubtensor(incsubtensor(add(x,b), c), d)
+    """
+    def movable(i):
+        # Return True iff this is an incsubtensor that we can move
+        return i.owner \
+                and isinstance(i.owner.op, T.IncSubtensor) \
+                and i.type == o_type \
+                and len(i.clients) == 1
+    if node.op == T.add:
+        o_type = node.outputs[0].type
+        movable_inputs = [i for i in node.inputs if movable(i)]
+        if movable_inputs:
+            new_inputs = [i for i in node.inputs if not movable(i)] \
+                    + [mi.owner.inputs[0] for mi in movable_inputs]
+            new_add = T.add(*new_inputs)
+            # stack up the new incsubtensors
+            tip = new_add
+            for mi in movable_inputs:
+                assert tip.type == o_type
+                assert tip.type == mi.owner.inputs[0].type
+                tip = mi.owner.op(tip, *mi.owner.inputs[1:])
+            return [tip]
+    #print incsub_inputs, [id(i.owner.inputs[0]) for i in incsub_inputs]
 #after priority 50 Destructive inplace operations
 #gemm is the first one now, at priority 70
 @gof.local_optimizer([None])
 def local_inplace_setsubtensor(node):
-    if isinstance(node.op, T.SetSubtensor) and not node.op.inplace:
-        new_op = T.SetSubtensor(node.op.idx_list, inplace=True)
+    if isinstance(node.op, T.IncSubtensor) and not node.op.inplace:
+        new_op = T.IncSubtensor(node.op.idx_list, inplace=True)
         new_node = new_op(*node.inputs)
         return [new_node]
     return False
......
@@ -852,7 +852,10 @@ class T_subtensor(unittest.TestCase):
         n = as_tensor_variable(data)
         t = n[1,0]
         gn = grad(sum(exp(t)), n)
-        gval = eval_outputs([gn])
+        f = function([], gn, mode=None)
+        print 'toposort', f.maker.env.toposort()
+        gval = f()
+        print gval
         good = numpy.zeros_like(data)
         good[1,0] = numpy.exp(data[1,0])
         self.failUnless(numpy.all(gval == good), (gval, good))
......