提交 3f650984 authored 作者: gdesjardins's avatar gdesjardins

merge (ps: I hate mercurial)

上级 f49e38f7
...@@ -200,3 +200,9 @@ AddConfigVar('warn.sum_sum_bug', ...@@ -200,3 +200,9 @@ AddConfigVar('warn.sum_sum_bug',
AddConfigVar('warn.sum_div_dimshuffle_bug', AddConfigVar('warn.sum_div_dimshuffle_bug',
"Warn if previous versions of Theano (between rev. 3bd9b789f5e8, 2010-06-16, and cfc6322e5ad4, 2010-08-03) would have given incorrect result. This bug was triggered by sum of division of dimshuffled tensors.", "Warn if previous versions of Theano (between rev. 3bd9b789f5e8, 2010-06-16, and cfc6322e5ad4, 2010-08-03) would have given incorrect result. This bug was triggered by sum of division of dimshuffled tensors.",
BoolParam(default_0_3)) BoolParam(default_0_3))
AddConfigVar('compute_test_value',
"If True, Theano will run each op at graph build time, using Constants, SharedVariables and the tag 'test_value' as inputs to the function. This helps the user track down problems in the graph before it gets optimized.",
EnumStr(True, False, 'warn', 'err'))
"""Apply subclass for use with Tensors that implement shape propagation via variable.tag.shape.

This is not currently used very much.  It appears in some cases, but I am not sure whether it works, or whether it is used by default.
It could help the current system detect problems earlier, when constructing the graph instead of during optimization.
"""
import sys
from theano import gof
def ishape(v):
    """Return a pair ``(has_shape, shape)`` for variable `v`.

    If ``v.tag.shape`` exists, return ``(True, v.tag.shape)``.  Otherwise
    return ``(False, (None,) * v.type.ndim)`` — one unknown dimension per
    dimension of the variable's type.
    """
    try:
        shp = v.tag.shape
    except AttributeError:
        # No shape info attached to this variable's tag.
        return (False, (None,) * v.type.ndim)
    return (True, shp)
class Apply(gof.Apply):
    """An Apply node that propagates shape information at graph-build time.

    If any input variable carries shape information in ``tag.shape`` and the
    Op implements ``infer_shape``, the inferred output shapes are stored on
    each output variable's ``tag.shape``.
    """

    def __init__(self, op, inputs, outputs):
        super(Apply, self).__init__(op, inputs, outputs)
        if not inputs:
            return
        # if any input has any shape info, then propagate it
        try:
            provided, ishapes = zip(*[ishape(i) for i in inputs])
        except AttributeError:
            # i.type.ndim didn't make sense for some i
            return
        # BUG FIX: `provided` is a tuple (from zip), so the original
        # comparison `provided == [False for i in inputs]` compared a tuple
        # to a list and was always False; the early exit below never fired.
        if not any(provided):
            # no input had a tag.shape
            return
        try:
            infer_shape = op.infer_shape
        except AttributeError:
            # op has no infer_shape, that's fine
            return
        try:
            oshapes = infer_shape(self, ishapes)
        except NotImplementedError:
            return
        for o, oshp in zip(outputs, oshapes):
            o.tag.shape = oshp
...@@ -8,9 +8,10 @@ compatible with `gof`'s :doc:`graph` routines. ...@@ -8,9 +8,10 @@ compatible with `gof`'s :doc:`graph` routines.
__docformat__ = "restructuredtext en" __docformat__ = "restructuredtext en"
from .. import config
import graph
import numpy
import utils import utils
from theano import config
class CLinkerObject(object): class CLinkerObject(object):
...@@ -322,6 +323,46 @@ class PureOp(object): ...@@ -322,6 +323,46 @@ class PureOp(object):
""" """
node = self.make_node(*inputs, **kwargs) node = self.make_node(*inputs, **kwargs)
self.add_tag_trace(node) self.add_tag_trace(node)
if config.compute_test_value:
# avoid circular import
from ..compile.sharedvalue import SharedVariable
run_perform = True
# build test input-values
input_vals = []
for ins in inputs:
if isinstance(ins, graph.Constant):
input_vals.append(ins.value)
elif isinstance(ins,numpy.ndarray):
input_vals.append(ins)
elif isinstance(ins,SharedVariable):
input_vals.append(ins.get_value(borrow=True))
elif isinstance(ins,graph.Variable) and hasattr(ins.tag, 'test_value'):
input_vals.append(ins.tag.test_value)
else:
# no test-value was specified, act accordingly
if config.compute_test_value == 'warn':
raise Warning('Cannot compute test value: input %s of Op %s missing default value')
run_perform = False
elif config.compute_test_value == 'err':
raise ValueError('Cannot compute test value: input %s of Op %s missing default value')
else:
# silently skip test
run_perform = False
# if all inputs have test-values, run the actual op
if run_perform:
# compute output value once with test inputs to validate graph
output_storage = [[None] * len(node.outputs)]
node.op.perform(node, input_vals, output_storage)
# add 'test_value' to output tags, so that downstream ops can use these
# numerical values as inputs to their perform method.
for (outval, node_output) in zip(output_storage, node.outputs):
node_output.tag.test_value = outval[0]
if self.default_output is not None: if self.default_output is not None:
return node.outputs[self.default_output] return node.outputs[self.default_output]
else: else:
......
import numpy
import unittest
import theano
from theano import tensor as T
class TestComputeTestValue(unittest.TestCase):
    """Tests for the ``compute_test_value`` configuration flag.

    Each test builds a small graph whose inputs carry test values (via
    ``tag.test_value``, shared variables, constants or raw ndarrays) and
    checks that shape mismatches are caught at graph-construction time.

    Fix: the original tests mutated ``theano.config.compute_test_value``
    without restoring it, so the setting leaked between tests (and into
    any test run after this suite).  setUp/tearDown now save and restore
    the flag so each test is isolated.
    """

    def setUp(self):
        # Save the global flag so tests do not interfere with each other.
        self._backup_compute_test_value = theano.config.compute_test_value

    def tearDown(self):
        theano.config.compute_test_value = self._backup_compute_test_value

    def test_variable_only(self):
        theano.config.compute_test_value = True
        x = T.matrix('x')
        x.tag.test_value = numpy.random.rand(3, 4)
        y = T.matrix('y')
        y.tag.test_value = numpy.random.rand(4, 5)
        # should work
        z = T.dot(x, y)
        # this test should fail: inner dimensions no longer agree
        y.tag.test_value = numpy.random.rand(6, 5)
        self.assertRaises(ValueError, T.dot, x, y)

    def test_compute_flag(self):
        x = T.matrix('x')
        y = T.matrix('y')
        y.tag.test_value = numpy.random.rand(4, 5)
        # should skip computation of test value (x has none, but no error)
        theano.config.compute_test_value = False
        z = T.dot(x, y)
        # should fail one way or another when the flag is set
        theano.config.compute_test_value = 'warn'
        self.assertRaises(Warning, T.dot, x, y)
        theano.config.compute_test_value = 'err'
        self.assertRaises(ValueError, T.dot, x, y)

    def test_string_var(self):
        theano.config.compute_test_value = True
        x = T.matrix('x')
        x.tag.test_value = numpy.random.rand(3, 4)
        y = T.matrix('y')
        y.tag.test_value = numpy.random.rand(4, 5)
        z = theano.shared(numpy.random.rand(5, 6))
        # should work
        out = T.dot(T.dot(x, y), z)

        def f(x, y, z):
            return T.dot(T.dot(x, y), z)
        # this test should fail: shared value no longer matches dot(x, y)
        z.set_value(numpy.random.rand(7, 6))
        self.assertRaises(ValueError, f, x, y, z)

    def test_shared(self):
        theano.config.compute_test_value = True
        x = T.matrix('x')
        x.tag.test_value = numpy.random.rand(3, 4)
        y = theano.shared(numpy.random.rand(4, 6), 'y')
        # should work
        z = T.dot(x, y)
        # this test should fail: shared value reshaped incompatibly
        y.set_value(numpy.random.rand(5, 6))
        self.assertRaises(ValueError, T.dot, x, y)

    def test_ndarray(self):
        theano.config.compute_test_value = True
        x = numpy.random.rand(2, 3)
        y = theano.shared(numpy.random.rand(3, 6), 'y')
        # should work
        z = T.dot(x, y)
        # this test should fail: ndarray input no longer conformable
        x = numpy.random.rand(2, 4)
        self.assertRaises(ValueError, T.dot, x, y)

    def test_constant(self):
        theano.config.compute_test_value = True
        x = T.constant(numpy.random.rand(2, 3))
        y = theano.shared(numpy.random.rand(3, 6), 'y')
        # should work
        z = T.dot(x, y)
        # this test should fail: constant no longer conformable
        x = T.constant(numpy.random.rand(2, 4))
        self.assertRaises(ValueError, T.dot, x, y)
...@@ -2188,7 +2188,7 @@ CudaNdarray_Dot(PyObject* _unused, PyObject* args) ...@@ -2188,7 +2188,7 @@ CudaNdarray_Dot(PyObject* _unused, PyObject* args)
} }
static PyObject * static PyObject *
filter(PyObject* __unsed_self, PyObject *args) // args = (data, broadcastable, strict, storage) filter(PyObject* __unsed_self, PyObject *args) // args = (data, broadcastable, strict)
{ {
/* /*
* TODO: DOC what this function should do in the various cases of * TODO: DOC what this function should do in the various cases of
...@@ -2282,10 +2282,10 @@ filter(PyObject* __unsed_self, PyObject *args) // args = (data, broadcastable, s ...@@ -2282,10 +2282,10 @@ filter(PyObject* __unsed_self, PyObject *args) // args = (data, broadcastable, s
Py_DECREF(rval); Py_DECREF(rval);
rval = NULL; rval = NULL;
} }
Py_DECREF(data);
Py_DECREF(py_data);
Py_DECREF(broadcastable);
} }
Py_DECREF(data);
Py_DECREF(py_data);
Py_DECREF(broadcastable);
return (PyObject*)rval; return (PyObject*)rval;
} }
} }
...@@ -2490,11 +2490,6 @@ CudaNdarray_new_nd(int nd) ...@@ -2490,11 +2490,6 @@ CudaNdarray_new_nd(int nd)
return (PyObject *) rval; return (PyObject *) rval;
} }
/**
* Initialize 'self' as a view of 'base', with memory storage 'data'
*/
int CudaNdarray_set_device_data(CudaNdarray * self, float * data, PyObject * base) int CudaNdarray_set_device_data(CudaNdarray * self, float * data, PyObject * base)
{ {
if (self->data_allocated) if (self->data_allocated)
......
...@@ -26,7 +26,7 @@ typedef float real; ...@@ -26,7 +26,7 @@ typedef float real;
#endif #endif
#ifndef SHARED_SIZE #ifndef SHARED_SIZE
#define SHARED_SIZE (16*1024) #define SHARED_SIZE (16*1024)
#endif #endif
...@@ -48,10 +48,10 @@ static T ceil_intdiv(T a, T b) ...@@ -48,10 +48,10 @@ static T ceil_intdiv(T a, T b)
/** /**
* struct CudaNdarray * struct CudaNdarray
* *
* This is a Python type. * This is a Python type.
* *
*/ */
struct CudaNdarray struct CudaNdarray
{ {
PyObject_HEAD PyObject_HEAD
...@@ -65,46 +65,40 @@ struct CudaNdarray ...@@ -65,46 +65,40 @@ struct CudaNdarray
/* Type-specific fields go here. */ /* Type-specific fields go here. */
//GpuTensorType::VoidTensor * vt; //GpuTensorType::VoidTensor * vt;
int nd; //the number of dimensions of the tensor int nd; //the number of dimensions of the tensor
// Client should acces host_structure via CudaNdarray_HOST_DIMS / CudaNdarray_HOST_STRIDES macros // Client should acces host_structure via CudaNdarray_HOST_DIMS / CudaNdarray_HOST_STRIDES macros
int * host_structure; //dim0, dim1, ... stride0, stride1, ... int * host_structure; //dim0, dim1, ... stride0, stride1, ...
int data_allocated; //the number of bytes allocated for devdata int data_allocated; //the number of bytes allocated for devdata
//device pointers (allocated by cudaMalloc) //device pointers (allocated by cudaMalloc)
int dev_structure_fresh; int dev_structure_fresh;
//dev_structure should be accessed via macros, otherwise may not be synchronized //dev_structure should be accessed via macros, otherwise may not be synchronized
int * dev_structure; //dim0, dim1, ..., stride0, stride1, ... int * dev_structure; //dim0, dim1, ..., stride0, stride1, ...
real* devdata; //pointer to data element [0,..,0]. real* devdata; //pointer to data element [0,..,0].
}; };
/* /*
* Return a CudaNdarray whose 'nd' dimensions are all 0. * Return a CudaNdarray whose 'nd' dimensions are all 0.
*/ */
PyObject * PyObject *
CudaNdarray_New(int nd=-1); CudaNdarray_New(int nd=-1);
/** /**
* Return 1 for a CudaNdarray otw 0 * Return 1 for a CudaNdarray otw 0
*/ */
int int
CudaNdarray_Check(const PyObject * ob); CudaNdarray_Check(const PyObject * ob);
/** /**
* Return 1 for a CudaNdarray otw 0 * Return 1 for a CudaNdarray otw 0
*/ */
int int
CudaNdarray_CheckExact(const PyObject * ob); CudaNdarray_CheckExact(const PyObject * ob);
/**
* Return true for a C-contiguous CudaNdarray, else false
*/
bool
CudaNdarray_is_c_contiguous(const CudaNdarray * self);
/**** /****
* Returns the number of elements necessary in host_structure and dev_structure for a given number of dimensions. * Returns the number of elements necessary in host_structure and dev_structure for a given number of dimensions.
*/ */
int int
cnda_structure_size(int nd) cnda_structure_size(int nd)
{ {
// dim0, dim1, ... // dim0, dim1, ...
...@@ -113,23 +107,23 @@ cnda_structure_size(int nd) ...@@ -113,23 +107,23 @@ cnda_structure_size(int nd)
return nd + nd + nd; return nd + nd + nd;
} }
const int * const int *
CudaNdarray_HOST_DIMS(const CudaNdarray * self) CudaNdarray_HOST_DIMS(const CudaNdarray * self)
{ {
return self->host_structure; return self->host_structure;
} }
const int * const int *
CudaNdarray_HOST_STRIDES(const CudaNdarray * self) CudaNdarray_HOST_STRIDES(const CudaNdarray * self)
{ {
return self->host_structure + self->nd; return self->host_structure + self->nd;
} }
const int * const int *
CudaNdarray_HOST_LOG2DIMS(const CudaNdarray * self) CudaNdarray_HOST_LOG2DIMS(const CudaNdarray * self)
{ {
return self->host_structure + 2*self->nd; return self->host_structure + 2*self->nd;
} }
void void
cnda_mark_dev_structure_dirty(CudaNdarray * self) cnda_mark_dev_structure_dirty(CudaNdarray * self)
{ {
self->dev_structure_fresh = 0; self->dev_structure_fresh = 0;
...@@ -196,7 +190,7 @@ CudaNdarray_Equal(CudaNdarray *cnda1, CudaNdarray *cnda2) ...@@ -196,7 +190,7 @@ CudaNdarray_Equal(CudaNdarray *cnda1, CudaNdarray *cnda2)
* *
* Does not sync structure to host. * Does not sync structure to host.
*/ */
void void
CudaNdarray_set_dim(CudaNdarray * self, int idx, int d) CudaNdarray_set_dim(CudaNdarray * self, int idx, int d)
{ {
if ((idx >= self->nd) || (idx < 0) || (d < 0)) if ((idx >= self->nd) || (idx < 0) || (d < 0))
...@@ -212,7 +206,7 @@ CudaNdarray_set_dim(CudaNdarray * self, int idx, int d) ...@@ -212,7 +206,7 @@ CudaNdarray_set_dim(CudaNdarray * self, int idx, int d)
cnda_mark_dev_structure_dirty(self); cnda_mark_dev_structure_dirty(self);
} }
} }
void void
CudaNdarray_set_stride(CudaNdarray * self, int idx, int s) CudaNdarray_set_stride(CudaNdarray * self, int idx, int s)
{ {
if ((idx >= self->nd) || (idx < 0)) if ((idx >= self->nd) || (idx < 0))
...@@ -231,7 +225,7 @@ CudaNdarray_set_stride(CudaNdarray * self, int idx, int s) ...@@ -231,7 +225,7 @@ CudaNdarray_set_stride(CudaNdarray * self, int idx, int s)
* *
* This means: recalculate the log2dims and transfer structure to the card * This means: recalculate the log2dims and transfer structure to the card
*/ */
int int
cnda_copy_structure_to_device(CudaNdarray * self) cnda_copy_structure_to_device(CudaNdarray * self)
{ {
cublasSetVector(cnda_structure_size(self->nd), sizeof(int), self->host_structure, 1, self->dev_structure, 1); cublasSetVector(cnda_structure_size(self->nd), sizeof(int), self->host_structure, 1, self->dev_structure, 1);
...@@ -245,7 +239,7 @@ cnda_copy_structure_to_device(CudaNdarray * self) ...@@ -245,7 +239,7 @@ cnda_copy_structure_to_device(CudaNdarray * self)
return 0; return 0;
} }
const int * const int *
CudaNdarray_DEV_DIMS(CudaNdarray * self) CudaNdarray_DEV_DIMS(CudaNdarray * self)
{ {
if (!self->dev_structure_fresh) if (!self->dev_structure_fresh)
...@@ -255,7 +249,7 @@ CudaNdarray_DEV_DIMS(CudaNdarray * self) ...@@ -255,7 +249,7 @@ CudaNdarray_DEV_DIMS(CudaNdarray * self)
} }
return self->dev_structure; return self->dev_structure;
} }
const int * const int *
CudaNdarray_DEV_STRIDES(CudaNdarray * self) CudaNdarray_DEV_STRIDES(CudaNdarray * self)
{ {
if (!self->dev_structure_fresh) if (!self->dev_structure_fresh)
...@@ -265,7 +259,7 @@ CudaNdarray_DEV_STRIDES(CudaNdarray * self) ...@@ -265,7 +259,7 @@ CudaNdarray_DEV_STRIDES(CudaNdarray * self)
} }
return self->dev_structure + self->nd; return self->dev_structure + self->nd;
} }
const int * const int *
CudaNdarray_DEV_LOG2DIMS(CudaNdarray * self) CudaNdarray_DEV_LOG2DIMS(CudaNdarray * self)
{ {
if (!self->dev_structure_fresh) if (!self->dev_structure_fresh)
...@@ -275,7 +269,7 @@ CudaNdarray_DEV_LOG2DIMS(CudaNdarray * self) ...@@ -275,7 +269,7 @@ CudaNdarray_DEV_LOG2DIMS(CudaNdarray * self)
} }
return self->dev_structure + 2*self->nd; return self->dev_structure + 2*self->nd;
} }
float * float *
CudaNdarray_DEV_DATA(const CudaNdarray * self) CudaNdarray_DEV_DATA(const CudaNdarray * self)
{ {
return self->devdata; return self->devdata;
...@@ -284,7 +278,7 @@ CudaNdarray_DEV_DATA(const CudaNdarray * self) ...@@ -284,7 +278,7 @@ CudaNdarray_DEV_DATA(const CudaNdarray * self)
/** /**
* Return the number of elements in the ndarray (product of the dimensions) * Return the number of elements in the ndarray (product of the dimensions)
*/ */
int int
CudaNdarray_SIZE(const CudaNdarray *self) CudaNdarray_SIZE(const CudaNdarray *self)
{ {
if (self->nd == -1) return 0; if (self->nd == -1) return 0;
...@@ -295,7 +289,7 @@ CudaNdarray_SIZE(const CudaNdarray *self) ...@@ -295,7 +289,7 @@ CudaNdarray_SIZE(const CudaNdarray *self)
} }
return size; return size;
} }
static PyObject * static PyObject *
CudaNdarray_SIZE_Object(const CudaNdarray *self, void *closure) CudaNdarray_SIZE_Object(const CudaNdarray *self, void *closure)
{ {
return PyInt_FromLong(CudaNdarray_SIZE(self)); return PyInt_FromLong(CudaNdarray_SIZE(self));
...@@ -326,7 +320,7 @@ int CudaNdarray_set_nd(CudaNdarray * self, const int nd) ...@@ -326,7 +320,7 @@ int CudaNdarray_set_nd(CudaNdarray * self, const int nd)
} }
self->dev_structure = NULL; self->dev_structure = NULL;
} }
if (self->host_structure) if (self->host_structure)
{ {
free(self->host_structure); free(self->host_structure);
self->host_structure = NULL; self->host_structure = NULL;
...@@ -392,41 +386,29 @@ int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const inttype ...@@ -392,41 +386,29 @@ int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const inttype
size = size * dim[i]; size = size * dim[i];
} }
if (CudaNdarray_is_c_contiguous(self) && (self->data_allocated == size)) if (self->data_allocated != size)
{
return 0;
}
// The structure of self will be reused with newly allocated memory.
// If self was a view, we should remove the reference to its base.
// (If base was already NULL, the following has no effect.)
Py_XDECREF(self->base);
self->base = NULL;
// If self is a view, do not try to free its memory
if (self->data_allocated && device_free(self->devdata))
{
self->devdata = NULL;
self->data_allocated = 0;
return -1;
}
assert(size>0);
self->devdata = (float*)device_malloc(size*sizeof(real));
if (!self->devdata)
{ {
CudaNdarray_set_nd(self,-1); if (device_free(self->devdata))
self->data_allocated = 0; {
self->devdata = 0; // Does this ever happen?? Do we need to set data_allocated or devdata to 0?
return -1; return -1;
}
assert(size>0);
self->devdata = (float*)device_malloc(size*sizeof(real));
if (!self->devdata)
{
CudaNdarray_set_nd(self,-1);
self->data_allocated = 0;
self->devdata = 0;
return -1;
}
if (0)
fprintf(stderr,
"Allocated devdata %p (self=%p)\n",
self->devdata,
self);
self->data_allocated = size;
} }
if (0)
fprintf(stderr,
"Allocated devdata %p (self=%p)\n",
self->devdata,
self);
self->data_allocated = size;
return 0; return 0;
} }
...@@ -434,7 +416,7 @@ int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const inttype ...@@ -434,7 +416,7 @@ int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const inttype
* Return a CudaNdarray whose 'nd' dimensions are set to dims, and allocated. * Return a CudaNdarray whose 'nd' dimensions are set to dims, and allocated.
*/ */
template<typename inttype> template<typename inttype>
PyObject * PyObject *
CudaNdarray_NewDims(int nd, const inttype * dims) CudaNdarray_NewDims(int nd, const inttype * dims)
{ {
CudaNdarray * rval = (CudaNdarray*)CudaNdarray_New(); CudaNdarray * rval = (CudaNdarray*)CudaNdarray_New();
...@@ -458,7 +440,7 @@ CudaNdarray_NewDims(int nd, const inttype * dims) ...@@ -458,7 +440,7 @@ CudaNdarray_NewDims(int nd, const inttype * dims)
int CudaNdarray_set_device_data(CudaNdarray * self, float * data, PyObject * base); int CudaNdarray_set_device_data(CudaNdarray * self, float * data, PyObject * base);
int CudaNdarray_set_device_data(CudaNdarray * self, float * data, CudaNdarray * base) int CudaNdarray_set_device_data(CudaNdarray * self, float * data, CudaNdarray * base)
{ {
return CudaNdarray_set_device_data(self, data, (PyObject *) base); return CudaNdarray_set_device_data(self, data, (PyObject *) base);
} }
/** /**
...@@ -493,10 +475,10 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other, boo ...@@ -493,10 +475,10 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other, boo
/** /**
* Transfer the contents of CudaNdarray `self` to a new numpy ndarray. * Transfer the contents of CudaNdarray `self` to a new numpy ndarray.
*/ */
PyObject * PyObject *
CudaNdarray_CreateArrayObj(CudaNdarray * self); CudaNdarray_CreateArrayObj(CudaNdarray * self);
PyObject * PyObject *
CudaNdarray_ZEROS(int n, int * dims); CudaNdarray_ZEROS(int n, int * dims);
/** /**
...@@ -517,7 +499,7 @@ int CudaNdarray_dimshuffle(CudaNdarray * self, unsigned int len, const int * pat ...@@ -517,7 +499,7 @@ int CudaNdarray_dimshuffle(CudaNdarray * self, unsigned int len, const int * pat
void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self) void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self)
{ {
fprintf(fd, "CudaNdarray <%p, %p> nd=%i dev_structure_fresh=%d data_allocated=%d\n", fprintf(fd, "CudaNdarray <%p, %p> nd=%i dev_structure_fresh=%d data_allocated=%d\n",
self, self->devdata, self->nd, self->dev_structure_fresh, self->data_allocated); self, self->devdata, self->nd, self->dev_structure_fresh, self->data_allocated);
fprintf(fd, "\tHOST_DIMS: "); fprintf(fd, "\tHOST_DIMS: ");
for (int i = 0; i < self->nd; ++i) for (int i = 0; i < self->nd; ++i)
{ {
...@@ -528,23 +510,23 @@ void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self) ...@@ -528,23 +510,23 @@ void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self)
{ {
fprintf(fd, "%i\t", CudaNdarray_HOST_STRIDES(self)[i]); fprintf(fd, "%i\t", CudaNdarray_HOST_STRIDES(self)[i]);
} }
int data=0; int data=0;
fprintf(fd, "\n\tDEV_DIMS: "); fprintf(fd, "\n\tDEV_DIMS: ");
for (int i = 0; i < self->nd; ++i) for (int i = 0; i < self->nd; ++i)
{ {
cublasGetVector(1, sizeof(int), cublasGetVector(1, sizeof(int),
self->dev_structure+i, 1, self->dev_structure+i, 1,
&data, 1); &data, 1);
fprintf(fd, "%i\t", data); fprintf(fd, "%i\t", data);
} }
fprintf(fd, "\n\tDEV_STRIDES: "); fprintf(fd, "\n\tDEV_STRIDES: ");
for (int i = 0; i < self->nd; ++i) for (int i = 0; i < self->nd; ++i)
{ {
cublasGetVector(1, sizeof(int), cublasGetVector(1, sizeof(int),
self->dev_structure + self->nd+i, 1, self->dev_structure + self->nd+i, 1,
&data, 1); &data, 1);
fprintf(fd, "%i \t", data); fprintf(fd, "%i \t", data);
} }
fprintf(fd, "\n"); fprintf(fd, "\n");
} }
......
...@@ -12,7 +12,8 @@ import numpy, theano ...@@ -12,7 +12,8 @@ import numpy, theano
#from copy import copy as python_copy #from copy import copy as python_copy
from theano import gof, shared from theano import gof, shared
from theano.gof import Apply, Constant, Op, Type, Value, Variable from theano.gof import Variable, Op, Type, Constant, Value
from theano.gof.apply_shape import Apply
from theano import gradient from theano import gradient
...@@ -286,6 +287,7 @@ def constant_or_value(x, rtype, name=None, ndim=None, dtype=None): ...@@ -286,6 +287,7 @@ def constant_or_value(x, rtype, name=None, ndim=None, dtype=None):
TensorType(dtype = x_.dtype, broadcastable = bcastable), TensorType(dtype = x_.dtype, broadcastable = bcastable),
x_.copy(), x_.copy(),
name=name) name=name)
rval.tag.shape = x_.shape
return rval return rval
else: else:
# leave the shape out of the type # leave the shape out of the type
...@@ -2976,6 +2978,15 @@ class SubtensorPrinter: ...@@ -2976,6 +2978,15 @@ class SubtensorPrinter:
pprint.assign(lambda pstate, r: r.owner and isinstance(r.owner.op, Subtensor), SubtensorPrinter()) pprint.assign(lambda pstate, r: r.owner and isinstance(r.owner.op, Subtensor), SubtensorPrinter())
def setsubtensor(x, y, idx_list, inplace=False):
    """Deprecated alias: overwrite ``x[idx_list]`` with ``y``.

    Use :func:`set_subtensor` instead.  A deprecation notice is written to
    stderr on every call.
    """
    sys.stderr.write("tensor.setsubtensor is deprecated - please use set_subtensor\n")
    op = IncSubtensor(idx_list, inplace, set_instead_of_inc=True)
    symbolic_idx = Subtensor.collapse(
            idx_list, lambda entry: isinstance(entry, Variable))
    return op(x, y, *symbolic_idx)
def incsubtensor(x, y, idx_list, inplace=False):
    """Deprecated alias: increment ``x[idx_list]`` by ``y``.

    Use :func:`inc_subtensor` instead.  A deprecation notice is written to
    stderr on every call.
    """
    sys.stderr.write("tensor.incsubtensor is deprecated - please use inc_subtensor\n")
    op = IncSubtensor(idx_list, inplace, set_instead_of_inc=False)
    symbolic_idx = Subtensor.collapse(
            idx_list, lambda entry: isinstance(entry, Variable))
    return op(x, y, *symbolic_idx)
def set_subtensor(x, y, inplace=False): def set_subtensor(x, y, inplace=False):
"""Return x with the given subtensor overwritten by y. """Return x with the given subtensor overwritten by y.
...@@ -3499,12 +3510,25 @@ class Join(Op): ...@@ -3499,12 +3510,25 @@ class Join(Op):
def infer_shape(self, node, ishapes): def infer_shape(self, node, ishapes):
# ishapes[0] contains the size of the axis on which we join # Join op should get at least two inputs to join
# Join op should get at least one input to join
assert len(ishapes) > 1 assert len(ishapes) > 1
# Not sure this is needed anymore :( ... basically the apply_shape
# version of the apply node (i.e. the one defined in
# gof/apply_shape) calls infer_shape methods, passing None for unknown
# inputs.  It can handle NotImplementedError, so for now I just raise
# that whenever I get a None.  Should we just remove gof/apply_shape
# if it is deprecated?
if ishapes[1] is None:
raise NotImplementedError
n_dim = len(ishapes[1]) n_dim = len(ishapes[1])
for shape in ishapes[1:]: for shape in ishapes[1:]:
assert shape is not None if shape is None:
raise NotImplementedError
for shape_i in shape:
if shape_i is None:
raise NotImplementedError
# at this point the inputs have been broadcasted so they should
# all have the same shape
assert len(shape) == n_dim assert len(shape) == n_dim
out_shapes = [] out_shapes = []
...@@ -3822,6 +3846,9 @@ def reshape(x, newshape, ndim=None, name=None): ...@@ -3822,6 +3846,9 @@ def reshape(x, newshape, ndim=None, name=None):
ndim = get_vector_length(newshape) ndim = get_vector_length(newshape)
op = Reshape(ndim, name) op = Reshape(ndim, name)
rval = op(x, newshape) rval = op(x, newshape)
if isinstance(newshape, (list, tuple)):
rval.tag.shape = newshape
return rval return rval
class Flatten(Op): class Flatten(Op):
......
...@@ -6,13 +6,16 @@ import numpy.distutils ...@@ -6,13 +6,16 @@ import numpy.distutils
from theano.configparser import config, AddConfigVar, StrParam from theano.configparser import config, AddConfigVar, StrParam
from theano.gof import (utils, Op, view_roots, PatternSub, DestroyHandler, from theano.gof import (utils, Op, view_roots, PatternSub, DestroyHandler,
SeqOptimizer, local_optimizer, Optimizer, LocalOptimizer, OpKeyOptimizer, SeqOptimizer, local_optimizer, Optimizer, LocalOptimizer, OpKeyOptimizer,
InconsistencyError, toolbox, SequenceDB, EquilibriumOptimizer, Apply) InconsistencyError, toolbox, SequenceDB, EquilibriumOptimizer)
from theano.printing import pprint, FunctionPrinter, debugprint from theano.printing import pprint, FunctionPrinter, debugprint
from theano.compile.mode import optdb from theano.compile.mode import optdb
from theano.gof.python25 import all, any from theano.gof.python25 import all, any
import theano.scalar import theano.scalar
import basic as T import basic as T
from theano.gof.apply_shape import Apply
#NB: this clobbers the builtin 'compile' symbol #NB: this clobbers the builtin 'compile' symbol
from theano import compile #to register the optimizer built by this file from theano import compile #to register the optimizer built by this file
......
...@@ -5,11 +5,12 @@ import numpy ...@@ -5,11 +5,12 @@ import numpy
import elemwise_cgen as cgen import elemwise_cgen as cgen
import theano import theano
from theano import gof from theano import gof
from theano.gof import Apply, Op from theano.gof import Op
from theano import scalar from theano import scalar
from theano.scalar import Scalar from theano.scalar import Scalar
from theano.printing import pprint from theano.printing import pprint
from theano.gof.python25 import all, any from theano.gof.python25 import all, any
from theano.gof.apply_shape import Apply
# tensor depends on elemwise to provide definitions for several ops # tensor depends on elemwise to provide definitions for several ops
......
...@@ -18,7 +18,7 @@ import theano ...@@ -18,7 +18,7 @@ import theano
from theano.tensor import (as_tensor_variable, blas, get_constant_value, from theano.tensor import (as_tensor_variable, blas, get_constant_value,
patternbroadcast) patternbroadcast)
from theano import Op, config from theano import Op, config
from theano.gof import Apply from theano.gof.apply_shape import Apply
from theano.gof.python25 import any from theano.gof.python25 import any
imported_scipy_signal = False imported_scipy_signal = False
......
...@@ -11,7 +11,7 @@ from theano.tensor import basic as tensor ...@@ -11,7 +11,7 @@ from theano.tensor import basic as tensor
from theano.tensor import elemwise, dmatrix, fmatrix, dvector, fvector from theano.tensor import elemwise, dmatrix, fmatrix, dvector, fvector
from theano.tensor import opt from theano.tensor import opt
from theano.compile import optdb from theano.compile import optdb
from theano.gof import Apply from theano.gof.apply_shape import Apply
from theano.tensor.nnet.sigm import sigmoid, softplus from theano.tensor.nnet.sigm import sigmoid, softplus
......
...@@ -6,6 +6,7 @@ ...@@ -6,6 +6,7 @@
import logging import logging
_logger = logging.getLogger('theano.tensor.opt') _logger = logging.getLogger('theano.tensor.opt')
import copy
import operator import operator
import itertools import itertools
import sys import sys
...@@ -572,6 +573,14 @@ class ShapeFeature(object): ...@@ -572,6 +573,14 @@ class ShapeFeature(object):
if hasattr(r.type,"broadcastable") and r.type.broadcastable[i]: if hasattr(r.type,"broadcastable") and r.type.broadcastable[i]:
return self.lscalar_one return self.lscalar_one
# NOTE: This may cause problems because the shape is not asserted;
# there is an equivalent mechanism to do this, namely
# specify_shape, which one should use instead
# If user provided size
#elif ( hasattr(r.tag,'shape') and
# r.tag.shape is not None and
# r.tag.shape[i] is not None):
# return T.constant(copy.copy(r.tag.shape[i]),dtype='int64')
else: else:
return Shape_i(i).make_node(r).outputs[0] return Shape_i(i).make_node(r).outputs[0]
...@@ -1084,6 +1093,7 @@ def local_alloc_elemwise(node): ...@@ -1084,6 +1093,7 @@ def local_alloc_elemwise(node):
return [node.op(*new)] return [node.op(*new)]
#TODO, global optimizer that lift the assert to the beginning of the graph. #TODO, global optimizer that lift the assert to the beginning of the graph.
#TODO, var.tag.shape to propagate the shape and lower the overhead of this op
#TODO, when all inputs can be optimized do all except one #TODO, when all inputs can be optimized do all except one
theano.configparser.AddConfigVar('experimental.local_alloc_elemwise', theano.configparser.AddConfigVar('experimental.local_alloc_elemwise',
...@@ -2731,8 +2741,14 @@ register_specialize(local_mul_specialize) ...@@ -2731,8 +2741,14 @@ register_specialize(local_mul_specialize)
@gof.local_optimizer([T.add]) @gof.local_optimizer([T.add])
def local_add_specialize(node): def local_add_specialize(node):
def fill_chain(v): def fill_chain(v):
# Not sure why this happens... but I did not have the time to look
# into it; it probably has something to do with the dtype I am
# providing in the tag.shape of my variable
out = _fill_chain(v, node.inputs) out = _fill_chain(v, node.inputs)
return out if out[0].dtype != node.outputs[0].dtype:
return [T.cast(out[0], dtype = node.outputs[0].dtype)]
else:
return out
#here, we are past the point of canonicalization, so we don't want to put in un-necessary fills. #here, we are past the point of canonicalization, so we don't want to put in un-necessary fills.
if node.op == T.add: if node.op == T.add:
......
import numpy
import unittest
from theano.tests import unittest_tools as utt
import theano
import theano.tensor as T
class Test_incsubtensor(unittest.TestCase):
    """Partial testing of the deprecated incsubtensor/setsubtensor wrappers.

    What could be tested:
    - increment vs set
    - thing incremented: scalar, vector, matrix,
    - increment/set: constant, scalar, vector, matrix
    - indices: scalar vs slice, constant vs variable, out of bound, ...
    - inplace
    """

    def setUp(self):
        utt.seed_rng()

    def test_simple_ok(self):
        """Increments or sets part of a tensor by a scalar using a full
        slice and a partial slice whose end is a symbolic scalar.
        """
        a = T.dmatrix()
        increment = T.dscalar()
        sl1 = slice(None)
        sl2_end = T.lscalar()
        sl2 = slice(sl2_end)
        for do_set in (False, True):
            # Build either the set- or the increment- variant of the op.
            if do_set:
                sym_result = T.setsubtensor(a, increment, [sl1, sl2])
            else:
                sym_result = T.incsubtensor(a, increment, [sl1, sl2])
            fn = theano.function([a, increment, sl2_end], sym_result)

            val_a = numpy.ones((5, 5))
            val_inc = 2.3
            val_sl2_end = 2

            got = fn(val_a, val_inc, val_sl2_end)
            want = numpy.copy(val_a)
            if do_set:
                want[:, :val_sl2_end] = val_inc
            else:
                want[:, :val_sl2_end] += val_inc
            self.assertTrue(numpy.array_equal(got, want))
        return

    def test_grad(self):
        a = T.dvector()
        b = T.dvector()

        def inc_slice(*s):
            # Bind the index tuple so verify_grad only sees numeric args.
            def just_numeric_args(a, b):
                return T.incsubtensor(a, b, s)
            return just_numeric_args

        # vector
        utt.verify_grad(inc_slice(slice(2, 4, None)),
                        (numpy.asarray([0, 1, 2, 3, 4, 5.]),
                         numpy.asarray([9, 9.]),))
        # matrix
        utt.verify_grad(inc_slice(slice(1, 2, None), slice(None, None, None)),
                        (numpy.asarray([[0, 1], [2, 3], [4, 5.]]),
                         numpy.asarray([[9, 9.]]),))
        # single element
        utt.verify_grad(inc_slice(2, 1),
                        (numpy.asarray([[0, 1], [2, 3], [4, 5.]]),
                         numpy.asarray(9.),))
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论