Merged

f517e1a0 · Olivier Delalleau · 18f73bc6 · 5ca02715 · f517e1a0 · f517e1a0
--- a/theano/sandbox/cuda/cuda_ndarray.cu
+++ b/theano/sandbox/cuda/cuda_ndarray.cu
@@ -2188,7 +2188,7 @@ CudaNdarray_Dot(PyObject* _unused, PyObject* args)
 }
 static PyObject *
-filter(PyObject* __unsed_self, PyObject *args) // args = (data, broadcastable, strict)
+filter(PyObject* __unsed_self, PyObject *args) // args = (data, broadcastable, strict, storage)
 {
    /*
     * TODO: DOC what this function should do in the various cases of
@@ -2282,10 +2282,10 @@ filter(PyObject* __unsed_self, PyObject *args) // args = (data, broadcastable, s
                Py_DECREF(rval);
                rval = NULL;
            }
+        }
        Py_DECREF(data);
        Py_DECREF(py_data);
        Py_DECREF(broadcastable);
-        }
        return (PyObject*)rval;
    }
 }
@@ -2490,6 +2490,11 @@ CudaNdarray_new_nd(int nd)
    return (PyObject *) rval;
 }
+/**
+ * Initialize 'self' as a view of 'base', with memory storage 'data'
+ */
 int CudaNdarray_set_device_data(CudaNdarray * self, float * data, PyObject * base)
 {
    if (self->data_allocated)

--- a/theano/sandbox/cuda/cuda_ndarray.cuh
+++ b/theano/sandbox/cuda/cuda_ndarray.cuh
@@ -95,6 +95,12 @@ CudaNdarray_Check(const PyObject * ob);
 int
 CudaNdarray_CheckExact(const PyObject * ob);
+/**
+ * Return true for a C-contiguous CudaNdarray, else false
+ */
+bool
+CudaNdarray_is_c_contiguous(const CudaNdarray * self);
 /****
 * Returns the number of elements necessary in host_structure and dev_structure for a given number of dimensions.
 */
@@ -386,13 +392,25 @@ int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const inttype
        size = size * dim[i];
    }
-    if (self->data_allocated != size)
+    if (CudaNdarray_is_c_contiguous(self) && (self->data_allocated == size))
    {
-        if (device_free(self->devdata))
+        return 0;
+    }
+    // The structure of self will be reused with newly allocated memory.
+    // If self was a view, we should remove the reference to its base.
+    // (If base was already NULL, the following has no effect.)
+    Py_XDECREF(self->base);
+    self->base = NULL;
+    // If self is a view, do not try to free its memory
+    if (self->data_allocated && device_free(self->devdata))
    {
-            // Does this ever happen??  Do we need to set data_allocated or devdata to 0?
+        self->devdata = NULL;
+        self->data_allocated = 0;
        return -1;
    }
    assert(size>0);
    self->devdata = (float*)device_malloc(size*sizeof(real));
    if (!self->devdata)
@@ -408,7 +426,7 @@ int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const inttype
            self->devdata,
            self);
    self->data_allocated = size;
-    }
    return 0;
 }

--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
@@ -6,7 +6,6 @@
 import logging
 _logger = logging.getLogger('theano.tensor.opt')
-import copy
 import operator
 import itertools
 import sys
@@ -574,14 +573,6 @@ class ShapeFeature(object):
        if hasattr(r.type,"broadcastable") and r.type.broadcastable[i]:
            return self.lscalar_one
-        # NOTE: This may cause problems bacause the shape is not asserted
-        #       there is an equivalent mechanism to do this, namely
-        #       specify_shape that one should use
-        # If user provided size
-        #elif ( hasattr(r.tag,'shape') and
-        #      r.tag.shape is not None and
-        #      r.tag.shape[i] is not None):
-        #    return T.constant(copy.copy(r.tag.shape[i]),dtype='int64')
        else:
            return Shape_i(i).make_node(r).outputs[0]
@@ -1101,7 +1092,6 @@ def local_alloc_elemwise(node):
    return [node.op(*new)]
 #TODO, global optimizer that lift the assert to the beginning of the graph.
-#TODO, var.tag.shape to propagate the shape and lower the overhead of this op
 #TODO, when all inputs can be optimized do all except one
 theano.configparser.AddConfigVar('experimental.local_alloc_elemwise',
@@ -2749,13 +2739,7 @@ register_specialize(local_mul_specialize)
 @gof.local_optimizer([T.add])
 def local_add_specialize(node):
    def fill_chain(v):
-        # Not sure why this happens .. but I did not had the time to look
-        # into it, it probably has something to do with the dtype I'm
-        # providing the tag.shape of my variable
        out = _fill_chain(v, node.inputs)
-        if out[0].dtype != node.outputs[0].dtype:
-            return [T.cast(out[0], dtype = node.outputs[0].dtype)]
-        else:
        return out
    #here, we are past the point of canonicalization, so we don't want to put in un-necessary fills.