Merged

f517e1a0 · Olivier Delalleau · 18f73bc6 · 5ca02715 · f517e1a0 · f517e1a0
--- a/theano/sandbox/cuda/cuda_ndarray.cu
+++ b/theano/sandbox/cuda/cuda_ndarray.cu
@@ -2188,7 +2188,7 @@ CudaNdarray_Dot(PyObject* _unused, PyObject* args)
 }

 static PyObject *
-filter(PyObject* __unsed_self, PyObject *args) // args = (data, broadcastable, strict)
+filter(PyObject* __unsed_self, PyObject *args) // args = (data, broadcastable, strict, storage)
 {
    /*
     * TODO: DOC what this function should do in the various cases of
@@ -2282,10 +2282,10 @@ filter(PyObject* __unsed_self, PyObject *args) // args = (data, broadcastable, s
                Py_DECREF(rval);
                rval = NULL;
            }
-            Py_DECREF(data);
-            Py_DECREF(py_data);
-            Py_DECREF(broadcastable);
        }
+        Py_DECREF(data);
+        Py_DECREF(py_data);
+        Py_DECREF(broadcastable);
        return (PyObject*)rval;
    }
 }
@@ -2490,6 +2490,11 @@ CudaNdarray_new_nd(int nd)
    return (PyObject *) rval;
 }

+
+/**
+ * Initialize 'self' as a view of 'base', with memory storage 'data'
+ */
+
 int CudaNdarray_set_device_data(CudaNdarray * self, float * data, PyObject * base)
 {
    if (self->data_allocated)

--- a/theano/sandbox/cuda/cuda_ndarray.cuh
+++ b/theano/sandbox/cuda/cuda_ndarray.cuh
@@ -26,7 +26,7 @@ typedef float real;
 #endif


-#ifndef SHARED_SIZE 
+#ifndef SHARED_SIZE
 #define SHARED_SIZE (16*1024)
 #endif

@@ -48,10 +48,10 @@ static T ceil_intdiv(T a, T b)
 /**
 * struct CudaNdarray
 *
- * This is a Python type.  
+ * This is a Python type.
 *
 */
-struct CudaNdarray 
+struct CudaNdarray
 {
    PyObject_HEAD

@@ -65,40 +65,46 @@ struct CudaNdarray
    /* Type-specific fields go here. */
    //GpuTensorType::VoidTensor * vt;
    int nd; //the number of dimensions of the tensor
-	// Client should acces host_structure via CudaNdarray_HOST_DIMS / CudaNdarray_HOST_STRIDES macros
+    // Client should acces host_structure via CudaNdarray_HOST_DIMS / CudaNdarray_HOST_STRIDES macros
    int * host_structure; //dim0, dim1, ... stride0, stride1, ...
    int data_allocated; //the number of bytes allocated for devdata


    //device pointers (allocated by cudaMalloc)
    int dev_structure_fresh;
-	//dev_structure should be accessed via macros, otherwise may not be synchronized
-    int * dev_structure; //dim0, dim1, ..., stride0, stride1, ...  
+    //dev_structure should be accessed via macros, otherwise may not be synchronized
+    int * dev_structure; //dim0, dim1, ..., stride0, stride1, ...
    real* devdata; //pointer to data element [0,..,0].
 };

 /*
 * Return a CudaNdarray whose 'nd' dimensions are all 0.
 */
-PyObject * 
+PyObject *
 CudaNdarray_New(int nd=-1);

 /**
 * Return 1 for a CudaNdarray otw 0
 */
-int 
+int
 CudaNdarray_Check(const PyObject * ob);

 /**
 * Return 1 for a CudaNdarray otw 0
 */
-int 
+int
 CudaNdarray_CheckExact(const PyObject * ob);

+/**
+ * Return true for a C-contiguous CudaNdarray, else false
+ */
+bool
+CudaNdarray_is_c_contiguous(const CudaNdarray * self);
+
 /****
 * Returns the number of elements necessary in host_structure and dev_structure for a given number of dimensions.
 */
-int 
+int
 cnda_structure_size(int nd)
 {
    // dim0, dim1, ...
@@ -107,23 +113,23 @@ cnda_structure_size(int nd)
    return nd + nd + nd;
 }

-const int * 
+const int *
 CudaNdarray_HOST_DIMS(const CudaNdarray * self)
 {
    return self->host_structure;
 }
-const int * 
+const int *
 CudaNdarray_HOST_STRIDES(const CudaNdarray * self)
 {
    return self->host_structure + self->nd;
 }
-const int * 
+const int *
 CudaNdarray_HOST_LOG2DIMS(const CudaNdarray * self)
 {
    return self->host_structure + 2*self->nd;
 }

-void 
+void
 cnda_mark_dev_structure_dirty(CudaNdarray * self)
 {
    self->dev_structure_fresh = 0;
@@ -190,7 +196,7 @@ CudaNdarray_Equal(CudaNdarray *cnda1, CudaNdarray *cnda2)
 *
 *  Does not sync structure to host.
 */
-void 
+void
 CudaNdarray_set_dim(CudaNdarray * self, int idx, int d)
 {
    if ((idx >= self->nd) || (idx < 0) || (d < 0))
@@ -206,7 +212,7 @@ CudaNdarray_set_dim(CudaNdarray * self, int idx, int d)
        cnda_mark_dev_structure_dirty(self);
    }
 }
-void 
+void
 CudaNdarray_set_stride(CudaNdarray * self, int idx, int s)
 {
    if ((idx >= self->nd) || (idx < 0))
@@ -225,7 +231,7 @@ CudaNdarray_set_stride(CudaNdarray * self, int idx, int s)
 *
 *  This means: recalculate the log2dims and transfer structure to the card
 */
-int 
+int
 cnda_copy_structure_to_device(CudaNdarray * self)
 {
    cublasSetVector(cnda_structure_size(self->nd), sizeof(int), self->host_structure, 1, self->dev_structure, 1);
@@ -239,7 +245,7 @@ cnda_copy_structure_to_device(CudaNdarray * self)
    return 0;
 }

-const int * 
+const int *
 CudaNdarray_DEV_DIMS(CudaNdarray * self)
 {
    if (!self->dev_structure_fresh)
@@ -249,7 +255,7 @@ CudaNdarray_DEV_DIMS(CudaNdarray * self)
    }
    return self->dev_structure;
 }
-const int * 
+const int *
 CudaNdarray_DEV_STRIDES(CudaNdarray * self)
 {
    if (!self->dev_structure_fresh)
@@ -259,7 +265,7 @@ CudaNdarray_DEV_STRIDES(CudaNdarray * self)
    }
    return self->dev_structure + self->nd;
 }
-const int * 
+const int *
 CudaNdarray_DEV_LOG2DIMS(CudaNdarray * self)
 {
    if (!self->dev_structure_fresh)
@@ -269,7 +275,7 @@ CudaNdarray_DEV_LOG2DIMS(CudaNdarray * self)
    }
    return self->dev_structure + 2*self->nd;
 }
-float * 
+float *
 CudaNdarray_DEV_DATA(const CudaNdarray * self)
 {
    return self->devdata;
@@ -278,7 +284,7 @@ CudaNdarray_DEV_DATA(const CudaNdarray * self)
 /**
 * Return the number of elements in the ndarray (product of the dimensions)
 */
-int 
+int
 CudaNdarray_SIZE(const CudaNdarray *self)
 {
    if (self->nd == -1) return 0;
@@ -289,7 +295,7 @@ CudaNdarray_SIZE(const CudaNdarray *self)
    }
    return size;
 }
-static PyObject * 
+static PyObject *
 CudaNdarray_SIZE_Object(const CudaNdarray *self, void *closure)
 {
    return PyInt_FromLong(CudaNdarray_SIZE(self));
@@ -320,7 +326,7 @@ int CudaNdarray_set_nd(CudaNdarray * self, const int nd)
            }
            self->dev_structure = NULL;
        }
-        if (self->host_structure) 
+        if (self->host_structure)
        {
            free(self->host_structure);
            self->host_structure = NULL;
@@ -386,29 +392,41 @@ int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const inttype
        size = size * dim[i];
    }

-    if (self->data_allocated != size)
+    if (CudaNdarray_is_c_contiguous(self) && (self->data_allocated == size))
    {
-        if (device_free(self->devdata))
-        {
-            // Does this ever happen??  Do we need to set data_allocated or devdata to 0?
-            return -1;
-        }
-        assert(size>0);
-        self->devdata = (float*)device_malloc(size*sizeof(real));
-        if (!self->devdata)
-        {
-            CudaNdarray_set_nd(self,-1);
-            self->data_allocated = 0;
-            self->devdata = 0;
-            return -1;
-        }
-        if (0)
-            fprintf(stderr,
-                "Allocated devdata %p (self=%p)\n",
-                self->devdata,
-                self);
-        self->data_allocated = size;
+        return 0;
+    }
+
+    // The structure of self will be reused with newly allocated memory.
+    // If self was a view, we should remove the reference to its base.
+    // (If base was already NULL, the following has no effect.)
+    Py_XDECREF(self->base);
+    self->base = NULL;
+
+    // If self is a view, do not try to free its memory
+    if (self->data_allocated && device_free(self->devdata))
+    {
+        self->devdata = NULL;
+        self->data_allocated = 0;
+        return -1;
    }
+
+    assert(size>0);
+    self->devdata = (float*)device_malloc(size*sizeof(real));
+    if (!self->devdata)
+    {
+        CudaNdarray_set_nd(self,-1);
+        self->data_allocated = 0;
+        self->devdata = 0;
+        return -1;
+    }
+    if (0)
+        fprintf(stderr,
+            "Allocated devdata %p (self=%p)\n",
+            self->devdata,
+            self);
+    self->data_allocated = size;
+
    return 0;
 }

@@ -416,7 +434,7 @@ int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const inttype
 * Return a CudaNdarray whose 'nd' dimensions are set to dims, and allocated.
 */
 template<typename inttype>
-PyObject * 
+PyObject *
 CudaNdarray_NewDims(int nd, const inttype * dims)
 {
    CudaNdarray * rval = (CudaNdarray*)CudaNdarray_New();
@@ -440,7 +458,7 @@ CudaNdarray_NewDims(int nd, const inttype * dims)
 int CudaNdarray_set_device_data(CudaNdarray * self, float * data, PyObject * base);
 int CudaNdarray_set_device_data(CudaNdarray * self, float * data, CudaNdarray * base)
 {
-  return CudaNdarray_set_device_data(self, data, (PyObject *) base);
+    return CudaNdarray_set_device_data(self, data, (PyObject *) base);
 }

 /**
@@ -475,10 +493,10 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other, boo
 /**
 * Transfer the contents of CudaNdarray `self` to a new numpy ndarray.
 */
-PyObject * 
+PyObject *
 CudaNdarray_CreateArrayObj(CudaNdarray * self);

-PyObject * 
+PyObject *
 CudaNdarray_ZEROS(int n, int * dims);

 /**
@@ -499,7 +517,7 @@ int CudaNdarray_dimshuffle(CudaNdarray * self, unsigned int len, const int * pat
 void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self)
 {
    fprintf(fd, "CudaNdarray <%p, %p> nd=%i dev_structure_fresh=%d data_allocated=%d\n",
-	    self, self->devdata, self->nd, self->dev_structure_fresh, self->data_allocated);
+            self, self->devdata, self->nd, self->dev_structure_fresh, self->data_allocated);
    fprintf(fd, "\tHOST_DIMS:      ");
    for (int i = 0; i < self->nd; ++i)
    {
@@ -510,23 +528,23 @@ void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self)
    {
        fprintf(fd, "%i\t", CudaNdarray_HOST_STRIDES(self)[i]);
    }
-    
+
    int data=0;
    fprintf(fd, "\n\tDEV_DIMS:      ");
    for (int i = 0; i < self->nd; ++i)
    {
        cublasGetVector(1, sizeof(int),
-			self->dev_structure+i, 1,
-			&data, 1);
-	fprintf(fd, "%i\t", data);
+                        self->dev_structure+i, 1,
+                        &data, 1);
+        fprintf(fd, "%i\t", data);
    }
    fprintf(fd, "\n\tDEV_STRIDES: ");
    for (int i = 0; i < self->nd; ++i)
    {
        cublasGetVector(1, sizeof(int),
-			self->dev_structure + self->nd+i, 1,
-			&data, 1);
-	fprintf(fd, "%i \t", data);
+                        self->dev_structure + self->nd+i, 1,
+                        &data, 1);
+        fprintf(fd, "%i \t", data);
    }
    fprintf(fd, "\n");
 }

--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
@@ -6,7 +6,6 @@
 import logging
 _logger = logging.getLogger('theano.tensor.opt')

-import copy
 import operator
 import itertools
 import sys
@@ -574,14 +573,6 @@ class ShapeFeature(object):

        if hasattr(r.type,"broadcastable") and r.type.broadcastable[i]:
            return self.lscalar_one
-        # NOTE: This may cause problems bacause the shape is not asserted
-        #       there is an equivalent mechanism to do this, namely
-        #       specify_shape that one should use
-        # If user provided size
-        #elif ( hasattr(r.tag,'shape') and
-        #      r.tag.shape is not None and
-        #      r.tag.shape[i] is not None):
-        #    return T.constant(copy.copy(r.tag.shape[i]),dtype='int64')
        else:
            return Shape_i(i).make_node(r).outputs[0]

@@ -1101,7 +1092,6 @@ def local_alloc_elemwise(node):
    return [node.op(*new)]

 #TODO, global optimizer that lift the assert to the beginning of the graph.
-#TODO, var.tag.shape to propagate the shape and lower the overhead of this op
 #TODO, when all inputs can be optimized do all except one

 theano.configparser.AddConfigVar('experimental.local_alloc_elemwise',
@@ -2749,14 +2739,8 @@ register_specialize(local_mul_specialize)
 @gof.local_optimizer([T.add])
 def local_add_specialize(node):
    def fill_chain(v):
-        # Not sure why this happens .. but I did not had the time to look
-        # into it, it probably has something to do with the dtype I'm
-        # providing the tag.shape of my variable
        out = _fill_chain(v, node.inputs)
-        if out[0].dtype != node.outputs[0].dtype:
-            return [T.cast(out[0], dtype = node.outputs[0].dtype)]
-        else:
-            return out
+        return out

    #here, we are past the point of canonicalization, so we don't want to put in un-necessary fills.
    if node.op == T.add: