提交 3a4e6c78 authored 作者: lamblin's avatar lamblin

Merge pull request #582 from nouiz/gpu_stuff

Gpu stuff
......@@ -53,6 +53,10 @@ New Features
(Frederic B., Simon McGregor)
* MRG random now raises an error with a clear message when the passed shape
contains dimensions with bad value like 0. (Frédéric B. reported by Ian G.)
* "CudaNdarray[*] = ndarray" works in more cases (Frederic B.)
* "CudaNdarray[*] += ndarray" works in more cases (Frederic B.)
* We add dimensions to CudaNdarray to automatically broadcast more frequently.
(Frederic B.)
Sparse
* Implement theano.sparse.mul(sparse1, sparse2) when both inputs don't
......
......@@ -1026,13 +1026,11 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
for r in node.outputs:
if isinstance(r.type, (TensorType, CudaNdarrayType)):
# Build a C-contiguous buffer
new_buf = numpy.zeros(
shape=r_vals[r].shape,
dtype=r_vals[r].dtype,
order='C')
new_buf += def_val
if isinstance(r.type, CudaNdarrayType):
new_buf = CudaNdarray(new_buf)
new_buf = r.type.value_zeros(r_vals[r].shape)
# CudaNdarray objects do not have a flags field
# assert new_buf.flags["C_CONTIGUOUS"]
new_buf += numpy.asarray(def_val).astype(r.type.dtype)
c_cont_outputs[r] = new_buf
if len(c_cont_outputs):
......@@ -1096,21 +1094,12 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
shapes.append(slice(None, size, None))
r_buf = init_strided[r]
if r_buf.ndim > 0:
r_buf = r_buf[tuple(strides)][tuple(shapes)]
assert r_buf.shape == r_vals[r].shape
if isinstance(r.type, CudaNdarrayType):
# It seems stupid, but we need to allocate a
# new ndarray and copy it into the GPU one.
# TODO: When it is possible to simply do
# r_buff[...] = def_val, do so.
new_rbuf = numpy.zeros(r_vals[r].shape,
dtype=r.dtype)
new_rbuf += def_val
r_buf[...] = CudaNdarray(new_rbuf)
else:
r_buf[...] = def_val
r_buf[...] = numpy.asarray(def_val).astype(r_buf.dtype)
strided[r] = r_buf
......@@ -1133,12 +1122,8 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
out_shape = [max((s + sd), 0)
for s, sd in zip(r_vals[r].shape,
r_shape_diff)]
new_buf = numpy.zeros(
shape=out_shape,
dtype=r.dtype)
new_buf += def_val
if isinstance(r.type, CudaNdarrayType):
new_buf = CudaNdarray(new_buf)
new_buf = r.type.value_zeros(r_vals[r].shape)
new_buf += numpy.asarray(def_val).astype(r.type.dtype)
wrong_size[r] = new_buf
yield (name, wrong_size)
......
......@@ -1923,10 +1923,6 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
ilist_ = tensor.as_tensor_variable(ilist)
assert x_.type.dtype == y_.type.dtype
assert x_.type.ndim == y_.type.ndim
# if (x_.type.ndim - 1) > y_.type.ndim:
# y_ = tensor.shape_padleft(y_, x_.type.ndim - y_.type.ndim)
# assert x_.type.ndim == y_.type.ndim
assert x_.type.ndim >= y_.type.ndim
if ilist_.type.dtype[:3] not in ('int', 'uin'):
......@@ -1941,9 +1937,40 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
return Apply(self, [x_, y_, ilist_], [x_.type()])
#def perform(self, node, inp, out_):
# CudaNdarray_Subscript() don't support Advanced slicing.
# so we use the parent version that loop on each indices.
# CudaNdarray_Subscript() does not support advanced slicing.
# We also cannot reuse the parent version, which loops over each index,
# because we additionally need to loop when set_instead_of_inc is True,
# and the parent does not loop in that case.
def perform(self, node, inp, out_):
# TODO opt to make this inplace
x, y, idx = inp
out, = out_
if not self.inplace:
x = x.copy()
if self.set_instead_of_inc:
# CudaNdarray __setitem__ don't do broadcast nor support
# list of index.
assert y.ndim <= x.ndim # Should be guaranteed by `make_node`
if y.ndim == x.ndim:
assert len(y) == len(idx)
for (j, i) in enumerate(idx):
x[i] = y[j]
else:
for i in idx:
x[i] = y
else:
# If `y` has as many dimensions as `x`, then we want to iterate
# jointly on `x` and `y`. Otherwise, it means `y` should be
# broadcasted to fill all relevant rows of `x`.
assert y.ndim <= x.ndim # Should be guaranteed by `make_node`
if y.ndim == x.ndim:
assert len(y) == len(idx)
for (j, i) in enumerate(idx):
x[i] += y[j]
else:
for i in idx:
x[i] += y
out[0] = x
class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
......
......@@ -663,7 +663,7 @@ PyObject * CudaNdarray_Reshape(CudaNdarray * self, PyObject * shape)
return (PyObject*)rval;
}
PyObject * CudaNdarray_View(CudaNdarray * self)
PyObject * CudaNdarray_View(const CudaNdarray * self)
{
CudaNdarray * rval = (CudaNdarray*)CudaNdarray_New(self->nd);
if (!rval || CudaNdarray_set_device_data(rval, CudaNdarray_DEV_DATA(self), self))
......@@ -985,11 +985,19 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
"CudaNdarray_inplace_elemwise need a CudaNdarray on left");
return -1;
}
CudaNdarray * new_other = NULL;
if (!CudaNdarray_Check(py_other)) {
PyErr_SetString(
PyExc_TypeError,
"CudaNdarray_inplace_elemwise need a CudaNdarray on right");
return -1;
new_other = (CudaNdarray*) CudaNdarray_New();
if(!new_other)
{
return -1;
}
if(CudaNdarray_CopyFromArray(new_other, (PyArrayObject *) py_other))
{
Py_XDECREF(new_other);
return -1;
}
py_other = (PyObject *) new_other;
}
CudaNdarray * self = (CudaNdarray *)py_self;
......@@ -1010,6 +1018,7 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
"CudaNdarray_inplace_elemwise: The destination need more or the"
" same number of dimensions then the source. Got %d and %d.",
self->nd, other->nd);
Py_XDECREF(new_other);
return -1;
}
......@@ -1040,6 +1049,7 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
PyErr_SetString(
PyExc_ValueError,
"CudaNdarray_inplace_elemwise need same dimensions (or broadcastable dimension)");
Py_XDECREF(new_other);
return -1;
}
// if we're broadcasting other, then make sure it has stride 0
......@@ -1050,13 +1060,18 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
if (size==0)
{
if (CudaNdarray_SIZE((CudaNdarray *)py_other))
int other_size = CudaNdarray_SIZE((CudaNdarray *)py_other);
if (!(other_size == 0 || other_size == 1))
{
PyErr_SetString(
PyExc_ValueError,
"CudaNdarray_inplace_elemwise cannot work inplace on an un-initialized array");
"CudaNdarray_inplace_elemwise cannot work inplace on"
" un-initialized array when the new value have more then"
" 0 or 1 broadcastable dimensions");
Py_XDECREF(new_other);
return 0;
}
Py_XDECREF(new_other);
return 0;
}
......@@ -1087,6 +1102,7 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
"Cuda error: %s: %s.\n",
"k3",
cudaGetErrorString(err));
Py_XDECREF(new_other);
return -1;
}
}
......@@ -1119,6 +1135,7 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
"Cuda error: %s: %s.\n",
"k3",
cudaGetErrorString(err));
Py_XDECREF(new_other);
return -1;
}
}
......@@ -1156,6 +1173,7 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
"Cuda error: %s: %s.\n",
"k3",
cudaGetErrorString(err));
Py_XDECREF(new_other);
return -1;
}
}
......@@ -1196,6 +1214,7 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
"Cuda error: %s: %s.\n",
"k3",
cudaGetErrorString(err));
Py_XDECREF(new_other);
return -1;
}
}
......@@ -1240,6 +1259,7 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
"Cuda error: %s: %s.\n",
"k4",
cudaGetErrorString(err));
Py_XDECREF(new_other);
return -1;
}
}
......@@ -1285,6 +1305,7 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
"Cuda error: %s: %s.\n",
"k4",
cudaGetErrorString(err));
Py_XDECREF(new_other);
return -1;
}
}
......@@ -1296,11 +1317,13 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
PyExc_NotImplementedError,
"inplace_elemwise w nd=%i\n",
self->nd);
Py_XDECREF(new_other);
return -1;
}
}
if (verbose)
fprintf(stderr, "INPLACE ADD/DIV end\n");
Py_XDECREF(new_other);
return 0;
}
......@@ -1654,7 +1677,7 @@ CudaNdarray_Subscript(PyObject * py_self, PyObject * key)
// See http://docs.python.org/dev/py3k/c-api/object.html#PyObject_SetItem
// Doesn't handle broadcasting, e.g. a[:] = 5
// Can only be assigned from a CudaNdarray on the right side
// Or a ndarray when the left side part is c contiguous.
// Or a ndarray
// Or a python scalar with value 0 when the left side part is c contiguous.
static int
CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *value)
......@@ -1663,6 +1686,7 @@ CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *value)
if (verbose) fprintf(stderr, "CudaNdarray_setitem start\n");
// We try to copy directly into this CudaNdarray from the ndarray
CudaNdarray* rval = (CudaNdarray*)CudaNdarray_Subscript(o, key);
CudaNdarray* new_value = NULL;
if(!rval){
// CudaNdarray_Subscript failed and set the error msg.
......@@ -1683,7 +1707,10 @@ CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *value)
// This case shouldn't happen, based on what I see in Subscript
// but just in case it happens sometime in the future
PyErr_Format(PyExc_RuntimeError, "__getitem__ must return a CudaNdarray that refers to the original CudaNdarray, not a copy. rval.base=%p o.base=%p o=%p",
PyErr_Format(PyExc_RuntimeError,
"__getitem__ must return a CudaNdarray that refers to"
" the original CudaNdarray, not a copy. rval.base=%p"
" o.base=%p o=%p",
(((CudaNdarray*)rval)->base), ((CudaNdarray*)o)->base, o);
Py_DECREF(rval);
return -1;
......@@ -1691,55 +1718,32 @@ CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *value)
PyObject * intobj = NULL;
if(CudaNdarray_Check(o) && PyArray_Check(value)){
if (verbose) fprintf(stderr, "CudaNdarray_setitem dest is a CudaNdarray and value is a ndarray\n");
int typenum = PyArray_TYPE(value);
if (typenum != REAL_TYPENUM){
PyErr_SetString(PyExc_TypeError, "CudaNdarray.__setitem__: can only copy from float32 arrays");
Py_XDECREF(rval);
return -1;
}
if(! CudaNdarray_is_c_contiguous(rval)){
PyErr_SetString(PyExc_NotImplementedError, "CudaNdarray.__setitem__: When the new value is an ndarray the part where we copy it to must be c contiguous.");
Py_XDECREF(rval);
return -1;
}
if(rval->nd != ((PyArrayObject*)value)->nd){
PyErr_Format(PyExc_NotImplementedError, "CudaNdarray.__setitem__: need same number of dims. destination nd=%d, source nd=%d. broadcasting implemented only for zeroing values from python scalar.",
rval->nd,((PyArrayObject*)value)->nd);
Py_XDECREF(rval);
if (verbose)
fprintf(stderr,
"CudaNdarray_setitem dest is a CudaNdarray and"
" value is a ndarray\n");
new_value = (CudaNdarray*) CudaNdarray_New();
if(!new_value)
{
return -1;
}
for(int i=0 ; i<rval->nd ; i++){
if(CudaNdarray_HOST_DIMS(rval)[i] != ((PyArrayObject*)value)->dimensions[i]){
PyErr_Format(PyExc_ValueError, "CudaNdarray.__setitem__: need same dimensions for dim %d, destination=%d, source=%ld",
i,
CudaNdarray_HOST_DIMS(rval)[i],
(long int)(((PyArrayObject*)value)->dimensions[i]));
if(CudaNdarray_CopyFromArray(new_value, (PyArrayObject *) value))
{
Py_XDECREF(new_value);
Py_XDECREF(rval);
return -1;
}
}
PyArrayObject * py_v = (PyArrayObject*)PyArray_ContiguousFromAny((PyObject*)value, typenum,
rval->nd, rval->nd);
cublasSetVector(PyArray_SIZE(py_v),
sizeof(real),
PyArray_DATA(py_v), 1,
rval->devdata, 1);
CNDA_THREAD_SYNC;
Py_XDECREF(py_v);
Py_XDECREF(rval);
if (CUBLAS_STATUS_SUCCESS != cublasGetError()){
PyErr_SetString(PyExc_RuntimeError, "CudaNdarray.__setitem__: error copying ndarray data to device memory");
return -1;
}
return 0;
value = (PyObject *) new_value;
}
else if ((intobj=PyNumber_Int(value)))
{
if (verbose) fprintf(stderr, "CudaNdarray_setitem dest and value is a python number\n");
if (verbose)
fprintf(stderr,
"CudaNdarray_setitem dest and value is a python number\n");
if(! CudaNdarray_is_c_contiguous(rval)){
PyErr_SetString(PyExc_NotImplementedError,
"CudaNdarray.__setitem__: When the new value is a scalar of value 0 the part where we copy to must be c contiguous.");
"CudaNdarray.__setitem__: When the new value is a scalar"
" of value 0 the part where we copy to must be c contiguous.");
Py_XDECREF(rval);
return -1;
}
......@@ -1748,7 +1752,8 @@ CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *value)
Py_DECREF(intobj); intobj=NULL;
if (val == 0)
{
cudaError_t err = cudaMemset(rval->devdata, 0, CudaNdarray_SIZE(rval) * sizeof(real));
cudaError_t err = cudaMemset(rval->devdata, 0,
CudaNdarray_SIZE(rval) * sizeof(real));
Py_XDECREF(rval);
if (err)
{
......@@ -1760,7 +1765,8 @@ CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *value)
} else {
Py_XDECREF(rval);
PyErr_SetString(PyExc_NotImplementedError,
"CudaNdarray.__setitem__: we support setting only python scalar of value 0, numpy nd array and CudaNdarray.");
"CudaNdarray.__setitem__: we support setting only python"
" scalar of value 0, numpy nd array and CudaNdarray.");
return -1;
}
}
......@@ -1769,16 +1775,25 @@ CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *value)
if(!CudaNdarray_Check(o) || !CudaNdarray_Check(value))
{
PyErr_SetString(PyExc_TypeError, "CudaNdarray.__setitem__: left must be a CudaNdarrays and right must be a CudaNdarrays, an ndarray or a python scalar of value 0.");
PyErr_SetString(PyExc_TypeError,
"CudaNdarray.__setitem__: left must be a CudaNdarrays and right"
" must be a CudaNdarrays, an ndarray or a python scalar of value 0.");
Py_XDECREF(new_value);
return -1;
}
if (verbose) fprintf(stderr, "CudaNdarray_setitem dest and value are CudaNdarray\n");
if (verbose)
fprintf(stderr, "CudaNdarray_setitem dest and value are CudaNdarray\n");
if (cnda_copy_structure_to_device(rval))
{
PyErr_SetString(PyExc_RuntimeError, "CudaNdarray.__setitem__: syncing structure to device failed");
PyErr_SetString(PyExc_RuntimeError,
"CudaNdarray.__setitem__: syncing structure to device failed");
Py_DECREF(rval);
if (verbose) fprintf(stderr, "CudaNdarray_setitem error end\n");
Py_XDECREF(new_value);
if (verbose)
fprintf(stderr, "CudaNdarray_setitem error end\n");
return -1;
}
......@@ -1787,7 +1802,10 @@ CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *value)
if(CudaNdarray_CopyFromCudaNdarray(rval, (CudaNdarray*)value, true))
{
Py_DECREF((PyObject*)rval);
if (verbose) fprintf(stderr, "CudaNdarray_setitem error end\n");
Py_XDECREF(new_value);
if (verbose)
fprintf(stderr, "CudaNdarray_setitem error end\n");
return -1;
}
......@@ -1796,6 +1814,7 @@ CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *value)
// Clean up locally-created references
Py_DECREF(rval);
Py_XDECREF(new_value);
return 0;
}
......@@ -2759,7 +2778,9 @@ static __global__ void k_copy_1d(const int N, const float * x, const int sx, flo
}
//copy from other into self
int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * other, bool unbroadcast)
int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self,
const CudaNdarray * other,
bool unbroadcast)
{
int verbose = 0;
if (verbose>1) fprintf(stderr, "CudaNdarray_CopyFromCudaNdarray\n");
......@@ -2771,15 +2792,29 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * othe
"can't copy into un-initialized CudaNdarray");
return -1;
}
if (self->nd != other->nd)
CudaNdarray * new_other = NULL;
if (self->nd < other->nd)
{
PyErr_Format(PyExc_NotImplementedError,
"CudaNdarray_CopyFromCudaNdarray: need same number of"
" dims. destination nd=%d, source nd=%d."
" No broadcasting implemented.",
"CudaNdarray_CopyFromCudaNdarray: The destination need more or the"
" same number of dimensions then the source. Got %d and %d.",
self->nd, other->nd);
return -1;
}
else if (self->nd != other->nd)
{
new_other = (CudaNdarray *) CudaNdarray_View(other);
int added_dims = self->nd - other->nd;
int pattern[self->nd];
for(int i = 0; i < added_dims; i++)
pattern[i] = -1;
for(int i = 0; i < other->nd; i++)
pattern[i + added_dims] = i;
CudaNdarray_dimshuffle(new_other, self->nd, pattern);
other = new_other;
}
assert(self->nd == other->nd);
//standard elemwise dim checks (also compute total size)
unsigned int size = 1;
unsigned int size_source = 1;
......@@ -2793,13 +2828,15 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * othe
" destination=%d, source=%d",
i, CudaNdarray_HOST_DIMS(self)[i],
CudaNdarray_HOST_DIMS(other)[i]);
return -1;
Py_XDECREF(new_other);
return -1;
}
size *= (unsigned int) CudaNdarray_HOST_DIMS(self)[i];
size_source *= (unsigned int) CudaNdarray_HOST_DIMS(other)[i];
}
if (0 == size)
{
Py_XDECREF(new_other);
return 0; //nothing to copy, we're done.
}
if (CudaNdarray_is_c_contiguous(self) &&
......@@ -2812,6 +2849,7 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * othe
cublasScopy(size, CudaNdarray_DEV_DATA(other), 1,
CudaNdarray_DEV_DATA(self), 1);
CNDA_THREAD_SYNC;
Py_XDECREF(new_other);
if (CUBLAS_STATUS_SUCCESS != cublasGetError())
{
PyErr_SetString(PyExc_RuntimeError, "Error copying memory");
......@@ -2849,6 +2887,7 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * othe
"Cuda error: %s: %s. (n_blocks=%i,"
" n_threads_per_block=%i)\n", "k_copy_1d",
cudaGetErrorString(err), n_blocks, n_threads);
Py_XDECREF(new_other);
return -1;
}
}; break;
......@@ -2893,10 +2932,12 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * othe
"k_elemwise_unary_rowmajor_copy",
cudaGetErrorString(err), n_blocks,
threads_per_block);
Py_XDECREF(new_other);
return -1;
}
}
};
Py_XDECREF(new_other);
return 0;
}
......@@ -4088,7 +4129,7 @@ int CudaNdarray_set_nd(CudaNdarray * self, const int nd)
return 0;
}
int CudaNdarray_set_device_data(CudaNdarray * self, float * data, CudaNdarray * base)
int CudaNdarray_set_device_data(CudaNdarray * self, float * data, const CudaNdarray * base)
{
return CudaNdarray_set_device_data(self, data, (PyObject *) base);
}
......
......@@ -282,7 +282,7 @@ static PyObject *CudaNdarray_NewDims(int nd, const inttype * dims)
* Set self to be a view of given `data`, owned by existing CudaNdarray `base`.
*/
DllExport int CudaNdarray_set_device_data(CudaNdarray * self, float * data, PyObject * base);
DllExport int CudaNdarray_set_device_data(CudaNdarray * self, float * data, CudaNdarray * base);
DllExport int CudaNdarray_set_device_data(CudaNdarray * self, float * data, const CudaNdarray * base);
/**
* Return an independent copy of self
......
......@@ -765,8 +765,6 @@ def local_gpu_advanced_incsubtensor1(node):
'either set the `warn.gpu_set_subtensor1` config '
'option to False, or `warn.ignore_bug_before` to at '
'least \'0.6\'.', stacklevel=1)
if set_instead_of_inc:
return
gpu_op = GpuAdvancedIncSubtensor1(
set_instead_of_inc=set_instead_of_inc)
......@@ -799,8 +797,7 @@ def local_gpu_advanced_incsubtensor1(node):
'either set the `warn.gpu_set_subtensor1` config '
'option to False, or `warn.ignore_bug_before` to at '
'least \'0.6\'.', stacklevel=1)
if set_instead_of_inc:
return
gpu_op = GpuAdvancedIncSubtensor1(
set_instead_of_inc=set_instead_of_inc)
return [host_from_gpu(gpu_op(gpu_x, gpu_y, *coords))]
......
......@@ -630,13 +630,9 @@ def test_setitem_matrixvector1():
assert numpy.allclose(a,numpy.asarray(_a))
#test direct transfert from numpy
try:
_a[:,1] = b*100
a[:,1] = b*100
raise Exception("CudaNdarray.__setitem__ should have returned an error")
assert numpy.allclose(a,numpy.asarray(_a))
except NotImplementedError, e:
pass
_a[:,1] = b*100
a[:,1] = b*100
assert numpy.allclose(a,numpy.asarray(_a))
row = theano._asarray([777,888,999], dtype='float32')
_a[1,:] = row
......@@ -659,13 +655,9 @@ def test_setitem_matrix_tensor3():
assert numpy.allclose(a,numpy.asarray(_a))
#test direct transfert from numpy
try:
_a[:,1,1] = b*100
a[:,1,1] = b*100
raise Exception("CudaNdarray.__setitem__ should have returned an error")
assert numpy.allclose(a,numpy.asarray(_a))
except NotImplementedError:
pass
_a[:,1,1] = b*100
a[:,1,1] = b*100
assert numpy.allclose(a,numpy.asarray(_a))
row = theano._asarray([777,888,999], dtype='float32')
_a[1,1,:] = row
......@@ -714,7 +706,7 @@ def test_setitem_matrix_bad_ndim():
# attempt to assign the ndarray b with setitem
_a[:,:,1] = _b
assert False
except NotImplementedError, e:
except ValueError, e:
#print e
assert True
......@@ -723,7 +715,7 @@ def test_setitem_matrix_bad_ndim():
# attempt to assign the ndarray b with setitem
_a[1,:,:] = b
assert False
except NotImplementedError, e:
except ValueError, e:
#print e
assert True
......@@ -806,7 +798,7 @@ def test_setitem_broadcast():
a[:,:,1] = b.reshape((1,3))
assert numpy.allclose(numpy.asarray(_a),a)
#This is not supported for now.
def test_setitem_broadcast_numpy():
#test scalar to vector without stride
a = numpy.arange(3)
......@@ -814,73 +806,81 @@ def test_setitem_broadcast_numpy():
_a = cuda_ndarray.CudaNdarray(a)
b = theano._asarray(9, dtype='float32')
try:
_a[:] = b.reshape((1,))
a[:] = b.reshape((1,))
assert False
assert numpy.allclose(numpy.asarray(_a),a)
except ValueError:
pass
_a[:] = b.reshape((1,))
a[:] = b.reshape((1,))
assert numpy.allclose(numpy.asarray(_a), a)
#test vector to matrice without stride
a = numpy.arange(9)
a.resize((3,3))
a.resize((3, 3))
a = theano._asarray(a, dtype='float32')
_a = cuda_ndarray.CudaNdarray(a)
try:
b = theano._asarray([7,8,9], dtype='float32')
_a[:,:] = b.reshape((1,3))
a[:,:] = b.reshape((1,3))
assert False
assert numpy.allclose(numpy.asarray(_a),a)
except ValueError:
pass
b = theano._asarray([7, 8, 9], dtype='float32')
_a[:, :] = b.reshape((1, 3))
a[:, :] = b.reshape((1, 3))
assert numpy.allclose(numpy.asarray(_a), a)
#test vector to matrice with stride
a = numpy.arange(27)
a.resize((3,3,3))
a.resize((3, 3, 3))
a = theano._asarray(a, dtype='float32')
_a = cuda_ndarray.CudaNdarray(a)
try:
b = theano._asarray([[7,8,9],[10,11,12]], dtype='float32')
b = b[0]
_a[1,:,:] = b.reshape((1,3))
a[1,:,:] = b.reshape((1,3))
assert False
assert numpy.allclose(numpy.asarray(_a),a)
except ValueError:
pass
b = theano._asarray([[7, 8, 9], [10, 11, 12]], dtype='float32')
b = b[0]
_a[1, :, :] = b.reshape((1, 3))
a[1, :, :] = b.reshape((1, 3))
assert numpy.allclose(numpy.asarray(_a), a)
# this also fails for the moment
def test_setitem_rightvalue_ndarray_fails():
"""
Now we don't automatically add dimensions to broadcast
"""
a = numpy.arange(27)
a.resize((3,3,3))
a = numpy.arange(3 * 4 * 5)
a.resize((3, 4, 5))
a = theano._asarray(a, dtype='float32')
_a = cuda_ndarray.CudaNdarray(a)
b = theano._asarray([7,8,9], dtype='float32')
b = theano._asarray([7, 8, 9, 10], dtype='float32')
_b = cuda_ndarray.CudaNdarray(b)
b5 = theano._asarray([7, 8, 9, 10, 11], dtype='float32')
_b5 = cuda_ndarray.CudaNdarray(b)
# attempt to assign the ndarray b with setitem
_a[:, :, 1] = _b
a[:, :, 1] = b
assert numpy.allclose(numpy.asarray(_a), a)
#test direct transfert from numpy to contiguous region
# attempt to assign the ndarray b with setitem
# same number of dim
mat = numpy.random.rand(4, 5).astype('float32')
_a[2, :, :] = mat
a[2, :, :] = mat
assert numpy.allclose(numpy.asarray(_a), a)
# without same number of dim
try:
# attempt to assign the ndarray b with setitem
_a[:,:,1] = _b
assert False
except NotImplementedError, e:
#print e
assert True
_a[0, :, :] = mat
#a[0, :, :] = mat
#assert numpy.allclose(numpy.asarray(_a), a)
except ValueError, e:
pass
#test direct transfert from numpy with broadcast
_a[0, :, :] = b5
a[0, :, :] = b5
assert numpy.allclose(numpy.asarray(_a), a)
#test direct transfert from numpy to not contiguous region
# attempt to assign the ndarray b with setitem
_a[:, :, 2] = b
a[:, :, 2] = b
assert numpy.allclose(numpy.asarray(_a), a)
#test direct transfert from numpy
try:
# attempt to assign the ndarray b with setitem
_a[:,:,1] = b
assert False
except NotImplementedError, e:
#print e
assert True
def test_zeros_basic():
for shp in [(3,4,5), (300,), (), (0,7)]:
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论