提交 3a4e6c78 authored 作者: lamblin's avatar lamblin

Merge pull request #582 from nouiz/gpu_stuff

Gpu stuff
...@@ -53,6 +53,10 @@ New Features ...@@ -53,6 +53,10 @@ New Features
(Frederic B., Simon McGregor) (Frederic B., Simon McGregor)
* MRG random now raises an error with a clear message when the passed shape * MRG random now raises an error with a clear message when the passed shape
contains dimensions with bad value like 0. (Frédéric B. reported by Ian G.) contains dimensions with bad value like 0. (Frédéric B. reported by Ian G.)
* "CudaNdarray[*] = ndarray" work in more case (Frederic B.)
* "CudaNdarray[*] += ndarray" work in more case (Frederic B.)
* We add dimensions to CudaNdarray to automatically broadcast more frequently.
(Frederic B.)
Sparse Sparse
* Implement theano.sparse.mul(sparse1, sparse2) when both inputs don't * Implement theano.sparse.mul(sparse1, sparse2) when both inputs don't
......
...@@ -1026,13 +1026,11 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val, ...@@ -1026,13 +1026,11 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
for r in node.outputs: for r in node.outputs:
if isinstance(r.type, (TensorType, CudaNdarrayType)): if isinstance(r.type, (TensorType, CudaNdarrayType)):
# Build a C-contiguous buffer # Build a C-contiguous buffer
new_buf = numpy.zeros( new_buf = r.type.value_zeros(r_vals[r].shape)
shape=r_vals[r].shape, # CudaNdarray don't have flags field
dtype=r_vals[r].dtype, # assert new_buf.flags["C_CONTIGUOUS"]
order='C') new_buf += numpy.asarray(def_val).astype(r.type.dtype)
new_buf += def_val
if isinstance(r.type, CudaNdarrayType):
new_buf = CudaNdarray(new_buf)
c_cont_outputs[r] = new_buf c_cont_outputs[r] = new_buf
if len(c_cont_outputs): if len(c_cont_outputs):
...@@ -1096,21 +1094,12 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val, ...@@ -1096,21 +1094,12 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
shapes.append(slice(None, size, None)) shapes.append(slice(None, size, None))
r_buf = init_strided[r] r_buf = init_strided[r]
if r_buf.ndim > 0: if r_buf.ndim > 0:
r_buf = r_buf[tuple(strides)][tuple(shapes)] r_buf = r_buf[tuple(strides)][tuple(shapes)]
assert r_buf.shape == r_vals[r].shape assert r_buf.shape == r_vals[r].shape
if isinstance(r.type, CudaNdarrayType): r_buf[...] = numpy.asarray(def_val).astype(r_buf.dtype)
# It seems stupid, but we need to allocate a
# new ndarray and copy it into the GPU one.
# TODO: When it is possible to simply do
# r_buff[...] = def_val, do so.
new_rbuf = numpy.zeros(r_vals[r].shape,
dtype=r.dtype)
new_rbuf += def_val
r_buf[...] = CudaNdarray(new_rbuf)
else:
r_buf[...] = def_val
strided[r] = r_buf strided[r] = r_buf
...@@ -1133,12 +1122,8 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val, ...@@ -1133,12 +1122,8 @@ def _get_preallocated_maps(node, thunk, prealloc_modes, def_val,
out_shape = [max((s + sd), 0) out_shape = [max((s + sd), 0)
for s, sd in zip(r_vals[r].shape, for s, sd in zip(r_vals[r].shape,
r_shape_diff)] r_shape_diff)]
new_buf = numpy.zeros( new_buf = r.type.value_zeros(r_vals[r].shape)
shape=out_shape, new_buf += numpy.asarray(def_val).astype(r.type.dtype)
dtype=r.dtype)
new_buf += def_val
if isinstance(r.type, CudaNdarrayType):
new_buf = CudaNdarray(new_buf)
wrong_size[r] = new_buf wrong_size[r] = new_buf
yield (name, wrong_size) yield (name, wrong_size)
......
...@@ -1923,10 +1923,6 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp): ...@@ -1923,10 +1923,6 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
ilist_ = tensor.as_tensor_variable(ilist) ilist_ = tensor.as_tensor_variable(ilist)
assert x_.type.dtype == y_.type.dtype assert x_.type.dtype == y_.type.dtype
assert x_.type.ndim == y_.type.ndim
# if (x_.type.ndim - 1) > y_.type.ndim:
# y_ = tensor.shape_padleft(y_, x_.type.ndim - y_.type.ndim)
# assert x_.type.ndim == y_.type.ndim
assert x_.type.ndim >= y_.type.ndim assert x_.type.ndim >= y_.type.ndim
if ilist_.type.dtype[:3] not in ('int', 'uin'): if ilist_.type.dtype[:3] not in ('int', 'uin'):
...@@ -1941,9 +1937,40 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp): ...@@ -1941,9 +1937,40 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
return Apply(self, [x_, y_, ilist_], [x_.type()]) return Apply(self, [x_, y_, ilist_], [x_.type()])
#def perform(self, node, inp, out_): # CudaNdarray_Subscript() don't support Advanced slicing.
# CudaNdarray_Subscript() don't support Advanced slicing. # But we can't use the parent version that loop on each indices
# so we use the parent version that loop on each indices. # as we also need to loop when set_instead_of_inc is True and the
# parent don't loop in that case.
def perform(self, node, inp, out_):
# TODO opt to make this inplace
x, y, idx = inp
out, = out_
if not self.inplace:
x = x.copy()
if self.set_instead_of_inc:
# CudaNdarray __setitem__ don't do broadcast nor support
# list of index.
assert y.ndim <= x.ndim # Should be guaranteed by `make_node`
if y.ndim == x.ndim:
assert len(y) == len(idx)
for (j, i) in enumerate(idx):
x[i] = y[j]
else:
for i in idx:
x[i] = y
else:
# If `y` has as many dimensions as `x`, then we want to iterate
# jointly on `x` and `y`. Otherwise, it means `y` should be
# broadcasted to fill all relevant rows of `x`.
assert y.ndim <= x.ndim # Should be guaranteed by `make_node`
if y.ndim == x.ndim:
assert len(y) == len(idx)
for (j, i) in enumerate(idx):
x[i] += y[j]
else:
for i in idx:
x[i] += y
out[0] = x
class GpuIncSubtensor(tensor.IncSubtensor, GpuOp): class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
......
...@@ -663,7 +663,7 @@ PyObject * CudaNdarray_Reshape(CudaNdarray * self, PyObject * shape) ...@@ -663,7 +663,7 @@ PyObject * CudaNdarray_Reshape(CudaNdarray * self, PyObject * shape)
return (PyObject*)rval; return (PyObject*)rval;
} }
PyObject * CudaNdarray_View(CudaNdarray * self) PyObject * CudaNdarray_View(const CudaNdarray * self)
{ {
CudaNdarray * rval = (CudaNdarray*)CudaNdarray_New(self->nd); CudaNdarray * rval = (CudaNdarray*)CudaNdarray_New(self->nd);
if (!rval || CudaNdarray_set_device_data(rval, CudaNdarray_DEV_DATA(self), self)) if (!rval || CudaNdarray_set_device_data(rval, CudaNdarray_DEV_DATA(self), self))
...@@ -985,11 +985,19 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t ...@@ -985,11 +985,19 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
"CudaNdarray_inplace_elemwise need a CudaNdarray on left"); "CudaNdarray_inplace_elemwise need a CudaNdarray on left");
return -1; return -1;
} }
CudaNdarray * new_other = NULL;
if (!CudaNdarray_Check(py_other)) { if (!CudaNdarray_Check(py_other)) {
PyErr_SetString( new_other = (CudaNdarray*) CudaNdarray_New();
PyExc_TypeError, if(!new_other)
"CudaNdarray_inplace_elemwise need a CudaNdarray on right"); {
return -1; return -1;
}
if(CudaNdarray_CopyFromArray(new_other, (PyArrayObject *) py_other))
{
Py_XDECREF(new_other);
return -1;
}
py_other = (PyObject *) new_other;
} }
CudaNdarray * self = (CudaNdarray *)py_self; CudaNdarray * self = (CudaNdarray *)py_self;
...@@ -1010,6 +1018,7 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t ...@@ -1010,6 +1018,7 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
"CudaNdarray_inplace_elemwise: The destination need more or the" "CudaNdarray_inplace_elemwise: The destination need more or the"
" same number of dimensions then the source. Got %d and %d.", " same number of dimensions then the source. Got %d and %d.",
self->nd, other->nd); self->nd, other->nd);
Py_XDECREF(new_other);
return -1; return -1;
} }
...@@ -1040,6 +1049,7 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t ...@@ -1040,6 +1049,7 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
PyErr_SetString( PyErr_SetString(
PyExc_ValueError, PyExc_ValueError,
"CudaNdarray_inplace_elemwise need same dimensions (or broadcastable dimension)"); "CudaNdarray_inplace_elemwise need same dimensions (or broadcastable dimension)");
Py_XDECREF(new_other);
return -1; return -1;
} }
// if we're broadcasting other, then make sure it has stride 0 // if we're broadcasting other, then make sure it has stride 0
...@@ -1050,13 +1060,18 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t ...@@ -1050,13 +1060,18 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
if (size==0) if (size==0)
{ {
if (CudaNdarray_SIZE((CudaNdarray *)py_other)) int other_size = CudaNdarray_SIZE((CudaNdarray *)py_other);
if (!(other_size == 0 || other_size == 1))
{ {
PyErr_SetString( PyErr_SetString(
PyExc_ValueError, PyExc_ValueError,
"CudaNdarray_inplace_elemwise cannot work inplace on an un-initialized array"); "CudaNdarray_inplace_elemwise cannot work inplace on"
" un-initialized array when the new value have more then"
" 0 or 1 broadcastable dimensions");
Py_XDECREF(new_other);
return 0; return 0;
} }
Py_XDECREF(new_other);
return 0; return 0;
} }
...@@ -1087,6 +1102,7 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t ...@@ -1087,6 +1102,7 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
"Cuda error: %s: %s.\n", "Cuda error: %s: %s.\n",
"k3", "k3",
cudaGetErrorString(err)); cudaGetErrorString(err));
Py_XDECREF(new_other);
return -1; return -1;
} }
} }
...@@ -1119,6 +1135,7 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t ...@@ -1119,6 +1135,7 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
"Cuda error: %s: %s.\n", "Cuda error: %s: %s.\n",
"k3", "k3",
cudaGetErrorString(err)); cudaGetErrorString(err));
Py_XDECREF(new_other);
return -1; return -1;
} }
} }
...@@ -1156,6 +1173,7 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t ...@@ -1156,6 +1173,7 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
"Cuda error: %s: %s.\n", "Cuda error: %s: %s.\n",
"k3", "k3",
cudaGetErrorString(err)); cudaGetErrorString(err));
Py_XDECREF(new_other);
return -1; return -1;
} }
} }
...@@ -1196,6 +1214,7 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t ...@@ -1196,6 +1214,7 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
"Cuda error: %s: %s.\n", "Cuda error: %s: %s.\n",
"k3", "k3",
cudaGetErrorString(err)); cudaGetErrorString(err));
Py_XDECREF(new_other);
return -1; return -1;
} }
} }
...@@ -1240,6 +1259,7 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t ...@@ -1240,6 +1259,7 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
"Cuda error: %s: %s.\n", "Cuda error: %s: %s.\n",
"k4", "k4",
cudaGetErrorString(err)); cudaGetErrorString(err));
Py_XDECREF(new_other);
return -1; return -1;
} }
} }
...@@ -1285,6 +1305,7 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t ...@@ -1285,6 +1305,7 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
"Cuda error: %s: %s.\n", "Cuda error: %s: %s.\n",
"k4", "k4",
cudaGetErrorString(err)); cudaGetErrorString(err));
Py_XDECREF(new_other);
return -1; return -1;
} }
} }
...@@ -1296,11 +1317,13 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t ...@@ -1296,11 +1317,13 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
PyExc_NotImplementedError, PyExc_NotImplementedError,
"inplace_elemwise w nd=%i\n", "inplace_elemwise w nd=%i\n",
self->nd); self->nd);
Py_XDECREF(new_other);
return -1; return -1;
} }
} }
if (verbose) if (verbose)
fprintf(stderr, "INPLACE ADD/DIV end\n"); fprintf(stderr, "INPLACE ADD/DIV end\n");
Py_XDECREF(new_other);
return 0; return 0;
} }
...@@ -1654,7 +1677,7 @@ CudaNdarray_Subscript(PyObject * py_self, PyObject * key) ...@@ -1654,7 +1677,7 @@ CudaNdarray_Subscript(PyObject * py_self, PyObject * key)
// See http://docs.python.org/dev/py3k/c-api/object.html#PyObject_SetItem // See http://docs.python.org/dev/py3k/c-api/object.html#PyObject_SetItem
// Doesn't handle broadcasting, e.g. a[:] = 5 // Doesn't handle broadcasting, e.g. a[:] = 5
// Can only be assigned from a CudaNdarray on the right side // Can only be assigned from a CudaNdarray on the right side
// Or a ndarray when the left side part is c contiguous. // Or a ndarray
// Or a python scalar with value 0 when the left side part is c contiguous. // Or a python scalar with value 0 when the left side part is c contiguous.
static int static int
CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *value) CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *value)
...@@ -1663,6 +1686,7 @@ CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *value) ...@@ -1663,6 +1686,7 @@ CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *value)
if (verbose) fprintf(stderr, "CudaNdarray_setitem start\n"); if (verbose) fprintf(stderr, "CudaNdarray_setitem start\n");
// We try to copy directly into this CudaNdarray from the ndarray // We try to copy directly into this CudaNdarray from the ndarray
CudaNdarray* rval = (CudaNdarray*)CudaNdarray_Subscript(o, key); CudaNdarray* rval = (CudaNdarray*)CudaNdarray_Subscript(o, key);
CudaNdarray* new_value = NULL;
if(!rval){ if(!rval){
// CudaNdarray_Subscript failed and set the error msg. // CudaNdarray_Subscript failed and set the error msg.
...@@ -1683,7 +1707,10 @@ CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *value) ...@@ -1683,7 +1707,10 @@ CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *value)
// This case shouldn't happen, based on what I see in Subscript // This case shouldn't happen, based on what I see in Subscript
// but just in case it happens sometime in the future // but just in case it happens sometime in the future
PyErr_Format(PyExc_RuntimeError, "__getitem__ must return a CudaNdarray that refers to the original CudaNdarray, not a copy. rval.base=%p o.base=%p o=%p", PyErr_Format(PyExc_RuntimeError,
"__getitem__ must return a CudaNdarray that refers to"
" the original CudaNdarray, not a copy. rval.base=%p"
" o.base=%p o=%p",
(((CudaNdarray*)rval)->base), ((CudaNdarray*)o)->base, o); (((CudaNdarray*)rval)->base), ((CudaNdarray*)o)->base, o);
Py_DECREF(rval); Py_DECREF(rval);
return -1; return -1;
...@@ -1691,55 +1718,32 @@ CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *value) ...@@ -1691,55 +1718,32 @@ CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *value)
PyObject * intobj = NULL; PyObject * intobj = NULL;
if(CudaNdarray_Check(o) && PyArray_Check(value)){ if(CudaNdarray_Check(o) && PyArray_Check(value)){
if (verbose) fprintf(stderr, "CudaNdarray_setitem dest is a CudaNdarray and value is a ndarray\n"); if (verbose)
int typenum = PyArray_TYPE(value); fprintf(stderr,
if (typenum != REAL_TYPENUM){ "CudaNdarray_setitem dest is a CudaNdarray and"
PyErr_SetString(PyExc_TypeError, "CudaNdarray.__setitem__: can only copy from float32 arrays"); " value is a ndarray\n");
Py_XDECREF(rval); new_value = (CudaNdarray*) CudaNdarray_New();
return -1; if(!new_value)
} {
if(! CudaNdarray_is_c_contiguous(rval)){
PyErr_SetString(PyExc_NotImplementedError, "CudaNdarray.__setitem__: When the new value is an ndarray the part where we copy it to must be c contiguous.");
Py_XDECREF(rval);
return -1;
}
if(rval->nd != ((PyArrayObject*)value)->nd){
PyErr_Format(PyExc_NotImplementedError, "CudaNdarray.__setitem__: need same number of dims. destination nd=%d, source nd=%d. broadcasting implemented only for zeroing values from python scalar.",
rval->nd,((PyArrayObject*)value)->nd);
Py_XDECREF(rval);
return -1; return -1;
} }
for(int i=0 ; i<rval->nd ; i++){ if(CudaNdarray_CopyFromArray(new_value, (PyArrayObject *) value))
if(CudaNdarray_HOST_DIMS(rval)[i] != ((PyArrayObject*)value)->dimensions[i]){ {
PyErr_Format(PyExc_ValueError, "CudaNdarray.__setitem__: need same dimensions for dim %d, destination=%d, source=%ld", Py_XDECREF(new_value);
i,
CudaNdarray_HOST_DIMS(rval)[i],
(long int)(((PyArrayObject*)value)->dimensions[i]));
Py_XDECREF(rval); Py_XDECREF(rval);
return -1; return -1;
}
} }
PyArrayObject * py_v = (PyArrayObject*)PyArray_ContiguousFromAny((PyObject*)value, typenum, value = (PyObject *) new_value;
rval->nd, rval->nd);
cublasSetVector(PyArray_SIZE(py_v),
sizeof(real),
PyArray_DATA(py_v), 1,
rval->devdata, 1);
CNDA_THREAD_SYNC;
Py_XDECREF(py_v);
Py_XDECREF(rval);
if (CUBLAS_STATUS_SUCCESS != cublasGetError()){
PyErr_SetString(PyExc_RuntimeError, "CudaNdarray.__setitem__: error copying ndarray data to device memory");
return -1;
}
return 0;
} }
else if ((intobj=PyNumber_Int(value))) else if ((intobj=PyNumber_Int(value)))
{ {
if (verbose) fprintf(stderr, "CudaNdarray_setitem dest and value is a python number\n"); if (verbose)
fprintf(stderr,
"CudaNdarray_setitem dest and value is a python number\n");
if(! CudaNdarray_is_c_contiguous(rval)){ if(! CudaNdarray_is_c_contiguous(rval)){
PyErr_SetString(PyExc_NotImplementedError, PyErr_SetString(PyExc_NotImplementedError,
"CudaNdarray.__setitem__: When the new value is a scalar of value 0 the part where we copy to must be c contiguous."); "CudaNdarray.__setitem__: When the new value is a scalar"
" of value 0 the part where we copy to must be c contiguous.");
Py_XDECREF(rval); Py_XDECREF(rval);
return -1; return -1;
} }
...@@ -1748,7 +1752,8 @@ CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *value) ...@@ -1748,7 +1752,8 @@ CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *value)
Py_DECREF(intobj); intobj=NULL; Py_DECREF(intobj); intobj=NULL;
if (val == 0) if (val == 0)
{ {
cudaError_t err = cudaMemset(rval->devdata, 0, CudaNdarray_SIZE(rval) * sizeof(real)); cudaError_t err = cudaMemset(rval->devdata, 0,
CudaNdarray_SIZE(rval) * sizeof(real));
Py_XDECREF(rval); Py_XDECREF(rval);
if (err) if (err)
{ {
...@@ -1760,7 +1765,8 @@ CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *value) ...@@ -1760,7 +1765,8 @@ CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *value)
} else { } else {
Py_XDECREF(rval); Py_XDECREF(rval);
PyErr_SetString(PyExc_NotImplementedError, PyErr_SetString(PyExc_NotImplementedError,
"CudaNdarray.__setitem__: we support setting only python scalar of value 0, numpy nd array and CudaNdarray."); "CudaNdarray.__setitem__: we support setting only python"
" scalar of value 0, numpy nd array and CudaNdarray.");
return -1; return -1;
} }
} }
...@@ -1769,16 +1775,25 @@ CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *value) ...@@ -1769,16 +1775,25 @@ CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *value)
if(!CudaNdarray_Check(o) || !CudaNdarray_Check(value)) if(!CudaNdarray_Check(o) || !CudaNdarray_Check(value))
{ {
PyErr_SetString(PyExc_TypeError, "CudaNdarray.__setitem__: left must be a CudaNdarrays and right must be a CudaNdarrays, an ndarray or a python scalar of value 0."); PyErr_SetString(PyExc_TypeError,
"CudaNdarray.__setitem__: left must be a CudaNdarrays and right"
" must be a CudaNdarrays, an ndarray or a python scalar of value 0.");
Py_XDECREF(new_value);
return -1; return -1;
} }
if (verbose) fprintf(stderr, "CudaNdarray_setitem dest and value are CudaNdarray\n"); if (verbose)
fprintf(stderr, "CudaNdarray_setitem dest and value are CudaNdarray\n");
if (cnda_copy_structure_to_device(rval)) if (cnda_copy_structure_to_device(rval))
{ {
PyErr_SetString(PyExc_RuntimeError, "CudaNdarray.__setitem__: syncing structure to device failed"); PyErr_SetString(PyExc_RuntimeError,
"CudaNdarray.__setitem__: syncing structure to device failed");
Py_DECREF(rval); Py_DECREF(rval);
if (verbose) fprintf(stderr, "CudaNdarray_setitem error end\n"); Py_XDECREF(new_value);
if (verbose)
fprintf(stderr, "CudaNdarray_setitem error end\n");
return -1; return -1;
} }
...@@ -1787,7 +1802,10 @@ CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *value) ...@@ -1787,7 +1802,10 @@ CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *value)
if(CudaNdarray_CopyFromCudaNdarray(rval, (CudaNdarray*)value, true)) if(CudaNdarray_CopyFromCudaNdarray(rval, (CudaNdarray*)value, true))
{ {
Py_DECREF((PyObject*)rval); Py_DECREF((PyObject*)rval);
if (verbose) fprintf(stderr, "CudaNdarray_setitem error end\n"); Py_XDECREF(new_value);
if (verbose)
fprintf(stderr, "CudaNdarray_setitem error end\n");
return -1; return -1;
} }
...@@ -1796,6 +1814,7 @@ CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *value) ...@@ -1796,6 +1814,7 @@ CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *value)
// Clean up locally-created references // Clean up locally-created references
Py_DECREF(rval); Py_DECREF(rval);
Py_XDECREF(new_value);
return 0; return 0;
} }
...@@ -2759,7 +2778,9 @@ static __global__ void k_copy_1d(const int N, const float * x, const int sx, flo ...@@ -2759,7 +2778,9 @@ static __global__ void k_copy_1d(const int N, const float * x, const int sx, flo
} }
//copy from other into self //copy from other into self
int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * other, bool unbroadcast) int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self,
const CudaNdarray * other,
bool unbroadcast)
{ {
int verbose = 0; int verbose = 0;
if (verbose>1) fprintf(stderr, "CudaNdarray_CopyFromCudaNdarray\n"); if (verbose>1) fprintf(stderr, "CudaNdarray_CopyFromCudaNdarray\n");
...@@ -2771,15 +2792,29 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * othe ...@@ -2771,15 +2792,29 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * othe
"can't copy into un-initialized CudaNdarray"); "can't copy into un-initialized CudaNdarray");
return -1; return -1;
} }
if (self->nd != other->nd) CudaNdarray * new_other = NULL;
if (self->nd < other->nd)
{ {
PyErr_Format(PyExc_NotImplementedError, PyErr_Format(PyExc_NotImplementedError,
"CudaNdarray_CopyFromCudaNdarray: need same number of" "CudaNdarray_CopyFromCudaNdarray: The destination need more or the"
" dims. destination nd=%d, source nd=%d." " same number of dimensions then the source. Got %d and %d.",
" No broadcasting implemented.",
self->nd, other->nd); self->nd, other->nd);
return -1; return -1;
} }
else if (self->nd != other->nd)
{
new_other = (CudaNdarray *) CudaNdarray_View(other);
int added_dims = self->nd - other->nd;
int pattern[self->nd];
for(int i = 0; i < added_dims; i++)
pattern[i] = -1;
for(int i = 0; i < other->nd; i++)
pattern[i + added_dims] = i;
CudaNdarray_dimshuffle(new_other, self->nd, pattern);
other = new_other;
}
assert(self->nd == other->nd);
//standard elemwise dim checks (also compute total size) //standard elemwise dim checks (also compute total size)
unsigned int size = 1; unsigned int size = 1;
unsigned int size_source = 1; unsigned int size_source = 1;
...@@ -2793,13 +2828,15 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * othe ...@@ -2793,13 +2828,15 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * othe
" destination=%d, source=%d", " destination=%d, source=%d",
i, CudaNdarray_HOST_DIMS(self)[i], i, CudaNdarray_HOST_DIMS(self)[i],
CudaNdarray_HOST_DIMS(other)[i]); CudaNdarray_HOST_DIMS(other)[i]);
return -1; Py_XDECREF(new_other);
return -1;
} }
size *= (unsigned int) CudaNdarray_HOST_DIMS(self)[i]; size *= (unsigned int) CudaNdarray_HOST_DIMS(self)[i];
size_source *= (unsigned int) CudaNdarray_HOST_DIMS(other)[i]; size_source *= (unsigned int) CudaNdarray_HOST_DIMS(other)[i];
} }
if (0 == size) if (0 == size)
{ {
Py_XDECREF(new_other);
return 0; //nothing to copy, we're done. return 0; //nothing to copy, we're done.
} }
if (CudaNdarray_is_c_contiguous(self) && if (CudaNdarray_is_c_contiguous(self) &&
...@@ -2812,6 +2849,7 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * othe ...@@ -2812,6 +2849,7 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * othe
cublasScopy(size, CudaNdarray_DEV_DATA(other), 1, cublasScopy(size, CudaNdarray_DEV_DATA(other), 1,
CudaNdarray_DEV_DATA(self), 1); CudaNdarray_DEV_DATA(self), 1);
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
Py_XDECREF(new_other);
if (CUBLAS_STATUS_SUCCESS != cublasGetError()) if (CUBLAS_STATUS_SUCCESS != cublasGetError())
{ {
PyErr_SetString(PyExc_RuntimeError, "Error copying memory"); PyErr_SetString(PyExc_RuntimeError, "Error copying memory");
...@@ -2849,6 +2887,7 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * othe ...@@ -2849,6 +2887,7 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * othe
"Cuda error: %s: %s. (n_blocks=%i," "Cuda error: %s: %s. (n_blocks=%i,"
" n_threads_per_block=%i)\n", "k_copy_1d", " n_threads_per_block=%i)\n", "k_copy_1d",
cudaGetErrorString(err), n_blocks, n_threads); cudaGetErrorString(err), n_blocks, n_threads);
Py_XDECREF(new_other);
return -1; return -1;
} }
}; break; }; break;
...@@ -2893,10 +2932,12 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * othe ...@@ -2893,10 +2932,12 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * othe
"k_elemwise_unary_rowmajor_copy", "k_elemwise_unary_rowmajor_copy",
cudaGetErrorString(err), n_blocks, cudaGetErrorString(err), n_blocks,
threads_per_block); threads_per_block);
Py_XDECREF(new_other);
return -1; return -1;
} }
} }
}; };
Py_XDECREF(new_other);
return 0; return 0;
} }
...@@ -4088,7 +4129,7 @@ int CudaNdarray_set_nd(CudaNdarray * self, const int nd) ...@@ -4088,7 +4129,7 @@ int CudaNdarray_set_nd(CudaNdarray * self, const int nd)
return 0; return 0;
} }
int CudaNdarray_set_device_data(CudaNdarray * self, float * data, CudaNdarray * base) int CudaNdarray_set_device_data(CudaNdarray * self, float * data, const CudaNdarray * base)
{ {
return CudaNdarray_set_device_data(self, data, (PyObject *) base); return CudaNdarray_set_device_data(self, data, (PyObject *) base);
} }
......
...@@ -282,7 +282,7 @@ static PyObject *CudaNdarray_NewDims(int nd, const inttype * dims) ...@@ -282,7 +282,7 @@ static PyObject *CudaNdarray_NewDims(int nd, const inttype * dims)
* Set self to be a view of given `data`, owned by existing CudaNdarray `base`. * Set self to be a view of given `data`, owned by existing CudaNdarray `base`.
*/ */
DllExport int CudaNdarray_set_device_data(CudaNdarray * self, float * data, PyObject * base); DllExport int CudaNdarray_set_device_data(CudaNdarray * self, float * data, PyObject * base);
DllExport int CudaNdarray_set_device_data(CudaNdarray * self, float * data, CudaNdarray * base); DllExport int CudaNdarray_set_device_data(CudaNdarray * self, float * data, const CudaNdarray * base);
/** /**
* Return an independent copy of self * Return an independent copy of self
......
...@@ -765,8 +765,6 @@ def local_gpu_advanced_incsubtensor1(node): ...@@ -765,8 +765,6 @@ def local_gpu_advanced_incsubtensor1(node):
'either set the `warn.gpu_set_subtensor1` config ' 'either set the `warn.gpu_set_subtensor1` config '
'option to False, or `warn.ignore_bug_before` to at ' 'option to False, or `warn.ignore_bug_before` to at '
'least \'0.6\'.', stacklevel=1) 'least \'0.6\'.', stacklevel=1)
if set_instead_of_inc:
return
gpu_op = GpuAdvancedIncSubtensor1( gpu_op = GpuAdvancedIncSubtensor1(
set_instead_of_inc=set_instead_of_inc) set_instead_of_inc=set_instead_of_inc)
...@@ -799,8 +797,7 @@ def local_gpu_advanced_incsubtensor1(node): ...@@ -799,8 +797,7 @@ def local_gpu_advanced_incsubtensor1(node):
'either set the `warn.gpu_set_subtensor1` config ' 'either set the `warn.gpu_set_subtensor1` config '
'option to False, or `warn.ignore_bug_before` to at ' 'option to False, or `warn.ignore_bug_before` to at '
'least \'0.6\'.', stacklevel=1) 'least \'0.6\'.', stacklevel=1)
if set_instead_of_inc:
return
gpu_op = GpuAdvancedIncSubtensor1( gpu_op = GpuAdvancedIncSubtensor1(
set_instead_of_inc=set_instead_of_inc) set_instead_of_inc=set_instead_of_inc)
return [host_from_gpu(gpu_op(gpu_x, gpu_y, *coords))] return [host_from_gpu(gpu_op(gpu_x, gpu_y, *coords))]
......
...@@ -630,13 +630,9 @@ def test_setitem_matrixvector1(): ...@@ -630,13 +630,9 @@ def test_setitem_matrixvector1():
assert numpy.allclose(a,numpy.asarray(_a)) assert numpy.allclose(a,numpy.asarray(_a))
#test direct transfert from numpy #test direct transfert from numpy
try: _a[:,1] = b*100
_a[:,1] = b*100 a[:,1] = b*100
a[:,1] = b*100 assert numpy.allclose(a,numpy.asarray(_a))
raise Exception("CudaNdarray.__setitem__ should have returned an error")
assert numpy.allclose(a,numpy.asarray(_a))
except NotImplementedError, e:
pass
row = theano._asarray([777,888,999], dtype='float32') row = theano._asarray([777,888,999], dtype='float32')
_a[1,:] = row _a[1,:] = row
...@@ -659,13 +655,9 @@ def test_setitem_matrix_tensor3(): ...@@ -659,13 +655,9 @@ def test_setitem_matrix_tensor3():
assert numpy.allclose(a,numpy.asarray(_a)) assert numpy.allclose(a,numpy.asarray(_a))
#test direct transfert from numpy #test direct transfert from numpy
try: _a[:,1,1] = b*100
_a[:,1,1] = b*100 a[:,1,1] = b*100
a[:,1,1] = b*100 assert numpy.allclose(a,numpy.asarray(_a))
raise Exception("CudaNdarray.__setitem__ should have returned an error")
assert numpy.allclose(a,numpy.asarray(_a))
except NotImplementedError:
pass
row = theano._asarray([777,888,999], dtype='float32') row = theano._asarray([777,888,999], dtype='float32')
_a[1,1,:] = row _a[1,1,:] = row
...@@ -714,7 +706,7 @@ def test_setitem_matrix_bad_ndim(): ...@@ -714,7 +706,7 @@ def test_setitem_matrix_bad_ndim():
# attempt to assign the ndarray b with setitem # attempt to assign the ndarray b with setitem
_a[:,:,1] = _b _a[:,:,1] = _b
assert False assert False
except NotImplementedError, e: except ValueError, e:
#print e #print e
assert True assert True
...@@ -723,7 +715,7 @@ def test_setitem_matrix_bad_ndim(): ...@@ -723,7 +715,7 @@ def test_setitem_matrix_bad_ndim():
# attempt to assign the ndarray b with setitem # attempt to assign the ndarray b with setitem
_a[1,:,:] = b _a[1,:,:] = b
assert False assert False
except NotImplementedError, e: except ValueError, e:
#print e #print e
assert True assert True
...@@ -806,7 +798,7 @@ def test_setitem_broadcast(): ...@@ -806,7 +798,7 @@ def test_setitem_broadcast():
a[:,:,1] = b.reshape((1,3)) a[:,:,1] = b.reshape((1,3))
assert numpy.allclose(numpy.asarray(_a),a) assert numpy.allclose(numpy.asarray(_a),a)
#This is not supported for now.
def test_setitem_broadcast_numpy(): def test_setitem_broadcast_numpy():
#test scalar to vector without stride #test scalar to vector without stride
a = numpy.arange(3) a = numpy.arange(3)
...@@ -814,73 +806,81 @@ def test_setitem_broadcast_numpy(): ...@@ -814,73 +806,81 @@ def test_setitem_broadcast_numpy():
_a = cuda_ndarray.CudaNdarray(a) _a = cuda_ndarray.CudaNdarray(a)
b = theano._asarray(9, dtype='float32') b = theano._asarray(9, dtype='float32')
try: _a[:] = b.reshape((1,))
_a[:] = b.reshape((1,)) a[:] = b.reshape((1,))
a[:] = b.reshape((1,)) assert numpy.allclose(numpy.asarray(_a), a)
assert False
assert numpy.allclose(numpy.asarray(_a),a)
except ValueError:
pass
#test vector to matrice without stride #test vector to matrice without stride
a = numpy.arange(9) a = numpy.arange(9)
a.resize((3,3)) a.resize((3, 3))
a = theano._asarray(a, dtype='float32') a = theano._asarray(a, dtype='float32')
_a = cuda_ndarray.CudaNdarray(a) _a = cuda_ndarray.CudaNdarray(a)
try: b = theano._asarray([7, 8, 9], dtype='float32')
b = theano._asarray([7,8,9], dtype='float32') _a[:, :] = b.reshape((1, 3))
_a[:,:] = b.reshape((1,3)) a[:, :] = b.reshape((1, 3))
a[:,:] = b.reshape((1,3)) assert numpy.allclose(numpy.asarray(_a), a)
assert False
assert numpy.allclose(numpy.asarray(_a),a)
except ValueError:
pass
#test vector to matrice with stride #test vector to matrice with stride
a = numpy.arange(27) a = numpy.arange(27)
a.resize((3,3,3)) a.resize((3, 3, 3))
a = theano._asarray(a, dtype='float32') a = theano._asarray(a, dtype='float32')
_a = cuda_ndarray.CudaNdarray(a) _a = cuda_ndarray.CudaNdarray(a)
try: b = theano._asarray([[7, 8, 9], [10, 11, 12]], dtype='float32')
b = theano._asarray([[7,8,9],[10,11,12]], dtype='float32') b = b[0]
b = b[0] _a[1, :, :] = b.reshape((1, 3))
_a[1,:,:] = b.reshape((1,3)) a[1, :, :] = b.reshape((1, 3))
a[1,:,:] = b.reshape((1,3)) assert numpy.allclose(numpy.asarray(_a), a)
assert False
assert numpy.allclose(numpy.asarray(_a),a)
except ValueError:
pass
# this also fails for the moment # this also fails for the moment
def test_setitem_rightvalue_ndarray_fails(): def test_setitem_rightvalue_ndarray_fails():
""" """
Now we don't automatically add dimensions to broadcast Now we don't automatically add dimensions to broadcast
""" """
a = numpy.arange(27) a = numpy.arange(3 * 4 * 5)
a.resize((3,3,3)) a.resize((3, 4, 5))
a = theano._asarray(a, dtype='float32') a = theano._asarray(a, dtype='float32')
_a = cuda_ndarray.CudaNdarray(a) _a = cuda_ndarray.CudaNdarray(a)
b = theano._asarray([7,8,9], dtype='float32') b = theano._asarray([7, 8, 9, 10], dtype='float32')
_b = cuda_ndarray.CudaNdarray(b) _b = cuda_ndarray.CudaNdarray(b)
b5 = theano._asarray([7, 8, 9, 10, 11], dtype='float32')
_b5 = cuda_ndarray.CudaNdarray(b)
# attempt to assign the ndarray b with setitem
_a[:, :, 1] = _b
a[:, :, 1] = b
assert numpy.allclose(numpy.asarray(_a), a)
#test direct transfert from numpy to contiguous region
# attempt to assign the ndarray b with setitem
# same number of dim
mat = numpy.random.rand(4, 5).astype('float32')
_a[2, :, :] = mat
a[2, :, :] = mat
assert numpy.allclose(numpy.asarray(_a), a)
# without same number of dim
try: try:
# attempt to assign the ndarray b with setitem _a[0, :, :] = mat
_a[:,:,1] = _b #a[0, :, :] = mat
assert False #assert numpy.allclose(numpy.asarray(_a), a)
except NotImplementedError, e: except ValueError, e:
#print e pass
assert True
#test direct transfert from numpy with broadcast
_a[0, :, :] = b5
a[0, :, :] = b5
assert numpy.allclose(numpy.asarray(_a), a)
#test direct transfert from numpy to not contiguous region
# attempt to assign the ndarray b with setitem
_a[:, :, 2] = b
a[:, :, 2] = b
assert numpy.allclose(numpy.asarray(_a), a)
#test direct transfert from numpy
try:
# attempt to assign the ndarray b with setitem
_a[:,:,1] = b
assert False
except NotImplementedError, e:
#print e
assert True
def test_zeros_basic(): def test_zeros_basic():
for shp in [(3,4,5), (300,), (), (0,7)]: for shp in [(3,4,5), (300,), (), (0,7)]:
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论