提交 894d6665 authored 作者: Frederic's avatar Frederic

Make GpuAdvancedSubtensor1 use int64 for indices so that all possible index values are supported.

上级 e6b2160a
...@@ -1891,6 +1891,8 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp): ...@@ -1891,6 +1891,8 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
""" """
Implement AdvancedSubtensor1 on the gpu. Implement AdvancedSubtensor1 on the gpu.
""" """
# If True or False, we assert that the take version is (or is not) used.
# If None, we choose the best applicable version.
perform_using_take = None perform_using_take = None
def make_node(self, x, ilist): def make_node(self, x, ilist):
...@@ -1910,8 +1912,9 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp): ...@@ -1910,8 +1912,9 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
#super(GpuAdvancedSubtensor1, self).perform(node, inp, out_) #super(GpuAdvancedSubtensor1, self).perform(node, inp, out_)
x, idx = inp x, idx = inp
out, = out_ out, = out_
#TODO: if more than 3 dims, reshape the inputs if they are contiguous.
x_orig = x x_orig = x
#TODO: if more than 3 dims, reshape the inputs even if not all
#dimensions are C-contiguous
if x.ndim > 3 and x.is_c_contiguous(): if x.ndim > 3 and x.is_c_contiguous():
x = x.reshape((x.shape[0], numpy.prod(x.shape[1:]))) x = x.reshape((x.shape[0], numpy.prod(x.shape[1:])))
out_shape = (len(idx),) + x_orig.shape[1:] out_shape = (len(idx),) + x_orig.shape[1:]
...@@ -1920,8 +1923,17 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp): ...@@ -1920,8 +1923,17 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
if self.perform_using_take is not None: if self.perform_using_take is not None:
assert self.perform_using_take == True, ( assert self.perform_using_take == True, (
"GpuAdvancedSubtensor1 used the fast version") "GpuAdvancedSubtensor1 used the fast version")
if idx.dtype != numpy.int64:
o = x.take(cuda_ndarray.cuda_ndarray.CudaNdarray(idx.astype("float32")), # idx if idx.dtype in [numpy.int8, numpyt.int16, numpy.int32,
numpy.int64, numpy.uint8, numpy.uint16,
numpy.uint32]:
idx = idx.astype(numpy.int64)
if not idx.flags.c_contiguous:
idx = numpy.ascontiguousarray(idx)
idx = idx.view("float32")
idx = cuda_ndarray.cuda_ndarray.CudaNdarray(idx)
o = x.take(idx,
0, # axis 0, # axis
out_[0][0]) # return out_[0][0]) # return
if x is not x_orig: if x is not x_orig:
......
...@@ -701,14 +701,14 @@ enum operator_t ...@@ -701,14 +701,14 @@ enum operator_t
*/ */
template <int operator_num> template <int operator_num>
__global__ void k_take_3(const int d0, const int d1, const int d2, __global__ void k_take_3(const int d0, const int d1, const int d2,
const float* indices, const npy_int64* indices,
float* a, float* a,
const int sA0, const int sA1, const int sA2, const int sA0, const int sA1, const int sA2,
const float* b, const int dB0, const float* b, const int dB0,
const int sB0, const int sB1, const int sB2, const int sB0, const int sB1, const int sB2,
int* err){ int* err){
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){ for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
int idx = (int)indices[i0]; npy_int64 idx = indices[i0];
if (idx<0) if (idx<0)
idx += dB0; // To allow negative indexing. idx += dB0; // To allow negative indexing.
if ((idx < 0) || (idx >= dB0)) if ((idx < 0) || (idx >= dB0))
...@@ -737,8 +737,9 @@ static int* err_var = NULL; ...@@ -737,8 +737,9 @@ static int* err_var = NULL;
// We try to be similat to the PyArray_TakeFrom function // We try to be similat to the PyArray_TakeFrom function
//http://docs.scipy.org/doc/numpy/reference/c-api.array.html //http://docs.scipy.org/doc/numpy/reference/c-api.array.html
//TODO: support other clip mode then raise(clip, wrap) //TODO: support other clip mode then raise(clip, wrap)
//TODO: what if the indices take more than 32 bits?
//self is the input that we copy data from. //self is the input that we copy data from.
//The indices that we receive MUST be a CudaNdarray(float32)
// that is in fact a view of int64 indices
PyObject* PyObject*
CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
int verbose = 0; int verbose = 0;
...@@ -761,7 +762,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){ ...@@ -761,7 +762,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
if (verbose) printf("cudandarray indices\n"); if (verbose) printf("cudandarray indices\n");
indices = (CudaNdarray*) indices_obj; indices = (CudaNdarray*) indices_obj;
Py_INCREF(indices); Py_INCREF(indices);
} else if (PyArray_Check(indices_obj)) { } else if (0 && PyArray_Check(indices_obj)) {
PyErr_SetString(PyExc_NotImplementedError, "CudaNdarray_TakeFrom: The indices must cudandarray with float32 value."); PyErr_SetString(PyExc_NotImplementedError, "CudaNdarray_TakeFrom: The indices must cudandarray with float32 value.");
return NULL; return NULL;
...@@ -800,9 +801,10 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){ ...@@ -800,9 +801,10 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
return NULL; return NULL;
} }
Py_DECREF(indices_float32); Py_DECREF(indices_float32);
} else { } else {
PyErr_SetString(PyExc_TypeError, "CudaNdarray_TakeFrom: need a CudaNdarray for indices"); PyErr_SetString(PyExc_TypeError,
"CudaNdarray_TakeFrom: need a CudaNdarray(float32) that"
" is a view from int64 data for indices");
return NULL; return NULL;
} }
...@@ -815,11 +817,12 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){ ...@@ -815,11 +817,12 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
} }
if (verbose) printf("after print of object\n"); if (verbose) printf("after print of object\n");
if(!CudaNdarray_is_c_contiguous(indices) != 0) { if(!CudaNdarray_is_c_contiguous(indices) != 0) {
PyErr_SetString(PyExc_NotImplementedError, "CudaNdarray_TakeFrom: The indices must be contiguous in memory."); PyErr_SetString(PyExc_NotImplementedError,
"CudaNdarray_TakeFrom: The indices must be contiguous in memory.");
Py_DECREF(indices_obj); Py_DECREF(indices_obj);
return NULL; return NULL;
} }
int nb_indices = CudaNdarray_SIZE((CudaNdarray *)indices); int nb_indices = CudaNdarray_SIZE((CudaNdarray *)indices) / 2;// int64 are 8 bytes, float32 are 4 bytes
//Check argument axis //Check argument axis
//TODO: implement the default and other axis //TODO: implement the default and other axis
...@@ -885,7 +888,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){ ...@@ -885,7 +888,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
Py_DECREF(clipmode_obj); Py_DECREF(clipmode_obj);
} }
void (*k3)(const int, const int, const int, void (*k3)(const int, const int, const int,
const float*, const npy_int64*,
float*, const int, const int, const int, float*, const int, const int, const int,
const float*, const int, const float*, const int,
const int, const int, const int, const int, const int, const int,
...@@ -923,7 +926,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){ ...@@ -923,7 +926,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
dims[0], dims[0],
1, 1,
1, 1,
CudaNdarray_DEV_DATA(indices), (npy_int64*) CudaNdarray_DEV_DATA(indices),
CudaNdarray_DEV_DATA(out), CudaNdarray_DEV_DATA(out),
CudaNdarray_HOST_STRIDES(out)[0], //strides CudaNdarray_HOST_STRIDES(out)[0], //strides
1, 1,
...@@ -947,7 +950,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){ ...@@ -947,7 +950,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
dims[0], //dimensions dims[0], //dimensions
dims[1], dims[1],
1, 1,
CudaNdarray_DEV_DATA(indices), (npy_int64*) CudaNdarray_DEV_DATA(indices),
CudaNdarray_DEV_DATA(out), CudaNdarray_DEV_DATA(out),
CudaNdarray_HOST_STRIDES(out)[0], //strides CudaNdarray_HOST_STRIDES(out)[0], //strides
CudaNdarray_HOST_STRIDES(out)[1], CudaNdarray_HOST_STRIDES(out)[1],
...@@ -973,7 +976,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){ ...@@ -973,7 +976,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
dims[0], //dimensions dims[0], //dimensions
dims[1], dims[1],
dims[2], dims[2],
CudaNdarray_DEV_DATA(indices), (npy_int64*) CudaNdarray_DEV_DATA(indices),
CudaNdarray_DEV_DATA(out), CudaNdarray_DEV_DATA(out),
CudaNdarray_HOST_STRIDES(out)[0], //strides CudaNdarray_HOST_STRIDES(out)[0], //strides
CudaNdarray_HOST_STRIDES(out)[1], CudaNdarray_HOST_STRIDES(out)[1],
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论