提交 bd00e506 authored 作者: Frederic's avatar Frederic

Many fixes following the code review.

- add/update comments - rename assert_fast - fix crash - remove useless code. This does not change anything related to the ints casted to float32.
上级 6c955ecf
...@@ -1891,7 +1891,7 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp): ...@@ -1891,7 +1891,7 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
""" """
Implement AdvancedSubtensor1 on the gpu. Implement AdvancedSubtensor1 on the gpu.
""" """
assert_fast = None perform_using_take = None
def make_node(self, x, ilist): def make_node(self, x, ilist):
x_ = as_cuda_ndarray_variable(x) x_ = as_cuda_ndarray_variable(x)
...@@ -1910,30 +1910,29 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp): ...@@ -1910,30 +1910,29 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
#super(GpuAdvancedSubtensor1, self).perform(node, inp, out_) #super(GpuAdvancedSubtensor1, self).perform(node, inp, out_)
x, idx = inp x, idx = inp
out, = out_ out, = out_
new_method = True
#TODO: if more then 3 dims, reshape the inputs if it is contiguous. #TODO: if more then 3 dims, reshape the inputs if it is contiguous.
x_orig = x x_orig = x
if x.ndim > 3 and x.is_c_contiguous(): if x.ndim > 3 and x.is_c_contiguous():
x = x.reshape((x.shape[0], numpy.prod(x.shape[1:]))) x = x.reshape((x.shape[0], numpy.prod(x.shape[1:])))
out_shape = (len(idx),) + x_orig.shape[1:]
if x.ndim <= 3: if x.ndim <= 3:
if self.assert_fast is not None: # CudaNdarray.take only supports ndim <= 3
assert self.assert_fast == True, ( if self.perform_using_take is not None:
assert self.perform_using_take == True, (
"GpuAdvancedSubtensor1 used the fast version") "GpuAdvancedSubtensor1 used the fast version")
# Support x with dimensions 1,2,3 only. o = x.take(cuda_ndarray.cuda_ndarray.CudaNdarray(idx.astype("float32")), # idx
o = x.take(cuda_ndarray.cuda_ndarray.CudaNdarray(idx.astype("float32")), 0, # axis
0, out_[0][0]) # idx, axis, return[, clipmode] out_[0][0]) # return
if x is not x_orig: if x is not x_orig:
o = o.reshape((len(idx),) + x_orig.shape[1:]) o = o.reshape(out_shape)
out[0] = o out[0] = o
else: else:
if self.assert_fast is not None: if self.perform_using_take is not None:
assert self.assert_fast == False, ( assert self.perform_using_take == False, (
"GpuAdvancedSubtensor1 didn't used the fast version") "GpuAdvancedSubtensor1 didn't use the fast version")
if (out_[0][0] is None or out_[0][0].shape != (len(idx),) + if out_[0][0] is None or out_[0][0].shape != out_shape:
x.shape[1:]): o = cuda_ndarray.cuda_ndarray.CudaNdarray.zeros(out_shape)
o = cuda_ndarray.cuda_ndarray.CudaNdarray.zeros((len(idx),) +
x.shape[1:])
else: else:
o = out_[0][0] o = out_[0][0]
for (j, i) in enumerate(idx): for (j, i) in enumerate(idx):
......
...@@ -712,6 +712,10 @@ __global__ void k_take_3(const int d0, const int d1, const int d2, ...@@ -712,6 +712,10 @@ __global__ void k_take_3(const int d0, const int d1, const int d2,
if (idx<0) if (idx<0)
idx += dB0; // To allow negative indexing. idx += dB0; // To allow negative indexing.
if ((idx < 0) || (idx >= dB0)) if ((idx < 0) || (idx >= dB0))
// Any value other the 0 probably work. But to be more safe, I want
// to change all bits to prevent problem with concurrent write that
// could cross cache line. But this should not happen with the
// current code and driver.
*err = 0xFFFF; *err = 0xFFFF;
for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x){ for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x){
for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y){ for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y){
...@@ -725,12 +729,13 @@ __global__ void k_take_3(const int d0, const int d1, const int d2, ...@@ -725,12 +729,13 @@ __global__ void k_take_3(const int d0, const int d1, const int d2,
// Pointor to 1 int on the device // Pointor to 1 int on the device
// Used in CudaNdarray_TakeFrom to tell that there is an out of bound error // Used in CudaNdarray_TakeFrom to tell that there is an out of bound error
// When it exist, it should always be 0 // When it is allocated, it should always be 0
// So if there is an error, we must reset it to 0 BEFORE we raise the error // So if there is an error, we must reset it to 0 BEFORE we raise the error
// This prevent us from setting it to 0 before each use // This prevent us from setting it to 0 before each use
static int* err_var = NULL; static int* err_var = NULL;
//PyObject* PyArray_TakeFrom(PyArrayObject* self, PyObject* indices, int axis, PyArrayObject* ret, NPY_CLIPMODE clipmode) // We try to be similat to the PyArray_TakeFrom function
//http://docs.scipy.org/doc/numpy/reference/c-api.array.html
//TODO: support other clip mode then raise(clip, wrap) //TODO: support other clip mode then raise(clip, wrap)
//TODO: what if the indices take more then 32 bits? //TODO: what if the indices take more then 32 bits?
//self is the input that we copy data from. //self is the input that we copy data from.
...@@ -820,7 +825,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){ ...@@ -820,7 +825,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
//TODO: implement the default and other axis //TODO: implement the default and other axis
PyObject * axis_iobj = PyNumber_Long(axis_obj); PyObject * axis_iobj = PyNumber_Long(axis_obj);
if (!axis_iobj) { if (!axis_iobj) {
PyErr_SetString(PyExc_NotImplementedError,"CudaNdarray_TakeFrom: axis must be convertisable to a long"); PyErr_SetString(PyExc_NotImplementedError,"CudaNdarray_TakeFrom: axis must be convertable to a long");
Py_DECREF(indices_obj); Py_DECREF(indices_obj);
return NULL; return NULL;
} }
...@@ -849,12 +854,6 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){ ...@@ -849,12 +854,6 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
} }
} }
if (!out) { if (!out) {
int total_elements = nb_indices;
for(int i=1;i<self->nd;i++)
total_elements*=CudaNdarray_HOST_DIMS(self)[i];
// total_elements now contains the size of the array, in reals
int total_size = total_elements * sizeof(real);
out = (CudaNdarray*)CudaNdarray_New(); out = (CudaNdarray*)CudaNdarray_New();
if (!out){ if (!out){
Py_DECREF(indices_obj); Py_DECREF(indices_obj);
...@@ -939,7 +938,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){ ...@@ -939,7 +938,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
break; break;
case 2: case 2:
{ {
dim3 n_threads(CudaNdarray_HOST_DIMS(out)[1], 1, 1); dim3 n_threads(std::min(CudaNdarray_HOST_DIMS(out)[1], 512), 1, 1);
if (verbose) if (verbose)
printf("kernel config: (n_blocks.x=%d, n_blocks.y=%d," printf("kernel config: (n_blocks.x=%d, n_blocks.y=%d,"
" n_threads.x=%i, n_threads.y=%i)\n", " n_threads.x=%i, n_threads.y=%i)\n",
...@@ -963,8 +962,9 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){ ...@@ -963,8 +962,9 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
break; break;
case 3: case 3:
{ {
dim3 n_threads(CudaNdarray_HOST_DIMS(out)[1], int ty = std::min(CudaNdarray_HOST_DIMS(out)[2], 512);
CudaNdarray_HOST_DIMS(out)[2], 1); int tx = std::min(CudaNdarray_HOST_DIMS(out)[1], 512 / ty);
dim3 n_threads(tx, ty, 1);
if (verbose) if (verbose)
printf("kernel config: (n_blocks.x=%d, n_blocks.y=%d," printf("kernel config: (n_blocks.x=%d, n_blocks.y=%d,"
" n_threads.x=%i, n_threads.y=%i)\n", " n_threads.x=%i, n_threads.y=%i)\n",
...@@ -1003,6 +1003,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){ ...@@ -1003,6 +1003,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
Py_DECREF(out); Py_DECREF(out);
return NULL; return NULL;
} }
//-10 could be any value different then 0.
int cpu_err_var=-10; int cpu_err_var=-10;
err = cudaMemcpy(&cpu_err_var, err_var, sizeof(int), err = cudaMemcpy(&cpu_err_var, err_var, sizeof(int),
......
...@@ -850,11 +850,19 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor): ...@@ -850,11 +850,19 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor):
self).__init__(name) self).__init__(name)
def test_adv_sub1_fast(self): def test_adv_sub1_fast(self):
""" We check that we correctly used the fast version""" """We check that the special cases of advanced indexing that
use CudaNdarrayTakeFrom are handled correctly
"""
rand = numpy.random.rand rand = numpy.random.rand
# The variable fast is used to set the member perform_using_take of
# the Op. It is only useful for testing that we use the fast
# version when we should. Users should not use it.
for data, idx, fast in [(rand(70000), range(70000), True), for data, idx, fast in [(rand(70000), range(70000), True),
(rand(70000, 5), range(70000), True), (rand(70000, 5), range(70000), True),
(rand(70000, 2, 3), range(70000), True), (rand(70000, 2, 3), range(70000), True),
(rand(1025, 1025), [5, 10], True),
(rand(3, 1025, 1026), [1, 2], True),
(rand(4, 5), [2, 3], True), (rand(4, 5), [2, 3], True),
(rand(4, 2, 3), [0, 3], True), (rand(4, 2, 3), [0, 3], True),
(rand(4, 2, 3), [3, 3, 1, 1, 2, (rand(4, 2, 3), [3, 3, 1, 1, 2,
...@@ -872,7 +880,7 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor): ...@@ -872,7 +880,7 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor):
# Test with c_contiguous input # Test with c_contiguous input
t = self.adv_sub1()(n, idx) t = self.adv_sub1()(n, idx)
t.owner.op.assert_fast = True # input c_contiguous, so we reshape t.owner.op.perform_using_take = True # input c_contiguous, so we reshape
val = self.eval_output_and_check(t, list=True) val = self.eval_output_and_check(t, list=True)
val = numpy.asarray(val) val = numpy.asarray(val)
...@@ -882,7 +890,7 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor): ...@@ -882,7 +890,7 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor):
# Test with input strided # Test with input strided
t = self.adv_sub1()(n[::-1], idx) t = self.adv_sub1()(n[::-1], idx)
t.owner.op.assert_fast = fast t.owner.op.perform_using_take = fast
val = theano.function([], t, mode=self.mode)() val = theano.function([], t, mode=self.mode)()
val = numpy.asarray(val) val = numpy.asarray(val)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论