提交 bd00e506 authored 作者: Frederic's avatar Frederic

Many fixes following the code review.

- add/update comments - rename assert_fast - fix crash - remove useless code. This does not change anything related to the int cast to float32.
上级 6c955ecf
......@@ -1891,7 +1891,7 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
"""
Implement AdvancedSubtensor1 on the gpu.
"""
assert_fast = None
perform_using_take = None
def make_node(self, x, ilist):
x_ = as_cuda_ndarray_variable(x)
......@@ -1910,30 +1910,29 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
#super(GpuAdvancedSubtensor1, self).perform(node, inp, out_)
x, idx = inp
out, = out_
new_method = True
#TODO: if more then 3 dims, reshape the inputs if it is contiguous.
x_orig = x
if x.ndim > 3 and x.is_c_contiguous():
x = x.reshape((x.shape[0], numpy.prod(x.shape[1:])))
out_shape = (len(idx),) + x_orig.shape[1:]
if x.ndim <= 3:
if self.assert_fast is not None:
assert self.assert_fast == True, (
# CudaNdarray.take only supports ndim <= 3
if self.perform_using_take is not None:
assert self.perform_using_take == True, (
"GpuAdvancedSubtensor1 used the fast version")
# Support x with dimensions 1,2,3 only.
o = x.take(cuda_ndarray.cuda_ndarray.CudaNdarray(idx.astype("float32")),
0, out_[0][0]) # idx, axis, return[, clipmode]
o = x.take(cuda_ndarray.cuda_ndarray.CudaNdarray(idx.astype("float32")), # idx
0, # axis
out_[0][0]) # return
if x is not x_orig:
o = o.reshape((len(idx),) + x_orig.shape[1:])
o = o.reshape(out_shape)
out[0] = o
else:
if self.assert_fast is not None:
assert self.assert_fast == False, (
"GpuAdvancedSubtensor1 didn't used the fast version")
if (out_[0][0] is None or out_[0][0].shape != (len(idx),) +
x.shape[1:]):
o = cuda_ndarray.cuda_ndarray.CudaNdarray.zeros((len(idx),) +
x.shape[1:])
if self.perform_using_take is not None:
assert self.perform_using_take == False, (
"GpuAdvancedSubtensor1 didn't use the fast version")
if out_[0][0] is None or out_[0][0].shape != out_shape:
o = cuda_ndarray.cuda_ndarray.CudaNdarray.zeros(out_shape)
else:
o = out_[0][0]
for (j, i) in enumerate(idx):
......
......@@ -712,6 +712,10 @@ __global__ void k_take_3(const int d0, const int d1, const int d2,
if (idx<0)
idx += dB0; // To allow negative indexing.
if ((idx < 0) || (idx >= dB0))
// Any value other the 0 probably work. But to be more safe, I want
// to change all bits to prevent problem with concurrent write that
// could cross cache line. But this should not happen with the
// current code and driver.
*err = 0xFFFF;
for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x){
for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y){
......@@ -725,12 +729,13 @@ __global__ void k_take_3(const int d0, const int d1, const int d2,
// Pointor to 1 int on the device
// Used in CudaNdarray_TakeFrom to tell that there is an out of bound error
// When it exist, it should always be 0
// When it is allocated, it should always be 0
// So if there is an error, we must reset it to 0 BEFORE we raise the error
// This prevent us from setting it to 0 before each use
static int* err_var = NULL;
//PyObject* PyArray_TakeFrom(PyArrayObject* self, PyObject* indices, int axis, PyArrayObject* ret, NPY_CLIPMODE clipmode)
// We try to be similat to the PyArray_TakeFrom function
//http://docs.scipy.org/doc/numpy/reference/c-api.array.html
//TODO: support other clip mode then raise(clip, wrap)
//TODO: what if the indices take more then 32 bits?
//self is the input that we copy data from.
......@@ -820,7 +825,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
//TODO: implement the default and other axis
PyObject * axis_iobj = PyNumber_Long(axis_obj);
if (!axis_iobj) {
PyErr_SetString(PyExc_NotImplementedError,"CudaNdarray_TakeFrom: axis must be convertisable to a long");
PyErr_SetString(PyExc_NotImplementedError,"CudaNdarray_TakeFrom: axis must be convertable to a long");
Py_DECREF(indices_obj);
return NULL;
}
......@@ -849,12 +854,6 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
}
}
if (!out) {
int total_elements = nb_indices;
for(int i=1;i<self->nd;i++)
total_elements*=CudaNdarray_HOST_DIMS(self)[i];
// total_elements now contains the size of the array, in reals
int total_size = total_elements * sizeof(real);
out = (CudaNdarray*)CudaNdarray_New();
if (!out){
Py_DECREF(indices_obj);
......@@ -939,7 +938,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
break;
case 2:
{
dim3 n_threads(CudaNdarray_HOST_DIMS(out)[1], 1, 1);
dim3 n_threads(std::min(CudaNdarray_HOST_DIMS(out)[1], 512), 1, 1);
if (verbose)
printf("kernel config: (n_blocks.x=%d, n_blocks.y=%d,"
" n_threads.x=%i, n_threads.y=%i)\n",
......@@ -963,8 +962,9 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
break;
case 3:
{
dim3 n_threads(CudaNdarray_HOST_DIMS(out)[1],
CudaNdarray_HOST_DIMS(out)[2], 1);
int ty = std::min(CudaNdarray_HOST_DIMS(out)[2], 512);
int tx = std::min(CudaNdarray_HOST_DIMS(out)[1], 512 / ty);
dim3 n_threads(tx, ty, 1);
if (verbose)
printf("kernel config: (n_blocks.x=%d, n_blocks.y=%d,"
" n_threads.x=%i, n_threads.y=%i)\n",
......@@ -1003,6 +1003,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
Py_DECREF(out);
return NULL;
}
//-10 could be any value different then 0.
int cpu_err_var=-10;
err = cudaMemcpy(&cpu_err_var, err_var, sizeof(int),
......
......@@ -850,11 +850,19 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor):
self).__init__(name)
def test_adv_sub1_fast(self):
""" We check that we correctly used the fast version"""
"""We check that the special cases of advanced indexing that
use CudaNdarrayTakeFrom are handled correctly
"""
rand = numpy.random.rand
# The variable fast is used to set the member perform_using_take of
# the Op. It is only useful for testing that we use the fast
# version when we should. Users should not use it.
for data, idx, fast in [(rand(70000), range(70000), True),
(rand(70000, 5), range(70000), True),
(rand(70000, 2, 3), range(70000), True),
(rand(1025, 1025), [5, 10], True),
(rand(3, 1025, 1026), [1, 2], True),
(rand(4, 5), [2, 3], True),
(rand(4, 2, 3), [0, 3], True),
(rand(4, 2, 3), [3, 3, 1, 1, 2,
......@@ -872,7 +880,7 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor):
# Test with c_contiguous input
t = self.adv_sub1()(n, idx)
t.owner.op.assert_fast = True # input c_contiguous, so we reshape
t.owner.op.perform_using_take = True # input c_contiguous, so we reshape
val = self.eval_output_and_check(t, list=True)
val = numpy.asarray(val)
......@@ -882,7 +890,7 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor):
# Test with input strided
t = self.adv_sub1()(n[::-1], idx)
t.owner.op.assert_fast = fast
t.owner.op.perform_using_take = fast
val = theano.function([], t, mode=self.mode)()
val = numpy.asarray(val)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论