提交 a1eef12d 作者: Pascal Lamblin

Merge pull request #2075 from nouiz/gpu_inc_sub_crash

[CRASH] fix crash on the GPU with {inc,set}_subtensor and broadcasting t...
...@@ -2888,7 +2888,9 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp): ...@@ -2888,7 +2888,9 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
returns a C code expression to copy source into view, and returns a C code expression to copy source into view, and
return 0 on success return 0 on success
""" """
return """CudaNdarray_CopyFromCudaNdarray(%(view)s, %(source)s)""" % locals() # On the CPU it unbroadcast based on the run time shapes. We
# need the same behavior on the GPU.
return """CudaNdarray_CopyFromCudaNdarray(%(view)s, %(source)s, 1)""" % locals()
def add_to_zview(self, name, x, fail): def add_to_zview(self, name, x, fail):
...@@ -2910,7 +2912,7 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp): ...@@ -2910,7 +2912,7 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
def c_code_cache_version(self): def c_code_cache_version(self):
parent_version = super(GpuIncSubtensor, self).c_code_cache_version() parent_version = super(GpuIncSubtensor, self).c_code_cache_version()
if parent_version: if parent_version:
return parent_version + (0,) return parent_version + (1,)
return () return ()
......
...@@ -1002,7 +1002,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){ ...@@ -1002,7 +1002,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
return NULL; return NULL;
indices = (CudaNdarray*) CudaNdarray_New(); indices = (CudaNdarray*) CudaNdarray_New();
if (verbose) printf("ndarray after new\n"); if (verbose) printf("\nndarray after new\n");
if (! indices){ if (! indices){
Py_DECREF(indices_float32); Py_DECREF(indices_float32);
return NULL; return NULL;
...@@ -1140,6 +1140,13 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){ ...@@ -1140,6 +1140,13 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
} }
dim3 n_blocks(std::min(CudaNdarray_HOST_DIMS(out)[0],65535),1,1); dim3 n_blocks(std::min(CudaNdarray_HOST_DIMS(out)[0],65535),1,1);
if(CudaNdarray_HOST_DIMS(out)[0] == 0){
// We take 0 elements, so no need for the rest of the code.
// This speed up that case AND fix crash otherwise.
free(dims);
Py_DECREF(indices);
return (PyObject *)out;
}
switch (self->nd) { switch (self->nd) {
case 1: case 1:
...@@ -1149,7 +1156,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){ ...@@ -1149,7 +1156,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
printf("cudaGetLastError=%d, nd=%d" printf("cudaGetLastError=%d, nd=%d"
" kernel config: (n_blocks.x=%d, n_blocks.y=%d," " kernel config: (n_blocks.x=%d, n_blocks.y=%d,"
" n_threads.x=%i, n_threads.y=%i)\n", " n_threads.x=%i, n_threads.y=%i)\n",
self->nd, cudaGetLastError(), cudaGetLastError(), self->nd,
n_blocks.x, n_blocks.y, n_threads.x, n_threads.y); n_blocks.x, n_blocks.y, n_threads.x, n_threads.y);
k3<<<n_blocks, n_threads>>>( k3<<<n_blocks, n_threads>>>(
dims[0], dims[0],
...@@ -1205,7 +1212,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){ ...@@ -1205,7 +1212,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
printf("cudaGetLastError=%d, nd=%d" printf("cudaGetLastError=%d, nd=%d"
" kernel config: (n_blocks.x=%d, n_blocks.y=%d," " kernel config: (n_blocks.x=%d, n_blocks.y=%d,"
" n_threads.x=%i, n_threads.y=%i)\n", " n_threads.x=%i, n_threads.y=%i)\n",
self->nd, cudaGetLastError(), cudaGetLastError(), self->nd,
n_blocks.x, n_blocks.y, n_threads.x, n_threads.y); n_blocks.x, n_blocks.y, n_threads.x, n_threads.y);
k3<<<n_blocks, n_threads>>>( k3<<<n_blocks, n_threads>>>(
dims[0], //dimensions dims[0], //dimensions
......
...@@ -967,6 +967,8 @@ class T_subtensor(theano.tensor.tests.test_subtensor.T_subtensor): ...@@ -967,6 +967,8 @@ class T_subtensor(theano.tensor.tests.test_subtensor.T_subtensor):
# version when we should. Users should not use it. # version when we should. Users should not use it.
for shape, idx, fast in [((70000,), range(70000), True), for shape, idx, fast in [((70000,), range(70000), True),
((70000, 5), range(70000), True), ((70000, 5), range(70000), True),
((70000, 5), numpy.zeros((0,), 'int64'),
True),
((70000, 2, 3), range(70000), True), ((70000, 2, 3), range(70000), True),
((1025, 1025), [5, 10], True), ((1025, 1025), [5, 10], True),
((3, 1025, 1026), [1, 2], True), ((3, 1025, 1026), [1, 2], True),
......
...@@ -863,7 +863,25 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin): ...@@ -863,7 +863,25 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
inc_slice(2, 1), inc_slice(2, 1),
(numpy.asarray([[0, 1], [2, 3], [4, 5.]]), numpy.asarray(9.),)) (numpy.asarray([[0, 1], [2, 3], [4, 5.]]), numpy.asarray(9.),))
def test_advanced_inc_and_set(self): def test_inc_and_set_subtensor(self):
"""
Test increment and set with broadcast
"""
X = tensor.matrix(dtype=self.dtype)
y = set_subtensor(X[1::, 1::], 0)
f = self.function([X], [y],
op=self.inc_sub,
N=1)
x_ = numpy.ones((9, 9))
out = f(x_.astype('float32'))
res = x_.copy()
res[1::, 1::] = 0
assert numpy.allclose(out, res)
def test_advanced1_inc_and_set(self):
""" """
Test advanced increment and set. Test advanced increment and set.
""" """
......
Markdown 格式
0%
您将添加 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论