提交 fb973a61 authored 作者: Frederic's avatar Frederic

Make CudaNdarray_TakeFrom not crash on older GPUs.

上级 e6a2645c
...@@ -11,7 +11,7 @@ from theano import tensor, scalar, config ...@@ -11,7 +11,7 @@ from theano import tensor, scalar, config
from theano.gof.python25 import all, any from theano.gof.python25 import all, any
from theano.sandbox.cuda import GpuOp from theano.sandbox.cuda import GpuOp, device_properties
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import filter as type_support_filter from theano.sandbox.cuda import filter as type_support_filter
...@@ -1938,6 +1938,7 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp): ...@@ -1938,6 +1938,7 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
#If True or False, we assert that we use the take version or not #If True or False, we assert that we use the take version or not
#If None, we choose the best one applicable #If None, we choose the best one applicable
perform_using_take = None perform_using_take = None
max_threads = 0
def make_node(self, x, ilist): def make_node(self, x, ilist):
x_ = as_cuda_ndarray_variable(x) x_ = as_cuda_ndarray_variable(x)
...@@ -1977,9 +1978,18 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp): ...@@ -1977,9 +1978,18 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
idx = idx.view("float32") idx = idx.view("float32")
idx = cuda_ndarray.cuda_ndarray.CudaNdarray(idx) idx = cuda_ndarray.cuda_ndarray.CudaNdarray(idx)
if self.max_threads == 0:
num = theano.sandbox.cuda.use.device_number
if device_properties(num)['regsPerBlock'] < (8192 * 2):
self.max_threads = 256
else:
self.max_threads = 512
o = x.take(idx, o = x.take(idx,
0, # axis 0, # axis
out_[0][0]) # return out_[0][0], # return
"raise",
self.max_threads)
if x is not x_orig: if x is not x_orig:
o = o.reshape(out_shape) o = o.reshape(out_shape)
out[0] = o out[0] = o
......
...@@ -758,8 +758,10 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){ ...@@ -758,8 +758,10 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
PyObject * axis_obj = Py_None; PyObject * axis_obj = Py_None;
PyObject * out_obj = Py_None; PyObject * out_obj = Py_None;
PyObject * clipmode_obj = NULL; PyObject * clipmode_obj = NULL;
if (! PyArg_ParseTuple(args, "O|OOO", &indices_obj, &axis_obj, int max_threads = 1; // max threads per blocks
&out_obj, &clipmode_obj))
if (! PyArg_ParseTuple(args, "O|OOOi", &indices_obj, &axis_obj,
&out_obj, &clipmode_obj, &max_threads))
return NULL; return NULL;
//Check argument indices //Check argument indices
...@@ -937,13 +939,16 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){ ...@@ -937,13 +939,16 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
} }
dim3 n_blocks(std::min(CudaNdarray_HOST_DIMS(out)[0],65535),1,1); dim3 n_blocks(std::min(CudaNdarray_HOST_DIMS(out)[0],65535),1,1);
switch (self->nd) { switch (self->nd) {
case 1: case 1:
{ {
dim3 n_threads(1, 1, 1); dim3 n_threads(1, 1, 1);
if (verbose) if (verbose)
printf("kernel config: (n_blocks.x=%d, n_blocks.y=%d," printf("cudaGetLastError=%d, nd=%d"
" kernel config: (n_blocks.x=%d, n_blocks.y=%d,"
" n_threads.x=%i, n_threads.y=%i)\n", " n_threads.x=%i, n_threads.y=%i)\n",
self->nd, cudaGetLastError(),
n_blocks.x, n_blocks.y, n_threads.x, n_threads.y); n_blocks.x, n_blocks.y, n_threads.x, n_threads.y);
k3<<<n_blocks, n_threads>>>( k3<<<n_blocks, n_threads>>>(
dims[0], dims[0],
...@@ -964,11 +969,15 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){ ...@@ -964,11 +969,15 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
break; break;
case 2: case 2:
{ {
dim3 n_threads(std::min(CudaNdarray_HOST_DIMS(out)[1], 512), 1, 1); dim3 n_threads(std::min(CudaNdarray_HOST_DIMS(out)[1], max_threads), 1, 1);
if (verbose) if (verbose)
printf("kernel config: (n_blocks.x=%d, n_blocks.y=%d," printf("cudaGetLastError=%d, nd=%d"
" kernel config: (n_blocks.x=%d, n_blocks.y=%d,"
" n_threads.x=%i, n_threads.y=%i)\n", " n_threads.x=%i, n_threads.y=%i)\n",
cudaGetLastError(), self->nd,
n_blocks.x, n_blocks.y, n_threads.x, n_threads.y); n_blocks.x, n_blocks.y, n_threads.x, n_threads.y);
k3<<<n_blocks, n_threads>>>( k3<<<n_blocks, n_threads>>>(
dims[0], //dimensions dims[0], //dimensions
dims[1], dims[1],
...@@ -988,12 +997,14 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){ ...@@ -988,12 +997,14 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
break; break;
case 3: case 3:
{ {
int ty = std::min(CudaNdarray_HOST_DIMS(out)[2], 512); int ty = std::min(CudaNdarray_HOST_DIMS(out)[2], max_threads);
int tx = std::min(CudaNdarray_HOST_DIMS(out)[1], 512 / ty); int tx = std::min(CudaNdarray_HOST_DIMS(out)[1], max_threads / ty);
dim3 n_threads(tx, ty, 1); dim3 n_threads(tx, ty, 1);
if (verbose) if (verbose)
printf("kernel config: (n_blocks.x=%d, n_blocks.y=%d," printf("cudaGetLastError=%d, nd=%d"
" kernel config: (n_blocks.x=%d, n_blocks.y=%d,"
" n_threads.x=%i, n_threads.y=%i)\n", " n_threads.x=%i, n_threads.y=%i)\n",
self->nd, cudaGetLastError(),
n_blocks.x, n_blocks.y, n_threads.x, n_threads.y); n_blocks.x, n_blocks.y, n_threads.x, n_threads.y);
k3<<<n_blocks, n_threads>>>( k3<<<n_blocks, n_threads>>>(
dims[0], //dimensions dims[0], //dimensions
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论