Commit e5bb665a authored by Frederic

Make GpuCrossentropySoftmaxArgmax1HotWithBias and GpuCrossentropySoftmax1HotWithBiasDx work for bigger inputs.
Parent: 83897d76
@@ -41,7 +41,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
             float * sm_data, int sms0, int sms1,
             float * am_data, int ams0)
       {
-        const int row = blockIdx.x;
+        for (int row = blockIdx.x; row < M; row += gridDim.x){
         const float * x = x_data + xs0 * row;
         const int y_idx = (int)y_idx_data[row * y_idxs0];
@@ -83,6 +83,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
                 + log(sum);
         }
         am_data[row*ams0] = row_max_j;
+        }
       }
       """
@@ -168,7 +169,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
             }
         }
         {
-            int n_blocks = CudaNdarray_HOST_DIMS(%(sm)s)[0];
+            int n_blocks = std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],
+                                    NUM_VECTOR_OP_BLOCKS);
             //TODO: launch more threads per row and do parallel sum and max reductions
             int n_threads = 1;
             int n_shared_bytes = 0; //n_threads * sizeof(float);
@@ -195,8 +197,11 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
             if (cudaSuccess != err)
             {
                 PyErr_Format(PyExc_RuntimeError,
-                             "Cuda error: %(classname)s %(nodename)s: %%s.\\n",
-                             cudaGetErrorString(err));
+                             "Cuda error: %(classname)s %(nodename)s: %%s.\\n"
+                             "The kernel was launched with %%d threads,"
+                             " %%d blocks and %%d shared memory\\n",
+                             cudaGetErrorString(err),
+                             n_threads, n_blocks, n_shared_bytes);
                 // no need to decref output vars the cleanup code will do it
                 %(fail)s;
             }
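
On the host side, the launch is now capped at NUM_VECTOR_OP_BLOCKS (a Theano-side constant) instead of requesting one block per row, and the error message reports the launch configuration, which makes launch-limit failures diagnosable. A hedged sketch of the same shape in plain CUDA; MAX_BLOCKS and k_rows are placeholders, and fprintf stands in for the real code's PyErr_Format:

    #include <algorithm>
    #include <cstdio>
    #include <cuda_runtime.h>

    #define MAX_BLOCKS 4096   // stand-in for Theano's NUM_VECTOR_OP_BLOCKS

    __global__ void k_rows(int M)
    {
        // grid-stride body elided; M kept so the signature matches the text
    }

    // Launch with a capped grid and report the configuration on failure,
    // mirroring the PyErr_Format change above.
    bool launch_rows(int M)
    {
        int n_blocks = std::min(M, MAX_BLOCKS);
        int n_threads = 1;    // the diff leaves per-row parallelism as a TODO
        int n_shared_bytes = 0;

        k_rows<<<n_blocks, n_threads, n_shared_bytes>>>(M);
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess) {
            std::fprintf(stderr,
                         "Cuda error: %s. The kernel was launched with %d"
                         " threads, %d blocks and %d shared memory\n",
                         cudaGetErrorString(err),
                         n_threads, n_blocks, n_shared_bytes);
            return false;
        }
        return true;
    }

    int main()
    {
        return launch_rows(100000) ? 0 : 1;
    }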
@@ -206,7 +211,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
     def c_code_cache_version(self):
         #return ()
-        return (3,)
+        return (4,)

 gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias()
@@ -235,7 +240,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
     def c_code_cache_version(self):
         #return ()
-        return (5,)
+        return (6,)

     def c_code(self, node, nodename, inp, out, sub):
         dnll, sm, y_idx = inp
@@ -283,11 +288,12 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
             }
         }
         {
+            int n_blocks = std::min(CudaNdarray_HOST_DIMS(%(dx)s)[0],
+                                    NUM_VECTOR_OP_BLOCKS);
+            int n_threads = std::min(CudaNdarray_HOST_DIMS(%(dx)s)[1],256);
             kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s
-                <<<
-                    CudaNdarray_HOST_DIMS(%(dx)s)[0],
-                    std::min(CudaNdarray_HOST_DIMS(%(dx)s)[1],256)
-                >>>(
+                <<<n_blocks, n_threads>>>(
                         CudaNdarray_HOST_DIMS(%(dx)s)[0],
                         CudaNdarray_HOST_DIMS(%(dx)s)[1],
@@ -310,9 +316,11 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
             if( cudaSuccess != err)
             {
                 PyErr_Format(PyExc_RuntimeError,
-                             "Cuda error: %%s: %%s.\\n",
+                             "Cuda error: %%s: %%s.\\n"
+                             "The kernel was launched with %%d threads and"
+                             " %%d blocks\\n",
                              "kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s",
-                             cudaGetErrorString(err));
+                             cudaGetErrorString(err), n_threads, n_blocks);
                 %(fail)s;
             }
         }
@@ -25,7 +25,6 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
     This is basic test for GpuCrossentropySoftmaxArgmax1HotWithBias
     We check that we loop when their is too much threads
-    TODO: check that we loop when their is too much block(>32*1024)
     """
@@ -100,13 +99,16 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx():
     This is basic test for GpuCrossentropySoftmax1HotWithBiasDx
     We check that we loop when their is too much threads
-    TODO: check that we loop when their is too much block(>32*1024)
     """
     n_in = 1000
     batch_size = 4097
     n_out = 1250
+    if not isinstance(mode_with_gpu, theano.compile.DebugMode):
+        n_in = 4098
+        n_out = 4099
     # Seed numpy.random with config.unittests.rseed
     utt.seed_rng()
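
The new test sizes are chosen to overflow the capped launch: batch_size = 4097 needs more rows than a 4096-block grid provides (assuming NUM_VECTOR_OP_BLOCKS is 4096, which the choice of 4097 suggests), so the row loop must actually execute for the result to be correct; outside DebugMode, n_in/n_out above the 256-thread cap similarly exercise looping over columns. A toy check of that behaviour, as an illustration rather than the actual test:

    #include <cassert>
    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void mark_rows(int M, int *hit)
    {
        for (int row = blockIdx.x; row < M; row += gridDim.x)
            hit[row] = 1;
    }

    int main()
    {
        const int M = 4097;            // one row more than the grid below
        int *hit;
        cudaMallocManaged(&hit, M * sizeof(int));
        cudaMemset(hit, 0, M * sizeof(int));

        mark_rows<<<4096, 1>>>(M);     // grid deliberately smaller than M
        cudaDeviceSynchronize();

        for (int row = 0; row < M; ++row)
            assert(hit[row] == 1);     // fails if row 4096 was skipped
        std::printf("all %d rows covered\n", M);
        cudaFree(hit);
        return 0;
    }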