提交 acb1a0e1 authored 作者: Frederic's avatar Frederic

fix GpuSoftmaxWithBias when the input has size 0.

上级 f16aee3d
...@@ -60,6 +60,8 @@ Crash Fix ...@@ -60,6 +60,8 @@ Crash Fix
element-wise fusion optimization when upcasting some inputs to element-wise fusion optimization when upcasting some inputs to
float32 (to compute them on the GPU). float32 (to compute them on the GPU).
(Frederic B., reported by Sander Dieleman) (Frederic B., reported by Sander Dieleman)
* GpuSoftmaxWithBias with shape (0, N) where N > 1.
(Frédéric B., reported by Razvan P.)
============= =============
Release Notes Release Notes
......
...@@ -419,7 +419,7 @@ class GpuSoftmaxWithBias (GpuOp): ...@@ -419,7 +419,7 @@ class GpuSoftmaxWithBias (GpuOp):
return [shape[0]] return [shape[0]]
def c_code_cache_version(self): def c_code_cache_version(self):
#return () #return ()
return (4,) + inline_softmax.code_version return (5,) + inline_softmax.code_version
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
x, b = inp x, b = inp
...@@ -461,10 +461,12 @@ class GpuSoftmaxWithBias (GpuOp): ...@@ -461,10 +461,12 @@ class GpuSoftmaxWithBias (GpuOp):
//TODO, detect the maximum number of thread per block. //TODO, detect the maximum number of thread per block.
int n_threads = std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], 1024); int n_threads = std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], 1024);
int n_shared_bytes = CudaNdarray_HOST_DIMS(%(x)s)[1] * 2 * sizeof(float); int n_shared_bytes = CudaNdarray_HOST_DIMS(%(x)s)[1] * 2 * sizeof(float);
if (CudaNdarray_HOST_DIMS(%(x)s)[0] > 0)
{
kSoftmaxWithBias_%(nodename)s kSoftmaxWithBias_%(nodename)s
<<< <<<
// todo: cap these at the card limits, implement loops in kernel // todo: cap these at the card limits,
// implement loops in kernel
n_blocks, n_blocks,
n_threads, n_threads,
n_shared_bytes n_shared_bytes
...@@ -485,10 +487,14 @@ class GpuSoftmaxWithBias (GpuOp): ...@@ -485,10 +487,14 @@ class GpuSoftmaxWithBias (GpuOp):
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) if( cudaSuccess != err)
{ {
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n", "kSoftmax_%(nodename)s", cudaGetErrorString(err)); PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s.\\n",
"kSoftmaxWithBias_%(nodename)s",
cudaGetErrorString(err));
%(fail)s; %(fail)s;
} }
} }
}
assert(%(z)s); assert(%(z)s);
""" % locals() """ % locals()
......
...@@ -142,7 +142,10 @@ def test_softmax_with_bias(): ...@@ -142,7 +142,10 @@ def test_softmax_with_bias():
TODO: check that we loop when there are too many threads. (THIS IS NOT IMPLEMENTED) TODO: check that we loop when there are too many threads. (THIS IS NOT IMPLEMENTED)
""" """
x = T.fmatrix('x') x = T.fmatrix('x')
z = T.nnet.softmax_with_bias(x, T.zeros_like(x[0,:])) # We can't use zeros_like(x[0,::]) as this don't allow to test with
# 0 shape.
z = T.nnet.softmax_with_bias(x, T.alloc(numpy.asarray(0, dtype='float32'),
x.shape[1]))
f = theano.function([x],z, mode=mode_without_gpu) f = theano.function([x],z, mode=mode_without_gpu)
f_gpu = theano.function([x],z, mode=mode_with_gpu) f_gpu = theano.function([x],z, mode=mode_with_gpu)
...@@ -165,6 +168,7 @@ def test_softmax_with_bias(): ...@@ -165,6 +168,7 @@ def test_softmax_with_bias():
#we need to test n>32*1024 to check that we make the block loop. #we need to test n>32*1024 to check that we make the block loop.
cmp(2<<15, 5) cmp(2<<15, 5)
cmp(4074, 400) cmp(4074, 400)
cmp(0, 10)
cmp(4, 1000, True) cmp(4, 1000, True)
cmp(4, 1024, True) cmp(4, 1024, True)
cmp(4, 2000, True) cmp(4, 2000, True)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论