提交 41a4a100 作者: lamblin

Merge pull request #548 from nouiz/crash_size_0

Crash size 0
...@@ -60,6 +60,8 @@ Crash Fix ...@@ -60,6 +60,8 @@ Crash Fix
element-wise fusion optimization when upcasting some inputs to element-wise fusion optimization when upcasting some inputs to
float32 (to compute them on the GPU). float32 (to compute them on the GPU).
(Frederic B., reported by Sander Dieleman) (Frederic B., reported by Sander Dieleman)
* GpuSoftmaxWithBias with shape (0, N) with N > 1.
(Frédéric B., reported by Razvan P.)
============= =============
Release Notes Release Notes
......
...@@ -419,7 +419,7 @@ class GpuSoftmaxWithBias (GpuOp): ...@@ -419,7 +419,7 @@ class GpuSoftmaxWithBias (GpuOp):
return [shape[0]] return [shape[0]]
def c_code_cache_version(self): def c_code_cache_version(self):
#return () #return ()
return (4,) + inline_softmax.code_version return (5,) + inline_softmax.code_version
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
x, b = inp x, b = inp
...@@ -461,14 +461,16 @@ class GpuSoftmaxWithBias (GpuOp): ...@@ -461,14 +461,16 @@ class GpuSoftmaxWithBias (GpuOp):
//TODO, detect the maximum number of thread per block. //TODO, detect the maximum number of thread per block.
int n_threads = std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], 1024); int n_threads = std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], 1024);
int n_shared_bytes = CudaNdarray_HOST_DIMS(%(x)s)[1] * 2 * sizeof(float); int n_shared_bytes = CudaNdarray_HOST_DIMS(%(x)s)[1] * 2 * sizeof(float);
if (CudaNdarray_HOST_DIMS(%(x)s)[0] > 0)
kSoftmaxWithBias_%(nodename)s {
<<< kSoftmaxWithBias_%(nodename)s
// todo: cap these at the card limits, implement loops in kernel <<<
n_blocks, // todo: cap these at the card limits,
n_threads, // implement loops in kernel
n_shared_bytes n_blocks,
>>>( n_threads,
n_shared_bytes
>>>(
CudaNdarray_HOST_DIMS(%(x)s)[0], CudaNdarray_HOST_DIMS(%(x)s)[0],
CudaNdarray_HOST_DIMS(%(x)s)[1], CudaNdarray_HOST_DIMS(%(x)s)[1],
...@@ -480,13 +482,17 @@ class GpuSoftmaxWithBias (GpuOp): ...@@ -480,13 +482,17 @@ class GpuSoftmaxWithBias (GpuOp):
CudaNdarray_HOST_STRIDES(%(b)s)[0], CudaNdarray_HOST_STRIDES(%(b)s)[0],
CudaNdarray_DEV_DATA(%(z)s) //guarantee c contig CudaNdarray_DEV_DATA(%(z)s) //guarantee c contig
); );
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) if( cudaSuccess != err)
{ {
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n", "kSoftmax_%(nodename)s", cudaGetErrorString(err)); PyErr_Format(PyExc_RuntimeError,
%(fail)s; "Cuda error: %%s: %%s.\\n",
"kSoftmaxWithBias_%(nodename)s",
cudaGetErrorString(err));
%(fail)s;
}
} }
} }
assert(%(z)s); assert(%(z)s);
......
import theano, numpy from nose.plugins.skip import SkipTest
import numpy
import theano
import theano.tensor as T import theano.tensor as T
import theano.tests.unittest_tools as utt import theano.tests.unittest_tools as utt
# Skip test if cuda_ndarray is not available. # Skip test if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest
import theano.sandbox.cuda as cuda import theano.sandbox.cuda as cuda
if cuda.cuda_available == False: if cuda.cuda_available == False:
raise SkipTest('Optional package cuda disabled') raise SkipTest('Optional package cuda disabled')
if theano.config.mode=='FAST_COMPILE': if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu') mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpu') mode_without_gpu = theano.compile.mode.get_mode(
'FAST_RUN').excluding('gpu')
else: else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu') mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpu') mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpu')
def test_GpuCrossentropySoftmaxArgmax1HotWithBias(): def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
""" """
This is basic test for GpuCrossentropySoftmaxArgmax1HotWithBias This is basic test for GpuCrossentropySoftmaxArgmax1HotWithBias
...@@ -28,52 +32,67 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias(): ...@@ -28,52 +32,67 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
batch_size = 4097 batch_size = 4097
n_out = 1250 n_out = 1250
if theano.config.mode!="DEBUG_MODE": if theano.config.mode != "DEBUG_MODE":
n_in = 4098 n_in = 4098
n_out = 4099 n_out = 4099
x = T.fmatrix('x') x = T.fmatrix('x')
y = T.lvector('y') y = T.lvector('y')
b = T.fvector('b') b = T.fvector('b')
#W = T.fmatrix('W') #W = T.fmatrix('W')
#we precompute the dot with big shape before to allow the test of GpuCrossentropySoftmax1HotWithBiasDx to don't fail with the error (the launch timed out and was terminated) on GPU card not powerfull enought. We need the big shape to check for corner case. #we precompute the dot with big shape before to allow the test of
#GpuCrossentropySoftmax1HotWithBiasDx to don't fail with the error
#(the launch timed out and was terminated) on GPU card not
#powerfull enought. We need the big shape to check for corner
#case.
dot_result = T.fmatrix('dot_result') dot_result = T.fmatrix('dot_result')
# Seed numpy.random with config.unittests.rseed # Seed numpy.random with config.unittests.rseed
utt.seed_rng() utt.seed_rng()
xx = numpy.asarray(numpy.random.rand(batch_size,n_in),dtype=numpy.float32) xx = numpy.asarray(numpy.random.rand(batch_size, n_in),
dtype=numpy.float32)
#?????yy = numpy.ones((batch_size,),dtype='float32') #?????yy = numpy.ones((batch_size,),dtype='float32')
yy = numpy.ones((batch_size,),dtype='int32') yy = numpy.ones((batch_size,), dtype='int32')
b_values = numpy.zeros((n_out,),dtype='float32') b_values = numpy.zeros((n_out,), dtype='float32')
W_values = numpy.asarray(numpy.random.rand(n_in,n_out),dtype='float32') W_values = numpy.asarray(numpy.random.rand(n_in, n_out), dtype='float32')
dot_value = numpy.asarray(numpy.dot(xx, W_values),dtype='float32') dot_value = numpy.asarray(numpy.dot(xx, W_values), dtype='float32')
del W_values del W_values
p_y_given_x = T.nnet.softmax(dot_result+b) p_y_given_x = T.nnet.softmax(dot_result + b)
y_pred = T.argmax(p_y_given_x, axis=-1) y_pred = T.argmax(p_y_given_x, axis=-1)
loss = -T.mean(T.log(p_y_given_x)[T.arange(y.shape[0]), y]) loss = -T.mean(T.log(p_y_given_x)[T.arange(y.shape[0]), y])
dW = T.grad(loss, dot_result) dW = T.grad(loss, dot_result)
classify = theano.function( inputs = [y,b,dot_result], outputs = [loss,y_pred,dW], classify = theano.function(inputs=[y, b, dot_result],
mode = mode_without_gpu) outputs=[loss, y_pred, dW],
classify_gpu = theano.function( inputs = [y,b,dot_result], outputs = [loss,y_pred,dW], mode=mode_without_gpu)
mode = mode_with_gpu) classify_gpu = theano.function(inputs=[y, b, dot_result],
outputs=[loss, y_pred, dW],
mode=mode_with_gpu)
#theano.printing.debugprint(classify) #theano.printing.debugprint(classify)
#theano.printing.debugprint(classify_gpu) #theano.printing.debugprint(classify_gpu)
assert any([isinstance(node.op,T.nnet.CrossentropySoftmaxArgmax1HotWithBias) for node in classify.maker.env.toposort()]) assert any([isinstance(node.op,
assert any([isinstance(node.op,cuda.nnet.GpuCrossentropySoftmaxArgmax1HotWithBias) for node in classify_gpu.maker.env.toposort()]) T.nnet.CrossentropySoftmaxArgmax1HotWithBias)
for node in classify.maker.env.toposort()])
assert any([isinstance(node.op,
cuda.nnet.GpuCrossentropySoftmaxArgmax1HotWithBias)
for node in classify_gpu.maker.env.toposort()])
out = classify(yy, b_values, dot_value)
gout = classify_gpu(yy, b_values, dot_value)
out=classify(yy,b_values,dot_value) assert len(out) == len(gout) == 3
gout=classify_gpu(yy,b_values,dot_value) assert numpy.allclose(out[0], gout[0])
assert numpy.allclose(out[2], gout[2], atol=3e-6), numpy.absolute(
gout - out).max()
assert numpy.allclose(out[1], gout[1]), [(id, out[1][id], gout[1][id], val)
for id, val in enumerate(out[1] -
gout[1])
if val != 0]
assert len(out)==len(gout)==3
assert numpy.allclose(out[0],gout[0])
assert numpy.allclose(out[2],gout[2],atol=3e-6),numpy.absolute(gout-out).max()
assert numpy.allclose(out[1],gout[1]),[(id,out[1][id],gout[1][id],val) for id,val in enumerate(out[1]-gout[1]) if val!=0]
def test_GpuCrossentropySoftmax1HotWithBiasDx(): def test_GpuCrossentropySoftmax1HotWithBiasDx():
""" """
...@@ -90,24 +109,29 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx(): ...@@ -90,24 +109,29 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx():
# Seed numpy.random with config.unittests.rseed # Seed numpy.random with config.unittests.rseed
utt.seed_rng() utt.seed_rng()
softmax_output_value = numpy.random.rand(batch_size, n_out).astype('float32') softmax_output_value = numpy.random.rand(batch_size,
dnll_value = numpy.asarray(numpy.random.rand(batch_size),dtype='float32') n_out).astype('float32')
dnll_value = numpy.asarray(numpy.random.rand(batch_size), dtype='float32')
y_idx_value = numpy.random.randint(low=0, high=5, size=batch_size) y_idx_value = numpy.random.randint(low=0, high=5, size=batch_size)
softmax_output = T.fmatrix() softmax_output = T.fmatrix()
softmax_output /= softmax_output.sum(axis=1).reshape(softmax_output.shape[1],1) softmax_output /= softmax_output.sum(axis=1).reshape(
softmax_output.shape[1], 1)
op = theano.tensor.nnet.crossentropy_softmax_1hot_with_bias_dx( op = theano.tensor.nnet.crossentropy_softmax_1hot_with_bias_dx(
dnll_value, dnll_value,
softmax_output, softmax_output,
y_idx_value) y_idx_value)
cpu_f = theano.function([softmax_output],op,mode = mode_without_gpu) cpu_f = theano.function([softmax_output], op, mode=mode_without_gpu)
gpu_f = theano.function([softmax_output],op,mode = mode_with_gpu) gpu_f = theano.function([softmax_output], op, mode=mode_with_gpu)
#theano.printing.debugprint(cpu_f) #theano.printing.debugprint(cpu_f)
#theano.printing.debugprint(gpu_f) #theano.printing.debugprint(gpu_f)
assert any([isinstance(node.op,T.nnet.CrossentropySoftmax1HotWithBiasDx) for node in cpu_f.maker.env.toposort()]) assert any([isinstance(node.op, T.nnet.CrossentropySoftmax1HotWithBiasDx)
assert any([isinstance(node.op,cuda.nnet.GpuCrossentropySoftmax1HotWithBiasDx) for node in gpu_f.maker.env.toposort()]) for node in cpu_f.maker.env.toposort()])
assert any([isinstance(node.op,
cuda.nnet.GpuCrossentropySoftmax1HotWithBiasDx)
for node in gpu_f.maker.env.toposort()])
cpu_out = cpu_f(softmax_output_value) cpu_out = cpu_f(softmax_output_value)
gpu_out = gpu_f(softmax_output_value) gpu_out = gpu_f(softmax_output_value)
...@@ -116,10 +140,11 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx(): ...@@ -116,10 +140,11 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx():
atol = 1e-6 atol = 1e-6
if not numpy.allclose(cpu_out, gpu_out, rtol=rtol, atol=atol): if not numpy.allclose(cpu_out, gpu_out, rtol=rtol, atol=atol):
abs_err, rel_err = T.numeric_grad.abs_rel_err(cpu_out, gpu_out) abs_err, rel_err = T.numeric_grad.abs_rel_err(cpu_out, gpu_out)
scaled_err = numpy.minimum(abs_err/atol, rel_err/rtol) scaled_err = numpy.minimum(abs_err / atol, rel_err / rtol)
max_i = scaled_err.argmax() max_i = scaled_err.argmax()
print 'max err index:', max_i, max_i / batch_size, max_i % batch_size, max_i / n_out, max_i & n_out print 'max err index:', max_i, max_i / batch_size,
print max_i % batch_size, max_i / n_out, max_i & n_out
print 'At that index:' print 'At that index:'
print 'err:', scaled_err.flatten()[max_i] print 'err:', scaled_err.flatten()[max_i]
print 'absolute error:', abs_err.flatten()[max_i] print 'absolute error:', abs_err.flatten()[max_i]
...@@ -139,69 +164,84 @@ def test_softmax_with_bias(): ...@@ -139,69 +164,84 @@ def test_softmax_with_bias():
This is basic test for GpuSoftmaxWithBias This is basic test for GpuSoftmaxWithBias
We check that we loop when their is too much block We check that we loop when their is too much block
TODO: check that we loop when their is too much thread.(THIS IS NOT IMPLEMENTED)
TODO: check that we loop when their is too much thread.(THIS IS
NOT IMPLEMENTED)
""" """
x = T.fmatrix('x') x = T.fmatrix('x')
z = T.nnet.softmax_with_bias(x, T.zeros_like(x[0,:])) # We can't use zeros_like(x[0,::]) as this don't allow to test with
# 0 shape.
f = theano.function([x],z, mode=mode_without_gpu) z = T.nnet.softmax_with_bias(x, T.alloc(numpy.asarray(0, dtype='float32'),
f_gpu = theano.function([x],z, mode=mode_with_gpu) x.shape[1]))
assert f.maker.env.toposort()[-1].op==T.nnet.softmax_with_bias
assert isinstance(f_gpu.maker.env.toposort()[-2].op,cuda.nnet.GpuSoftmaxWithBias) f = theano.function([x], z, mode=mode_without_gpu)
f_gpu = theano.function([x], z, mode=mode_with_gpu)
def cmp(n,m, catch=False): assert f.maker.env.toposort()[-1].op == T.nnet.softmax_with_bias
"""Some old card won't accet the configuration arguments of this implementation.""" assert isinstance(f_gpu.maker.env.toposort()[-2].op,
cuda.nnet.GpuSoftmaxWithBias)
def cmp(n, m, catch=False):
"""Some old card won't accet the configuration arguments of
this implementation."""
try: try:
#print "test_softmax",n,m #print "test_softmax",n,m
data = numpy.arange(n*m, dtype='float32').reshape(n,m) data = numpy.arange(n * m, dtype='float32').reshape(n, m)
out=f(data) out = f(data)
gout=f_gpu(data) gout = f_gpu(data)
assert numpy.allclose(out,gout),numpy.absolute(out-gout) assert numpy.allclose(out, gout), numpy.absolute(out - gout)
except RuntimeError, e: except RuntimeError, e:
if not catch: if not catch:
raise raise
assert e.args[0]=='Cuda error: kSoftmax_node_0: invalid configuration argument.\n' assert (e.args[0] ==
'Cuda error: kSoftmax_node_0: invalid configuration argument.\n')
cmp(2, 5) cmp(2, 5)
#we need to test n>32*1024 to check that we make the block loop. #we need to test n>32*1024 to check that we make the block loop.
cmp(2<<15, 5) cmp(2 << 15, 5)
cmp(4074, 400) cmp(4074, 400)
cmp(0, 10)
cmp(4, 1000, True) cmp(4, 1000, True)
cmp(4, 1024, True) cmp(4, 1024, True)
cmp(4, 2000, True) cmp(4, 2000, True)
cmp(4, 2024, True) cmp(4, 2024, True)
cmp(4, 4074, True) cmp(4, 4074, True)
def test_softmax(): def test_softmax():
""" """
This is basic test for GpuSoftmax This is basic test for GpuSoftmax
We check that we loop when their is too much block We check that we loop when their is too much block
TODO: check that we loop when their is too much thread.(THIS IS NOT IMPLEMENTED)
TODO: check that we loop when their is too much thread.(THIS IS
NOT IMPLEMENTED)
""" """
x = T.fmatrix('x') x = T.fmatrix('x')
z = T.nnet.softmax(x) z = T.nnet.softmax(x)
f = theano.function([x],z, mode=mode_without_gpu) f = theano.function([x], z, mode=mode_without_gpu)
f_gpu = theano.function([x],z, mode=mode_with_gpu) f_gpu = theano.function([x], z, mode=mode_with_gpu)
assert f.maker.env.toposort()[-1].op==T.nnet.softmax assert f.maker.env.toposort()[-1].op == T.nnet.softmax
assert isinstance(f_gpu.maker.env.toposort()[-2].op,cuda.nnet.GpuSoftmax) assert isinstance(f_gpu.maker.env.toposort()[-2].op,
cuda.nnet.GpuSoftmax)
def cmp(n,m, catch=False):
"""Some old card won't accet the configuration arguments of this implementation.""" def cmp(n, m, catch=False):
"""Some old card won't accet the configuration arguments of
this implementation."""
try: try:
#print "test_softmax",n,m #print "test_softmax",n,m
data = numpy.arange(n*m, dtype='float32').reshape(n,m) data = numpy.arange(n * m, dtype='float32').reshape(n, m)
out=f(data) out = f(data)
gout=f_gpu(data) gout = f_gpu(data)
assert numpy.allclose(out,gout),numpy.absolute(out-gout) assert numpy.allclose(out, gout), numpy.absolute(out - gout)
except RuntimeError, e: except RuntimeError, e:
if not catch: if not catch:
raise raise
assert e.args[0]=='Cuda error: kSoftmax_node_0: invalid configuration argument.\n' assert (e.args[0] ==
'Cuda error: kSoftmax_node_0: invalid configuration argument.\n')
#we need to test n>32*1024 to check that we make the block loop. #we need to test n>32*1024 to check that we make the block loop.
cmp(2, 5) cmp(2, 5)
cmp(2<<15, 5) cmp(2 << 15, 5)
cmp(4074, 400) cmp(4074, 400)
cmp(4, 1000, True) cmp(4, 1000, True)
cmp(4, 1024, True) cmp(4, 1024, True)
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论