提交 41a4a100 authored 作者: lamblin's avatar lamblin

Merge pull request #548 from nouiz/crash_size_0

Crash size 0
......@@ -60,6 +60,8 @@ Crash Fix
element-wise fusion optimization when upcasting some inputs to
float32 (to compute them on the GPU).
(Frederic B., reported by Sander Dieleman)
* GpuSoftmaxWithBias with shape (0, N) with N > 1.
(Frédéric B., reported by Razvan P.)
=============
Release Notes
......
......@@ -419,7 +419,7 @@ class GpuSoftmaxWithBias (GpuOp):
return [shape[0]]
def c_code_cache_version(self):
#return ()
return (4,) + inline_softmax.code_version
return (5,) + inline_softmax.code_version
def c_code(self, node, nodename, inp, out, sub):
x, b = inp
......@@ -461,14 +461,16 @@ class GpuSoftmaxWithBias (GpuOp):
//TODO, detect the maximum number of thread per block.
int n_threads = std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], 1024);
int n_shared_bytes = CudaNdarray_HOST_DIMS(%(x)s)[1] * 2 * sizeof(float);
kSoftmaxWithBias_%(nodename)s
<<<
// todo: cap these at the card limits, implement loops in kernel
n_blocks,
n_threads,
n_shared_bytes
>>>(
if (CudaNdarray_HOST_DIMS(%(x)s)[0] > 0)
{
kSoftmaxWithBias_%(nodename)s
<<<
// todo: cap these at the card limits,
// implement loops in kernel
n_blocks,
n_threads,
n_shared_bytes
>>>(
CudaNdarray_HOST_DIMS(%(x)s)[0],
CudaNdarray_HOST_DIMS(%(x)s)[1],
......@@ -480,13 +482,17 @@ class GpuSoftmaxWithBias (GpuOp):
CudaNdarray_HOST_STRIDES(%(b)s)[0],
CudaNdarray_DEV_DATA(%(z)s) //guarantee c contig
);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n", "kSoftmax_%(nodename)s", cudaGetErrorString(err));
%(fail)s;
);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s.\\n",
"kSoftmaxWithBias_%(nodename)s",
cudaGetErrorString(err));
%(fail)s;
}
}
}
assert(%(z)s);
......
import theano, numpy
from nose.plugins.skip import SkipTest
import numpy
import theano
import theano.tensor as T
import theano.tests.unittest_tools as utt
# Skip test if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest
import theano.sandbox.cuda as cuda
if cuda.cuda_available == False:
raise SkipTest('Optional package cuda disabled')
if theano.config.mode=='FAST_COMPILE':
if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpu')
mode_without_gpu = theano.compile.mode.get_mode(
'FAST_RUN').excluding('gpu')
else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpu')
def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
"""
This is basic test for GpuCrossentropySoftmaxArgmax1HotWithBias
......@@ -28,52 +32,67 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
batch_size = 4097
n_out = 1250
if theano.config.mode!="DEBUG_MODE":
if theano.config.mode != "DEBUG_MODE":
n_in = 4098
n_out = 4099
x = T.fmatrix('x')
y = T.lvector('y')
b = T.fvector('b')
#W = T.fmatrix('W')
#we precompute the dot with big shape before to allow the test of GpuCrossentropySoftmax1HotWithBiasDx to don't fail with the error (the launch timed out and was terminated) on GPU card not powerfull enought. We need the big shape to check for corner case.
#we precompute the dot product with a big shape beforehand so that the
#test of GpuCrossentropySoftmax1HotWithBiasDx does not fail with the
#error (the launch timed out and was terminated) on GPU cards that
#are not powerful enough. We need the big shape to check for corner
#cases.
dot_result = T.fmatrix('dot_result')
# Seed numpy.random with config.unittests.rseed
utt.seed_rng()
xx = numpy.asarray(numpy.random.rand(batch_size,n_in),dtype=numpy.float32)
xx = numpy.asarray(numpy.random.rand(batch_size, n_in),
dtype=numpy.float32)
#?????yy = numpy.ones((batch_size,),dtype='float32')
yy = numpy.ones((batch_size,),dtype='int32')
b_values = numpy.zeros((n_out,),dtype='float32')
W_values = numpy.asarray(numpy.random.rand(n_in,n_out),dtype='float32')
yy = numpy.ones((batch_size,), dtype='int32')
b_values = numpy.zeros((n_out,), dtype='float32')
W_values = numpy.asarray(numpy.random.rand(n_in, n_out), dtype='float32')
dot_value = numpy.asarray(numpy.dot(xx, W_values),dtype='float32')
dot_value = numpy.asarray(numpy.dot(xx, W_values), dtype='float32')
del W_values
p_y_given_x = T.nnet.softmax(dot_result+b)
p_y_given_x = T.nnet.softmax(dot_result + b)
y_pred = T.argmax(p_y_given_x, axis=-1)
loss = -T.mean(T.log(p_y_given_x)[T.arange(y.shape[0]), y])
dW = T.grad(loss, dot_result)
classify = theano.function( inputs = [y,b,dot_result], outputs = [loss,y_pred,dW],
mode = mode_without_gpu)
classify_gpu = theano.function( inputs = [y,b,dot_result], outputs = [loss,y_pred,dW],
mode = mode_with_gpu)
classify = theano.function(inputs=[y, b, dot_result],
outputs=[loss, y_pred, dW],
mode=mode_without_gpu)
classify_gpu = theano.function(inputs=[y, b, dot_result],
outputs=[loss, y_pred, dW],
mode=mode_with_gpu)
#theano.printing.debugprint(classify)
#theano.printing.debugprint(classify_gpu)
assert any([isinstance(node.op,T.nnet.CrossentropySoftmaxArgmax1HotWithBias) for node in classify.maker.env.toposort()])
assert any([isinstance(node.op,cuda.nnet.GpuCrossentropySoftmaxArgmax1HotWithBias) for node in classify_gpu.maker.env.toposort()])
assert any([isinstance(node.op,
T.nnet.CrossentropySoftmaxArgmax1HotWithBias)
for node in classify.maker.env.toposort()])
assert any([isinstance(node.op,
cuda.nnet.GpuCrossentropySoftmaxArgmax1HotWithBias)
for node in classify_gpu.maker.env.toposort()])
out = classify(yy, b_values, dot_value)
gout = classify_gpu(yy, b_values, dot_value)
out=classify(yy,b_values,dot_value)
gout=classify_gpu(yy,b_values,dot_value)
assert len(out) == len(gout) == 3
assert numpy.allclose(out[0], gout[0])
assert numpy.allclose(out[2], gout[2], atol=3e-6), numpy.absolute(
gout - out).max()
assert numpy.allclose(out[1], gout[1]), [(id, out[1][id], gout[1][id], val)
for id, val in enumerate(out[1] -
gout[1])
if val != 0]
assert len(out)==len(gout)==3
assert numpy.allclose(out[0],gout[0])
assert numpy.allclose(out[2],gout[2],atol=3e-6),numpy.absolute(gout-out).max()
assert numpy.allclose(out[1],gout[1]),[(id,out[1][id],gout[1][id],val) for id,val in enumerate(out[1]-gout[1]) if val!=0]
def test_GpuCrossentropySoftmax1HotWithBiasDx():
"""
......@@ -90,24 +109,29 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx():
# Seed numpy.random with config.unittests.rseed
utt.seed_rng()
softmax_output_value = numpy.random.rand(batch_size, n_out).astype('float32')
dnll_value = numpy.asarray(numpy.random.rand(batch_size),dtype='float32')
softmax_output_value = numpy.random.rand(batch_size,
n_out).astype('float32')
dnll_value = numpy.asarray(numpy.random.rand(batch_size), dtype='float32')
y_idx_value = numpy.random.randint(low=0, high=5, size=batch_size)
softmax_output = T.fmatrix()
softmax_output /= softmax_output.sum(axis=1).reshape(softmax_output.shape[1],1)
softmax_output /= softmax_output.sum(axis=1).reshape(
softmax_output.shape[1], 1)
op = theano.tensor.nnet.crossentropy_softmax_1hot_with_bias_dx(
dnll_value,
softmax_output,
y_idx_value)
cpu_f = theano.function([softmax_output],op,mode = mode_without_gpu)
gpu_f = theano.function([softmax_output],op,mode = mode_with_gpu)
cpu_f = theano.function([softmax_output], op, mode=mode_without_gpu)
gpu_f = theano.function([softmax_output], op, mode=mode_with_gpu)
#theano.printing.debugprint(cpu_f)
#theano.printing.debugprint(gpu_f)
assert any([isinstance(node.op,T.nnet.CrossentropySoftmax1HotWithBiasDx) for node in cpu_f.maker.env.toposort()])
assert any([isinstance(node.op,cuda.nnet.GpuCrossentropySoftmax1HotWithBiasDx) for node in gpu_f.maker.env.toposort()])
assert any([isinstance(node.op, T.nnet.CrossentropySoftmax1HotWithBiasDx)
for node in cpu_f.maker.env.toposort()])
assert any([isinstance(node.op,
cuda.nnet.GpuCrossentropySoftmax1HotWithBiasDx)
for node in gpu_f.maker.env.toposort()])
cpu_out = cpu_f(softmax_output_value)
gpu_out = gpu_f(softmax_output_value)
......@@ -116,10 +140,11 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx():
atol = 1e-6
if not numpy.allclose(cpu_out, gpu_out, rtol=rtol, atol=atol):
abs_err, rel_err = T.numeric_grad.abs_rel_err(cpu_out, gpu_out)
scaled_err = numpy.minimum(abs_err/atol, rel_err/rtol)
scaled_err = numpy.minimum(abs_err / atol, rel_err / rtol)
max_i = scaled_err.argmax()
print 'max err index:', max_i, max_i / batch_size, max_i % batch_size, max_i / n_out, max_i & n_out
print 'max err index:', max_i, max_i / batch_size,
print max_i % batch_size, max_i / n_out, max_i & n_out
print 'At that index:'
print 'err:', scaled_err.flatten()[max_i]
print 'absolute error:', abs_err.flatten()[max_i]
......@@ -139,69 +164,84 @@ def test_softmax_with_bias():
This is basic test for GpuSoftmaxWithBias
We check that we loop when there are too many blocks
TODO: check that we loop when their is too much thread.(THIS IS NOT IMPLEMENTED)
TODO: check that we loop when there are too many threads. (THIS IS
NOT IMPLEMENTED)
"""
x = T.fmatrix('x')
z = T.nnet.softmax_with_bias(x, T.zeros_like(x[0,:]))
f = theano.function([x],z, mode=mode_without_gpu)
f_gpu = theano.function([x],z, mode=mode_with_gpu)
assert f.maker.env.toposort()[-1].op==T.nnet.softmax_with_bias
assert isinstance(f_gpu.maker.env.toposort()[-2].op,cuda.nnet.GpuSoftmaxWithBias)
def cmp(n,m, catch=False):
"""Some old card won't accet the configuration arguments of this implementation."""
# We can't use zeros_like(x[0,::]) as this doesn't allow testing with
# a shape of 0.
z = T.nnet.softmax_with_bias(x, T.alloc(numpy.asarray(0, dtype='float32'),
x.shape[1]))
f = theano.function([x], z, mode=mode_without_gpu)
f_gpu = theano.function([x], z, mode=mode_with_gpu)
assert f.maker.env.toposort()[-1].op == T.nnet.softmax_with_bias
assert isinstance(f_gpu.maker.env.toposort()[-2].op,
cuda.nnet.GpuSoftmaxWithBias)
def cmp(n, m, catch=False):
"""Some old card won't accet the configuration arguments of
this implementation."""
try:
#print "test_softmax",n,m
data = numpy.arange(n*m, dtype='float32').reshape(n,m)
out=f(data)
gout=f_gpu(data)
assert numpy.allclose(out,gout),numpy.absolute(out-gout)
data = numpy.arange(n * m, dtype='float32').reshape(n, m)
out = f(data)
gout = f_gpu(data)
assert numpy.allclose(out, gout), numpy.absolute(out - gout)
except RuntimeError, e:
if not catch:
raise
assert e.args[0]=='Cuda error: kSoftmax_node_0: invalid configuration argument.\n'
assert (e.args[0] ==
'Cuda error: kSoftmax_node_0: invalid configuration argument.\n')
cmp(2, 5)
#we need to test n>32*1024 to check that we make the block loop.
cmp(2<<15, 5)
cmp(2 << 15, 5)
cmp(4074, 400)
cmp(0, 10)
cmp(4, 1000, True)
cmp(4, 1024, True)
cmp(4, 2000, True)
cmp(4, 2024, True)
cmp(4, 4074, True)
def test_softmax():
"""
This is basic test for GpuSoftmax
We check that we loop when there are too many blocks
TODO: check that we loop when their is too much thread.(THIS IS NOT IMPLEMENTED)
TODO: check that we loop when there are too many threads. (THIS IS
NOT IMPLEMENTED)
"""
x = T.fmatrix('x')
z = T.nnet.softmax(x)
f = theano.function([x],z, mode=mode_without_gpu)
f_gpu = theano.function([x],z, mode=mode_with_gpu)
assert f.maker.env.toposort()[-1].op==T.nnet.softmax
assert isinstance(f_gpu.maker.env.toposort()[-2].op,cuda.nnet.GpuSoftmax)
def cmp(n,m, catch=False):
"""Some old card won't accet the configuration arguments of this implementation."""
f = theano.function([x], z, mode=mode_without_gpu)
f_gpu = theano.function([x], z, mode=mode_with_gpu)
assert f.maker.env.toposort()[-1].op == T.nnet.softmax
assert isinstance(f_gpu.maker.env.toposort()[-2].op,
cuda.nnet.GpuSoftmax)
def cmp(n, m, catch=False):
"""Some old card won't accet the configuration arguments of
this implementation."""
try:
#print "test_softmax",n,m
data = numpy.arange(n*m, dtype='float32').reshape(n,m)
out=f(data)
gout=f_gpu(data)
assert numpy.allclose(out,gout),numpy.absolute(out-gout)
data = numpy.arange(n * m, dtype='float32').reshape(n, m)
out = f(data)
gout = f_gpu(data)
assert numpy.allclose(out, gout), numpy.absolute(out - gout)
except RuntimeError, e:
if not catch:
raise
assert e.args[0]=='Cuda error: kSoftmax_node_0: invalid configuration argument.\n'
assert (e.args[0] ==
'Cuda error: kSoftmax_node_0: invalid configuration argument.\n')
#we need to test n>32*1024 to check that we make the block loop.
cmp(2, 5)
cmp(2<<15, 5)
cmp(2 << 15, 5)
cmp(4074, 400)
cmp(4, 1000, True)
cmp(4, 1024, True)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论