提交 b7e7be45 authored 作者: lamblin's avatar lamblin

Merge pull request #1170 from nouiz/denormal

Denormal
...@@ -284,6 +284,14 @@ Tips for Improving Performance on GPU ...@@ -284,6 +284,14 @@ Tips for Improving Performance on GPU
Check the line similar to *Spent Xs(X%) in cpu op, Xs(X%) in gpu op and Xs(X%) in transfer op*. Check the line similar to *Spent Xs(X%) in cpu op, Xs(X%) in gpu op and Xs(X%) in transfer op*.
This can tell you if not enough of your graph is on the GPU or if there This can tell you if not enough of your graph is on the GPU or if there
is too much memory transfer. is too much memory transfer.
* Use nvcc options. nvcc support those options to speed up some
computations: `-ftz=true` to `flush denormals values to
zeros. <https://developer.nvidia.com/content/cuda-pro-tip-flush-denormals-confidence>`_,
`--prec-div=false` and `--prec-sqrt=false` option to speed up
division and square root operation by being less precise. You can
enable all of them with with the `nvcc.flags=--use_fast_math` Theano
flags or you can enable them individually as in this example
`nvcc.flags=-ftz=true --prec-div=false`.
.. _gpu_async: .. _gpu_async:
......
...@@ -5,23 +5,30 @@ import StringIO ...@@ -5,23 +5,30 @@ import StringIO
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import GpuOp from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.kernel_codegen import nvcc_kernel, inline_reduce_max, inline_reduce_sum, inline_softmax from theano.sandbox.cuda.kernel_codegen import (nvcc_kernel, inline_reduce_max,
inline_reduce_sum,
inline_softmax)
class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp): class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
""" """
Implement CrossentropySoftmaxArgmax1HotWithBias on the gpu. Implement CrossentropySoftmaxArgmax1HotWithBias on the gpu.
""" """
nin=3 nin = 3
nout=3 nout = 3
def __eq__(self, other): def __eq__(self, other):
return type(self) == type(other) return type(self) == type(other)
def __hash__(self): def __hash__(self):
return hash(type(self)) return hash(type(self))
def __str__(self): def __str__(self):
return self.__class__.__name__ return self.__class__.__name__
def make_node(self, x, b, y_idx): def make_node(self, x, b, y_idx):
nll = y_idx.type() #N.B. won't work when we don't cast y_idx to float anymore #N.B. won't work when we don't cast y_idx to float anymore
nll = y_idx.type()
sm = x.type() sm = x.type()
am = y_idx.type() am = y_idx.type()
return Apply(self, [x, b, y_idx], [nll, sm, am]) return Apply(self, [x, b, y_idx], [nll, sm, am])
...@@ -85,7 +92,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp): ...@@ -85,7 +92,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
x, b, y_idx = inp x, b, y_idx = inp
nll, sm, am = out nll, sm, am = out
classname=self.__class__.__name__ classname = self.__class__.__name__
fail = sub['fail'] fail = sub['fail']
sio = StringIO.StringIO() sio = StringIO.StringIO()
print >> sio, """ print >> sio, """
...@@ -106,12 +113,14 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp): ...@@ -106,12 +113,14 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
} }
if (CudaNdarray_HOST_DIMS(%(x)s)[0] != CudaNdarray_HOST_DIMS(%(y_idx)s)[0]) if (CudaNdarray_HOST_DIMS(%(x)s)[0] != CudaNdarray_HOST_DIMS(%(y_idx)s)[0])
{ {
PyErr_SetString(PyExc_ValueError, "dimension mismatch in x,y_idx arguments"); PyErr_SetString(PyExc_ValueError,
"dimension mismatch in x,y_idx arguments");
%(fail)s; %(fail)s;
} }
if (CudaNdarray_HOST_DIMS(%(x)s)[1] != CudaNdarray_HOST_DIMS(%(b)s)[0]) if (CudaNdarray_HOST_DIMS(%(x)s)[1] != CudaNdarray_HOST_DIMS(%(b)s)[0])
{ {
PyErr_SetString(PyExc_ValueError, "dimension mismatch in x,b arguments"); PyErr_SetString(PyExc_ValueError,
"dimension mismatch in x,b arguments");
%(fail)s; %(fail)s;
} }
if ((NULL == %(nll)s) //initial condition if ((NULL == %(nll)s) //initial condition
...@@ -132,7 +141,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp): ...@@ -132,7 +141,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
%(sm)s = (CudaNdarray*) CudaNdarray_NewDims(2, CudaNdarray_HOST_DIMS(%(x)s)); %(sm)s = (CudaNdarray*) CudaNdarray_NewDims(2, CudaNdarray_HOST_DIMS(%(x)s));
if(!%(sm)s) if(!%(sm)s)
{ {
PyErr_SetString(PyExc_MemoryError, "failed to alloc sm output"); PyErr_SetString(PyExc_MemoryError,
"failed to alloc sm output");
// no need to decref cnda_nll, the cleanup code should pick it up. // no need to decref cnda_nll, the cleanup code should pick it up.
%(fail)s; %(fail)s;
} }
...@@ -144,7 +154,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp): ...@@ -144,7 +154,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
%(am)s = (CudaNdarray*) CudaNdarray_NewDims(1, CudaNdarray_HOST_DIMS(%(y_idx)s)); %(am)s = (CudaNdarray*) CudaNdarray_NewDims(1, CudaNdarray_HOST_DIMS(%(y_idx)s));
if(!%(am)s) if(!%(am)s)
{ {
PyErr_SetString(PyExc_MemoryError, "failed to alloc am output"); PyErr_SetString(PyExc_MemoryError,
"failed to alloc am output");
// no need to decref nll amd sm, the cleanup code should pick it up. // no need to decref nll amd sm, the cleanup code should pick it up.
%(fail)s; %(fail)s;
} }
...@@ -167,7 +178,9 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp): ...@@ -167,7 +178,9 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if (cudaSuccess != err) if (cudaSuccess != err)
{ {
PyErr_Format(PyExc_RuntimeError, "Cuda error: %(classname)s %(nodename)s: %%s.\\n", cudaGetErrorString(err)); PyErr_Format(PyExc_RuntimeError,
"Cuda error: %(classname)s %(nodename)s: %%s.\\n",
cudaGetErrorString(err));
// no need to decref output vars the cleanup code should pick them up. // no need to decref output vars the cleanup code should pick them up.
%(fail)s; %(fail)s;
} }
...@@ -181,26 +194,33 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp): ...@@ -181,26 +194,33 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias() gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias()
class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp): class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
""" """
Implement CrossentropySoftmax1HotWithBiasDx on the gpu. Implement CrossentropySoftmax1HotWithBiasDx on the gpu.
""" """
nin=3 nin = 3
nout=1 nout = 1
"""Gradient wrt x of the CrossentropySoftmax1Hot Op""" """Gradient wrt x of the CrossentropySoftmax1Hot Op"""
def __init__(self, **kwargs): def __init__(self, **kwargs):
Op.__init__(self,**kwargs) Op.__init__(self,**kwargs)
def __eq__(self, other): def __eq__(self, other):
return type(self) == type(other) return type(self) == type(other)
def __hash__(self): def __hash__(self):
return hash(type(self)) return hash(type(self))
def __str__(self): def __str__(self):
return self.__class__.__name__ return self.__class__.__name__
def make_node(self, dy, sm, y_idx): def make_node(self, dy, sm, y_idx):
return Apply(self, [dy, sm, y_idx],[sm.type()]) return Apply(self, [dy, sm, y_idx], [sm.type()])
def c_code_cache_version(self): def c_code_cache_version(self):
#return () #return ()
return (5,) return (5,)
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
dnll, sm, y_idx = inp dnll, sm, y_idx = inp
dx, = out dx, = out
...@@ -221,7 +241,8 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp): ...@@ -221,7 +241,8 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
} }
if (CudaNdarray_HOST_DIMS(%(dnll)s)[0] != CudaNdarray_HOST_DIMS(%(y_idx)s)[0]) if (CudaNdarray_HOST_DIMS(%(dnll)s)[0] != CudaNdarray_HOST_DIMS(%(y_idx)s)[0])
{ {
PyErr_SetString(PyExc_ValueError, "dnll.shape[0] != y_idx.shape[0]"); PyErr_SetString(PyExc_ValueError,
"dnll.shape[0] != y_idx.shape[0]");
%(fail)s; %(fail)s;
} }
if ((NULL == %(dx)s) if ((NULL == %(dx)s)
...@@ -265,7 +286,10 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp): ...@@ -265,7 +286,10 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) if( cudaSuccess != err)
{ {
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n", "kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s", cudaGetErrorString(err)); PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s.\\n",
"kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s",
cudaGetErrorString(err));
%(fail)s; %(fail)s;
} }
} }
...@@ -305,23 +329,30 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp): ...@@ -305,23 +329,30 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
gpu_crossentropy_softmax_1hot_with_bias_dx = GpuCrossentropySoftmax1HotWithBiasDx() gpu_crossentropy_softmax_1hot_with_bias_dx = GpuCrossentropySoftmax1HotWithBiasDx()
class GpuSoftmax (GpuOp): class GpuSoftmax (GpuOp):
""" """
Implement Softmax on the gpu. Implement Softmax on the gpu.
""" """
def __eq__(self, other): def __eq__(self, other):
return type(self) == type(other) return type(self) == type(other)
def __hash__(self): def __hash__(self):
return hash(type(self)) return hash(type(self))
def __str__(self): def __str__(self):
return self.__class__.__name__ return self.__class__.__name__
def make_node(self, x): def make_node(self, x):
return Apply(self, [x],[x.type()]) return Apply(self, [x], [x.type()])
def infer_shape(self, node, shape): def infer_shape(self, node, shape):
return shape return shape
def c_code_cache_version(self): def c_code_cache_version(self):
#return () #return ()
return (5,) + inline_softmax.code_version return (7,) + inline_softmax.code_version
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
x, = inp x, = inp
z, = out z, = out
...@@ -332,14 +363,15 @@ class GpuSoftmax (GpuOp): ...@@ -332,14 +363,15 @@ class GpuSoftmax (GpuOp):
PyErr_SetString(PyExc_ValueError, "rank error"); PyErr_SetString(PyExc_ValueError, "rank error");
%(fail)s; %(fail)s;
} }
if ((NULL == %(z)s) if ((NULL == %(z)s) ||
|| (CudaNdarray_HOST_DIMS(%(z)s)[0] != CudaNdarray_HOST_DIMS(%(x)s)[0]) (CudaNdarray_HOST_DIMS(%(z)s)[0] != CudaNdarray_HOST_DIMS(%(x)s)[0]) ||
|| (CudaNdarray_HOST_DIMS(%(z)s)[1] != CudaNdarray_HOST_DIMS(%(x)s)[1])) (CudaNdarray_HOST_DIMS(%(z)s)[1] != CudaNdarray_HOST_DIMS(%(x)s)[1]))
{ {
Py_XDECREF(%(z)s); Py_XDECREF(%(z)s);
%(z)s = (CudaNdarray*)CudaNdarray_New(); %(z)s = (CudaNdarray*)CudaNdarray_New();
if ((NULL == %(z)s) if ((NULL == %(z)s)
|| CudaNdarray_alloc_contiguous(%(z)s, 2, CudaNdarray_HOST_DIMS(%(x)s))) || CudaNdarray_alloc_contiguous(%(z)s, 2,
CudaNdarray_HOST_DIMS(%(x)s)))
{ {
Py_XDECREF(%(z)s); Py_XDECREF(%(z)s);
%(z)s = NULL; %(z)s = NULL;
...@@ -347,42 +379,48 @@ class GpuSoftmax (GpuOp): ...@@ -347,42 +379,48 @@ class GpuSoftmax (GpuOp):
} }
} }
{ {
int n_blocks = std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],32*1024); int n_blocks = std::min(CudaNdarray_HOST_DIMS(%(x)s)[0], 32 * 1024);
//TODO, detect the maximum number of thread per block. //TODO, detect the maximum number of thread per block.
int n_threads = std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], 1024); int n_threads = std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], 512);
int n_shared_bytes = CudaNdarray_HOST_DIMS(%(x)s)[1] * 2 * sizeof(float); int n_shared_bytes = CudaNdarray_HOST_DIMS(%(x)s)[1] * 2 * sizeof(float);
kSoftmax_%(nodename)s if (CudaNdarray_HOST_DIMS(%(x)s)[0] > 0)
<<< {
// todo: cap these at the card limits, implement loops in kernel kSoftmax_%(nodename)s
n_blocks, <<<
n_threads, n_blocks,
n_shared_bytes n_threads,
>>>( n_shared_bytes
CudaNdarray_HOST_DIMS(%(x)s)[0], >>>(
CudaNdarray_HOST_DIMS(%(x)s)[1], CudaNdarray_HOST_DIMS(%(x)s)[0],
CudaNdarray_HOST_DIMS(%(x)s)[1],
CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_HOST_STRIDES(%(x)s)[0], CudaNdarray_HOST_STRIDES(%(x)s)[0],
CudaNdarray_HOST_STRIDES(%(x)s)[1], CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_DEV_DATA(%(z)s), CudaNdarray_DEV_DATA(%(z)s),
CudaNdarray_HOST_STRIDES(%(z)s)[0], CudaNdarray_HOST_STRIDES(%(z)s)[0],
CudaNdarray_HOST_STRIDES(%(z)s)[1] CudaNdarray_HOST_STRIDES(%(z)s)[1]
); );
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) if( cudaSuccess != err)
{ {
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n", "kSoftmax_%(nodename)s", cudaGetErrorString(err)); PyErr_Format(PyExc_RuntimeError,
%(fail)s; "Cuda error: %%s: %%s.\\n Used %%d blocks,"
" %%d threads %%d bytes of shared memory",
"kSoftmax_%(nodename)s", cudaGetErrorString(err),
n_blocks, n_threads, n_shared_bytes);
%(fail)s;
}
} }
} }
assert(%(z)s); assert(%(z)s);
""" % locals() """ % locals()
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
return nvcc_kernel("kSoftmax_%s"%nodename, return nvcc_kernel("kSoftmax_%s" % nodename,
params=['int M', 'int N', params=['int M', 'int N',
'const float * x', 'const int sx0', 'const int sx1', 'const float * x', 'const int sx0', 'const int sx1',
'float * sm', 'const int sm_s0', 'const int sm_s1'], 'float * sm', 'const int sm_s0', 'const int sm_s1'],
...@@ -395,9 +433,11 @@ class GpuSoftmax (GpuOp): ...@@ -395,9 +433,11 @@ class GpuSoftmax (GpuOp):
"buf2[tx] = buf[tx]", "buf2[tx] = buf[tx]",
"}", "}",
"__syncthreads()", "__syncthreads()",
inline_softmax('N', 'buf', 'buf2', 'threadIdx.x', 'blockDim.x'), inline_softmax('N', 'buf', 'buf2',
'threadIdx.x', 'blockDim.x'),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){", "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]",# This set all value correctly # This set all value correctly
"sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]",
"}", "}",
"__syncthreads()", "__syncthreads()",
"}", "}",
...@@ -405,25 +445,32 @@ class GpuSoftmax (GpuOp): ...@@ -405,25 +445,32 @@ class GpuSoftmax (GpuOp):
gpu_softmax = GpuSoftmax() gpu_softmax = GpuSoftmax()
class GpuSoftmaxWithBias (GpuOp): class GpuSoftmaxWithBias (GpuOp):
""" """
Implement SoftmaxWithBias on the gpu. Implement SoftmaxWithBias on the gpu.
""" """
nin = 2 nin = 2
nout = 1 nout = 1
def __eq__(self, other): def __eq__(self, other):
return type(self) == type(other) return type(self) == type(other)
def __hash__(self): def __hash__(self):
return hash(type(self)) return hash(type(self))
def __str__(self): def __str__(self):
return self.__class__.__name__ return self.__class__.__name__
def make_node(self, x, b): def make_node(self, x, b):
return Apply(self, [x, b],[x.type()]) return Apply(self, [x, b], [x.type()])
def infer_shape(self, node, shape): def infer_shape(self, node, shape):
return [shape[0]] return [shape[0]]
def c_code_cache_version(self): def c_code_cache_version(self):
#return () #return ()
return (6,) + inline_softmax.code_version return (7,) + inline_softmax.code_version
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
x, b = inp x, b = inp
...@@ -463,7 +510,7 @@ class GpuSoftmaxWithBias (GpuOp): ...@@ -463,7 +510,7 @@ class GpuSoftmaxWithBias (GpuOp):
{ {
int n_blocks = std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],32*1024); int n_blocks = std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],32*1024);
//TODO, detect the maximum number of thread per block. //TODO, detect the maximum number of thread per block.
int n_threads = std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], 1024); int n_threads = std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], 512);
int n_shared_bytes = CudaNdarray_HOST_DIMS(%(x)s)[1] * 2 * sizeof(float); int n_shared_bytes = CudaNdarray_HOST_DIMS(%(x)s)[1] * 2 * sizeof(float);
if (CudaNdarray_HOST_DIMS(%(x)s)[0] > 0) if (CudaNdarray_HOST_DIMS(%(x)s)[0] > 0)
{ {
......
...@@ -255,10 +255,15 @@ class NVCC_compiler(object): ...@@ -255,10 +255,15 @@ class NVCC_compiler(object):
# compute capability? '--gpu-architecture=compute_13', # compute capability? '--gpu-architecture=compute_13',
# '--gpu-code=compute_13', # '--gpu-code=compute_13',
#nvcc argument #nvcc argument
preargs1 = [pa for pa in preargs preargs1 = []
if pa.startswith('-O') or for pa in preargs:
pa.startswith('--maxrregcount=') or for pattern in ['-O', '-arch=',
pa.startswith('-arch=')] '--fmad', '--ftz', '--maxrregcount',
'--prec-div', '--prec-sqrt', '--use_fast_math',
'-fmad', '-ftz', '-maxrregcount',
'-prec-div', '-prec-sqrt', '-use_fast_math']:
if pa.startswith(pattern):
preargs1.append(pa)
preargs2 = [pa for pa in preargs preargs2 = [pa for pa in preargs
if pa not in preargs1] # other arguments if pa not in preargs1] # other arguments
......
...@@ -183,7 +183,9 @@ def test_softmax_with_bias(): ...@@ -183,7 +183,9 @@ def test_softmax_with_bias():
def cmp(n, m, catch=False): def cmp(n, m, catch=False):
"""Some old card won't accet the configuration arguments of """Some old card won't accet the configuration arguments of
this implementation.""" this implementation. For those cases set catch=True to skip
those errors.
"""
try: try:
#print "test_softmax",n,m #print "test_softmax",n,m
data = numpy.arange(n * m, dtype='float32').reshape(n, m) data = numpy.arange(n * m, dtype='float32').reshape(n, m)
...@@ -193,18 +195,22 @@ def test_softmax_with_bias(): ...@@ -193,18 +195,22 @@ def test_softmax_with_bias():
except RuntimeError, e: except RuntimeError, e:
if not catch: if not catch:
raise raise
assert (e.args[0] == # Different CUDA driver have different error message
'Cuda error: kSoftmaxWithBias_node_0: invalid configuration argument.\n' assert (e.args[0].startswith(
), e.args[0] 'Cuda error: kSoftmaxWithBias_node_0: invalid configuration argument.\n') or
e.args[0].startswith('Cuda error: kSoftmaxWithBias_node_0: invalid argument.\n'))
cmp(2, 5) cmp(2, 5)
#we need to test n>32*1024 to check that we make the block loop. #we need to test n>32*1024 to check that we make the block loop.
cmp(2 << 15, 5) cmp(2 << 15, 5)
cmp(4074, 400) cmp(4074, 400)
cmp(0, 10) cmp(0, 10)
cmp(4, 1000, True) cmp(784, 784)
cmp(4, 1024, True) cmp(4, 1000)
cmp(4, 2000, True) cmp(4, 1024)
cmp(4, 2024, True) cmp(4, 2000)
cmp(4, 2024)
#GTX285 don't have enough shared mem for this case.
cmp(4, 4074, True) cmp(4, 4074, True)
...@@ -227,8 +233,11 @@ def test_softmax(): ...@@ -227,8 +233,11 @@ def test_softmax():
cuda.nnet.GpuSoftmax) cuda.nnet.GpuSoftmax)
def cmp(n, m, catch=False): def cmp(n, m, catch=False):
"""Some old card won't accet the configuration arguments of """Some old card won't accept the configuration arguments of
this implementation.""" this implementation. For those cases set catch=True to skip
those errors.
"""
try: try:
#print "test_softmax",n,m #print "test_softmax",n,m
data = numpy.arange(n * m, dtype='float32').reshape(n, m) data = numpy.arange(n * m, dtype='float32').reshape(n, m)
...@@ -238,15 +247,20 @@ def test_softmax(): ...@@ -238,15 +247,20 @@ def test_softmax():
except RuntimeError, e: except RuntimeError, e:
if not catch: if not catch:
raise raise
assert (e.args[0] == # Different CUDA driver have different error message
'Cuda error: kSoftmax_node_0: invalid configuration argument.\n') assert (e.args[0].startswith(
'Cuda error: kSoftmax_node_0: invalid configuration argument.\n') or
e.args[0].startswith('Cuda error: kSoftmax_node_0: invalid argument.\n'))
#we need to test n>32*1024 to check that we make the block loop. #we need to test n>32*1024 to check that we make the block loop.
cmp(2, 5) cmp(2, 5)
cmp(2 << 15, 5) cmp(2 << 15, 5)
cmp(4074, 400) cmp(4074, 400)
cmp(4, 1000, True) cmp(0, 10)
cmp(4, 1024, True) cmp(784, 784)
cmp(4, 2000, True) cmp(4, 1000)
cmp(4, 2024, True) cmp(4, 1024)
cmp(4, 2000)
cmp(4, 2024)
#GTX285 don't have enough shared mem for this case.
cmp(4, 4074, True) cmp(4, 4074, True)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论