提交 b7e7be45 authored 作者: lamblin's avatar lamblin

Merge pull request #1170 from nouiz/denormal

Denormal
...@@ -284,6 +284,14 @@ Tips for Improving Performance on GPU ...@@ -284,6 +284,14 @@ Tips for Improving Performance on GPU
Check the line similar to *Spent Xs(X%) in cpu op, Xs(X%) in gpu op and Xs(X%) in transfer op*. Check the line similar to *Spent Xs(X%) in cpu op, Xs(X%) in gpu op and Xs(X%) in transfer op*.
This can tell you if not enough of your graph is on the GPU or if there This can tell you if not enough of your graph is on the GPU or if there
is too much memory transfer. is too much memory transfer.
* Use nvcc options. nvcc supports the following options to speed up some
computations: `-ftz=true` to `flush denormal values to
zero <https://developer.nvidia.com/content/cuda-pro-tip-flush-denormals-confidence>`_,
and `--prec-div=false` and `--prec-sqrt=false` to speed up
division and square root operations by being less precise. You can
enable all of them with the `nvcc.flags=--use_fast_math` Theano
flag, or you can enable them individually, as in this example:
`nvcc.flags=-ftz=true --prec-div=false`.
.. _gpu_async: .. _gpu_async:
......
...@@ -255,10 +255,15 @@ class NVCC_compiler(object): ...@@ -255,10 +255,15 @@ class NVCC_compiler(object):
# compute capability? '--gpu-architecture=compute_13', # compute capability? '--gpu-architecture=compute_13',
# '--gpu-code=compute_13', # '--gpu-code=compute_13',
#nvcc argument #nvcc argument
preargs1 = [pa for pa in preargs preargs1 = []
if pa.startswith('-O') or for pa in preargs:
pa.startswith('--maxrregcount=') or for pattern in ['-O', '-arch=',
pa.startswith('-arch=')] '--fmad', '--ftz', '--maxrregcount',
'--prec-div', '--prec-sqrt', '--use_fast_math',
'-fmad', '-ftz', '-maxrregcount',
'-prec-div', '-prec-sqrt', '-use_fast_math']:
if pa.startswith(pattern):
preargs1.append(pa)
preargs2 = [pa for pa in preargs preargs2 = [pa for pa in preargs
if pa not in preargs1] # other arguments if pa not in preargs1] # other arguments
......
...@@ -183,7 +183,9 @@ def test_softmax_with_bias(): ...@@ -183,7 +183,9 @@ def test_softmax_with_bias():
def cmp(n, m, catch=False): def cmp(n, m, catch=False):
"""Some old card won't accet the configuration arguments of """Some old card won't accet the configuration arguments of
this implementation.""" this implementation. For those cases set catch=True to skip
those errors.
"""
try: try:
#print "test_softmax",n,m #print "test_softmax",n,m
data = numpy.arange(n * m, dtype='float32').reshape(n, m) data = numpy.arange(n * m, dtype='float32').reshape(n, m)
...@@ -193,18 +195,22 @@ def test_softmax_with_bias(): ...@@ -193,18 +195,22 @@ def test_softmax_with_bias():
except RuntimeError, e: except RuntimeError, e:
if not catch: if not catch:
raise raise
assert (e.args[0] == # Different CUDA driver have different error message
'Cuda error: kSoftmaxWithBias_node_0: invalid configuration argument.\n' assert (e.args[0].startswith(
), e.args[0] 'Cuda error: kSoftmaxWithBias_node_0: invalid configuration argument.\n') or
e.args[0].startswith('Cuda error: kSoftmaxWithBias_node_0: invalid argument.\n'))
cmp(2, 5) cmp(2, 5)
#we need to test n>32*1024 to check that we make the block loop. #we need to test n>32*1024 to check that we make the block loop.
cmp(2 << 15, 5) cmp(2 << 15, 5)
cmp(4074, 400) cmp(4074, 400)
cmp(0, 10) cmp(0, 10)
cmp(4, 1000, True) cmp(784, 784)
cmp(4, 1024, True) cmp(4, 1000)
cmp(4, 2000, True) cmp(4, 1024)
cmp(4, 2024, True) cmp(4, 2000)
cmp(4, 2024)
#GTX285 doesn't have enough shared memory for this case.
cmp(4, 4074, True) cmp(4, 4074, True)
...@@ -227,8 +233,11 @@ def test_softmax(): ...@@ -227,8 +233,11 @@ def test_softmax():
cuda.nnet.GpuSoftmax) cuda.nnet.GpuSoftmax)
def cmp(n, m, catch=False): def cmp(n, m, catch=False):
"""Some old card won't accet the configuration arguments of """Some old card won't accept the configuration arguments of
this implementation.""" this implementation. For those cases set catch=True to skip
those errors.
"""
try: try:
#print "test_softmax",n,m #print "test_softmax",n,m
data = numpy.arange(n * m, dtype='float32').reshape(n, m) data = numpy.arange(n * m, dtype='float32').reshape(n, m)
...@@ -238,15 +247,20 @@ def test_softmax(): ...@@ -238,15 +247,20 @@ def test_softmax():
except RuntimeError, e: except RuntimeError, e:
if not catch: if not catch:
raise raise
assert (e.args[0] == # Different CUDA driver have different error message
'Cuda error: kSoftmax_node_0: invalid configuration argument.\n') assert (e.args[0].startswith(
'Cuda error: kSoftmax_node_0: invalid configuration argument.\n') or
e.args[0].startswith('Cuda error: kSoftmax_node_0: invalid argument.\n'))
#we need to test n>32*1024 to check that we make the block loop. #we need to test n>32*1024 to check that we make the block loop.
cmp(2, 5) cmp(2, 5)
cmp(2 << 15, 5) cmp(2 << 15, 5)
cmp(4074, 400) cmp(4074, 400)
cmp(4, 1000, True) cmp(0, 10)
cmp(4, 1024, True) cmp(784, 784)
cmp(4, 2000, True) cmp(4, 1000)
cmp(4, 2024, True) cmp(4, 1024)
cmp(4, 2000)
cmp(4, 2024)
#GTX285 doesn't have enough shared memory for this case.
cmp(4, 4074, True) cmp(4, 4074, True)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论