提交 b7e7be45 authored 作者: lamblin's avatar lamblin

Merge pull request #1170 from nouiz/denormal

Denormal
......@@ -284,6 +284,14 @@ Tips for Improving Performance on GPU
Check the line similar to *Spent Xs(X%) in cpu op, Xs(X%) in gpu op and Xs(X%) in transfer op*.
This can tell you if not enough of your graph is on the GPU or if there
is too much memory transfer.
* Use nvcc options. nvcc supports the following options to speed up some
computations: `-ftz=true` to `flush denormal values to
zero <https://developer.nvidia.com/content/cuda-pro-tip-flush-denormals-confidence>`_,
and `--prec-div=false` and `--prec-sqrt=false` to speed up
division and square root operations by being less precise. You can
enable all of them at once with the `nvcc.flags=--use_fast_math` Theano
flag, or you can enable them individually, as in this example:
`nvcc.flags=-ftz=true --prec-div=false`.
.. _gpu_async:
......
......@@ -255,10 +255,15 @@ class NVCC_compiler(object):
# compute capability? '--gpu-architecture=compute_13',
# '--gpu-code=compute_13',
#nvcc argument
preargs1 = [pa for pa in preargs
if pa.startswith('-O') or
pa.startswith('--maxrregcount=') or
pa.startswith('-arch=')]
preargs1 = []
for pa in preargs:
for pattern in ['-O', '-arch=',
'--fmad', '--ftz', '--maxrregcount',
'--prec-div', '--prec-sqrt', '--use_fast_math',
'-fmad', '-ftz', '-maxrregcount',
'-prec-div', '-prec-sqrt', '-use_fast_math']:
if pa.startswith(pattern):
preargs1.append(pa)
preargs2 = [pa for pa in preargs
if pa not in preargs1] # other arguments
......
......@@ -183,7 +183,9 @@ def test_softmax_with_bias():
def cmp(n, m, catch=False):
"""Some old card won't accet the configuration arguments of
this implementation."""
this implementation. For those cases set catch=True to skip
those errors.
"""
try:
#print "test_softmax",n,m
data = numpy.arange(n * m, dtype='float32').reshape(n, m)
......@@ -193,18 +195,22 @@ def test_softmax_with_bias():
except RuntimeError, e:
if not catch:
raise
assert (e.args[0] ==
'Cuda error: kSoftmaxWithBias_node_0: invalid configuration argument.\n'
), e.args[0]
# Different CUDA driver have different error message
assert (e.args[0].startswith(
'Cuda error: kSoftmaxWithBias_node_0: invalid configuration argument.\n') or
e.args[0].startswith('Cuda error: kSoftmaxWithBias_node_0: invalid argument.\n'))
cmp(2, 5)
#we need to test n>32*1024 to check that we make the block loop.
cmp(2 << 15, 5)
cmp(4074, 400)
cmp(0, 10)
cmp(4, 1000, True)
cmp(4, 1024, True)
cmp(4, 2000, True)
cmp(4, 2024, True)
cmp(784, 784)
cmp(4, 1000)
cmp(4, 1024)
cmp(4, 2000)
cmp(4, 2024)
#GTX285 don't have enough shared mem for this case.
cmp(4, 4074, True)
......@@ -227,8 +233,11 @@ def test_softmax():
cuda.nnet.GpuSoftmax)
def cmp(n, m, catch=False):
"""Some old card won't accet the configuration arguments of
this implementation."""
"""Some old card won't accept the configuration arguments of
this implementation. For those cases set catch=True to skip
those errors.
"""
try:
#print "test_softmax",n,m
data = numpy.arange(n * m, dtype='float32').reshape(n, m)
......@@ -238,15 +247,20 @@ def test_softmax():
except RuntimeError, e:
if not catch:
raise
assert (e.args[0] ==
'Cuda error: kSoftmax_node_0: invalid configuration argument.\n')
# Different CUDA driver have different error message
assert (e.args[0].startswith(
'Cuda error: kSoftmax_node_0: invalid configuration argument.\n') or
e.args[0].startswith('Cuda error: kSoftmax_node_0: invalid argument.\n'))
#we need to test n>32*1024 to check that we make the block loop.
cmp(2, 5)
cmp(2 << 15, 5)
cmp(4074, 400)
cmp(4, 1000, True)
cmp(4, 1024, True)
cmp(4, 2000, True)
cmp(4, 2024, True)
cmp(0, 10)
cmp(784, 784)
cmp(4, 1000)
cmp(4, 1024)
cmp(4, 2000)
cmp(4, 2024)
#GTX285 don't have enough shared mem for this case.
cmp(4, 4074, True)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论