提交 9afcee26 authored 作者: Frederic Bastien's avatar Frederic Bastien

Created a fusion optimizer for the GPU.

上级 f1a3bae9
...@@ -45,6 +45,11 @@ AddConfigVar('tensor.local_elemwise_fusion', ...@@ -45,6 +45,11 @@ AddConfigVar('tensor.local_elemwise_fusion',
"Enable or not in fast_run mode(fast_run optimization) the elemwise fusion optimization", "Enable or not in fast_run mode(fast_run optimization) the elemwise fusion optimization",
BoolParam(True)) BoolParam(True))
# Config flag: enable/disable the GPU elemwise fusion optimization when
# running in fast_run mode. Mirrors the CPU-side
# 'tensor.local_elemwise_fusion' flag declared just above.
AddConfigVar('gpu.local_elemwise_fusion',
        "Enable or not in fast_run mode(fast_run optimization) the gpu elemwise fusion optimization",
        BoolParam(True))
#http://developer.amd.com/CPU/LIBRARIES/LIBM/Pages/default.aspx
AddConfigVar('lib.amdlibm', AddConfigVar('lib.amdlibm',
"Use amd's amdlibm numerical library", "Use amd's amdlibm numerical library",
BoolParam(False)) BoolParam(False))
......
import logging
_logger = logging.getLogger('theano.sandbox.cuda.opt')
import sys import sys
import theano import theano
import numpy import numpy
...@@ -569,3 +572,21 @@ def local_gpu_join(node): ...@@ -569,3 +572,21 @@ def local_gpu_join(node):
# After destroyhandler is in but before we try to make elemwise things inplace
# Try to make gpu gemm inplace
# Also, need to make the gemm optimisation(step 70) happen before the fusion of elemwise(step 71)
#optdb.register('InplaceGpuBlasOpt',
#       EquilibriumOptimizer([local_inplace_gemm], failure_callback=EquilibriumOptimizer.warn_inplace,
#                            max_use_ratio=5),
#               70.0, 'fast_run', 'inplace')

# GpuElemwise fusion: reuse the generic elemwise-fusion optimizer builder
# from tensor.opt, specialized for the GpuElemwise op.
gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op(GpuElemwise)

# Register the optimizer exactly once. It always carries the 'fusion' and
# 'local_elemwise_fusion' tags (so it can be requested explicitly); the
# 'fast_run' tag is added only when the config flag enables it, so that
# fast_run mode picks it up automatically. This replaces two near-identical
# register() calls that differed only in the tag list.
if config.gpu.local_elemwise_fusion:
    _logger.debug("enabling optimization fusion of gpu elemwise in fast_run")
    _fusion_tags = ('fast_run', 'fusion', 'local_elemwise_fusion')
else:
    _logger.debug("not enabling optimization fusion of gpu elemwise in fast_run")
    _fusion_tags = ('fusion', 'local_elemwise_fusion')
compile.optdb.register('gpu_elemwise_fusion',
                       tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion),
                       71.00, *_fusion_tags)
...@@ -7,8 +7,8 @@ import numpy ...@@ -7,8 +7,8 @@ import numpy
# Skip test if cuda_ndarray is not available. # Skip test if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
import theano.sandbox.cuda as cuda_ndarray import theano.sandbox.cuda as cuda
if cuda_ndarray.cuda_available == False: if cuda.cuda_available == False:
raise SkipTest('Optional package cuda disabled') raise SkipTest('Optional package cuda disabled')
import theano.compile.mode import theano.compile.mode
...@@ -76,8 +76,8 @@ def test_opt_gpujoin_onlyajoin(): ...@@ -76,8 +76,8 @@ def test_opt_gpujoin_onlyajoin():
# from a bug in normal sampling # from a bug in normal sampling
_a = numpy.asarray([[1,2],[3,4]],dtype='float32') _a = numpy.asarray([[1,2],[3,4]],dtype='float32')
_b = numpy.asarray([[5,6,7],[8,9,10]],dtype='float32') _b = numpy.asarray([[5,6,7],[8,9,10]],dtype='float32')
a = cuda_ndarray.shared_constructor(_a) a = cuda.shared_constructor(_a)
b = cuda_ndarray.shared_constructor(_b) b = cuda.shared_constructor(_b)
c = tensor.join(1,a,b) c = tensor.join(1,a,b)
...@@ -100,8 +100,8 @@ def test_opt_gpujoin_joinvectors_elemwise_then_minusone(): ...@@ -100,8 +100,8 @@ def test_opt_gpujoin_joinvectors_elemwise_then_minusone():
# from a bug in gpu normal sampling # from a bug in gpu normal sampling
_a = numpy.asarray([1,2,3,4],dtype='float32') _a = numpy.asarray([1,2,3,4],dtype='float32')
_b = numpy.asarray([5,6,7,8],dtype='float32') _b = numpy.asarray([5,6,7,8],dtype='float32')
a = cuda_ndarray.shared_constructor(_a) a = cuda.shared_constructor(_a)
b = cuda_ndarray.shared_constructor(_b) b = cuda.shared_constructor(_b)
a_prime = tensor.cos(a) a_prime = tensor.cos(a)
b_prime = tensor.sin(b) b_prime = tensor.sin(b)
...@@ -125,6 +125,20 @@ def test_opt_gpujoin_joinvectors_elemwise_then_minusone(): ...@@ -125,6 +125,20 @@ def test_opt_gpujoin_joinvectors_elemwise_then_minusone():
assert numpy.allclose(numpy.asarray(f()), concat) assert numpy.allclose(numpy.asarray(f()), concat)
def test_elemwise_fusion():
    """Test that the GpuElemwise fusion optimization works correctly.

    Builds the graph a+b+c (one shared GPU value plus two fmatrix inputs)
    and checks that, after optimization, the two additions have been fused
    into a single elemwise node whose scalar op is a Composite.
    """
    shape = (3,4)
    a = cuda.shared_constructor(theano._asarray(numpy.random.rand(*shape), dtype='float32'), 'a')
    b = tensor.fmatrix()
    c = tensor.fmatrix()
    f = pfunc([b,c], [a+b+c], mode=mode_with_gpu)
    topo = f.maker.env.toposort()
    # Dump the optimized graph so a failing assert below is easy to debug.
    for i, node in enumerate(topo):
        print >> sys.stdout, i, node
    # NOTE(review): 4 nodes presumably corresponds to host->gpu transfers,
    # the single fused elemwise, and the transfer back -- confirm against
    # the printed graph.
    assert len(topo)==4
    # The fusion must have merged both additions into one Composite scalar op.
    assert isinstance(topo[2].op.scalar_op,theano.scalar.basic.Composite)
    #let debugmode catch errors
    f(theano._asarray(numpy.random.rand(*shape), dtype='float32'), theano._asarray(numpy.random.rand(*shape), dtype='float32'))
if __name__ == '__main__': if __name__ == '__main__':
......
差异被折叠。
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论