Commit 9afcee26 authored by Frederic Bastien

Created a fusion optimizer for the GPU.

Parent f1a3bae9
@@ -45,6 +45,11 @@ AddConfigVar('tensor.local_elemwise_fusion',
         "Enable or not in fast_run mode(fast_run optimization) the elemwise fusion optimization",
         BoolParam(True))
+AddConfigVar('gpu.local_elemwise_fusion',
+        "Enable or not in fast_run mode(fast_run optimization) the gpu elemwise fusion optimization",
+        BoolParam(True))
+
+#http://developer.amd.com/CPU/LIBRARIES/LIBM/Pages/default.aspx
 AddConfigVar('lib.amdlibm',
         "Use amd's amdlibm numerical library",
         BoolParam(False))
...
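The new flag mirrors the existing tensor.local_elemwise_fusion option. Like any Theano config variable it can be overridden through the THEANO_FLAGS environment variable; a minimal usage sketch (my_script.py is an illustrative name):

    # Disable the GPU fusion pass from the environment, before Python starts:
    #   THEANO_FLAGS='gpu.local_elemwise_fusion=False' python my_script.py
    import theano
    print theano.config.gpu.local_elemwise_fusion   # False with the flag above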
@@ -1,3 +1,6 @@
+import logging
+_logger = logging.getLogger('theano.sandbox.cuda.opt')
+
 import sys
 import theano
 import numpy
@@ -569,3 +572,21 @@ def local_gpu_join(node):
+
+# After destroyhandler is in but before we try to make elemwise things inplace
+# Try to make gpu gemm inplace
+# Also, need to make the gemm optimisation(step 70) happen before the fusion of elemwise(step 71)
+#optdb.register('InplaceGpuBlasOpt',
+#               EquilibriumOptimizer([local_inplace_gemm], failure_callback=EquilibriumOptimizer.warn_inplace,
+#                                    max_use_ratio=5),
+#               70.0, 'fast_run', 'inplace')
+
+#GpuElemwise fusion
+gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op(GpuElemwise)
+if config.gpu.local_elemwise_fusion:
+    _logger.debug("enabling optimization fusion of gpu elemwise in fast_run")
+    compile.optdb.register('gpu_elemwise_fusion', tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion), 71.00, 'fast_run', 'fusion', 'local_elemwise_fusion')
+else:
+    _logger.debug("not enabling optimization fusion of gpu elemwise in fast_run")
+    compile.optdb.register('gpu_elemwise_fusion', tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion), 71.00, 'fusion', 'local_elemwise_fusion')
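Note that when the flag is off, the optimizer is still registered, just without the 'fast_run' tag, so it never runs by default but stays reachable by tag. A sketch of opting back in explicitly, assuming the standard Mode.including mechanism:

    import theano
    from theano import tensor

    # Pull the fusion pass back in by tag, even when
    # gpu.local_elemwise_fusion=False keeps it out of fast_run.
    mode = theano.compile.mode.get_default_mode().including('local_elemwise_fusion')
    x = tensor.fmatrix('x')
    f = theano.function([x], tensor.exp(x) + tensor.sin(x), mode=mode)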
@@ -7,8 +7,8 @@ import numpy
 # Skip test if cuda_ndarray is not available.
 from nose.plugins.skip import SkipTest
-import theano.sandbox.cuda as cuda_ndarray
-if cuda_ndarray.cuda_available == False:
+import theano.sandbox.cuda as cuda
+if cuda.cuda_available == False:
     raise SkipTest('Optional package cuda disabled')
 import theano.compile.mode
@@ -76,8 +76,8 @@ def test_opt_gpujoin_onlyajoin():
     # from a bug in normal sampling
     _a = numpy.asarray([[1,2],[3,4]],dtype='float32')
     _b = numpy.asarray([[5,6,7],[8,9,10]],dtype='float32')
-    a = cuda_ndarray.shared_constructor(_a)
-    b = cuda_ndarray.shared_constructor(_b)
+    a = cuda.shared_constructor(_a)
+    b = cuda.shared_constructor(_b)
     c = tensor.join(1,a,b)
@@ -100,8 +100,8 @@ def test_opt_gpujoin_joinvectors_elemwise_then_minusone():
     # from a bug in gpu normal sampling
     _a = numpy.asarray([1,2,3,4],dtype='float32')
     _b = numpy.asarray([5,6,7,8],dtype='float32')
-    a = cuda_ndarray.shared_constructor(_a)
-    b = cuda_ndarray.shared_constructor(_b)
+    a = cuda.shared_constructor(_a)
+    b = cuda.shared_constructor(_b)
     a_prime = tensor.cos(a)
     b_prime = tensor.sin(b)
@@ -125,6 +125,20 @@ def test_opt_gpujoin_joinvectors_elemwise_then_minusone():
     assert numpy.allclose(numpy.asarray(f()), concat)
+def test_elemwise_fusion():
+    """ Test that the GpuElemwise fusion works correctly. """
+    shape = (3,4)
+    a = cuda.shared_constructor(theano._asarray(numpy.random.rand(*shape), dtype='float32'), 'a')
+    b = tensor.fmatrix()
+    c = tensor.fmatrix()
+    f = pfunc([b,c], [a+b+c], mode=mode_with_gpu)
+    topo = f.maker.env.toposort()
+    for i, node in enumerate(topo):
+        print >> sys.stdout, i, node
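+    # Expect 4 nodes: GpuFromHost(b), GpuFromHost(c), the fused GpuElemwise
+    # computing a+b+c (its scalar_op is a Composite), and HostFromGpu.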
+    assert len(topo)==4
+    assert isinstance(topo[2].op.scalar_op,theano.scalar.basic.Composite)
+    #let debugmode catch errors
+    f(theano._asarray(numpy.random.rand(*shape), dtype='float32'), theano._asarray(numpy.random.rand(*shape), dtype='float32'))
 if __name__ == '__main__':
...
@@ -2096,8 +2096,11 @@ register_canonicalize(local_transposed_dot, name='local_transposed_dot')
 # ###############
 # # Loop fusion #
 # ###############
-def local_elemwise_fusion(node):
+def local_elemwise_fusion_op(OP):
+    """
+    We parametrise it to make it work for both Elemwise and GpuElemwise ops.
+    """
+    def local_fuse(node):
         """
         As part of specialisation, we fuse two consecutive elemwise ops of the same shape.
@@ -2123,7 +2126,7 @@ def local_elemwise_fusion(node):
     # might not be easy or worthwhile if the summation axis doesn't line up with a contiguous
     # dimension)
-    if not isinstance(node.op, T.Elemwise):
+    if not isinstance(node.op, OP):
         return False
     nb_elemwise=0
     inputs=[]#inputs of the new Elemwise op.
@@ -2132,7 +2135,7 @@ def local_elemwise_fusion(node):
     for i in node.inputs:
         do_fusion = False
         catch = False
-        if i.owner and isinstance(i.owner.op,T.Elemwise) and len(i.clients)<=1:
+        if i.owner and isinstance(i.owner.op, OP) and len(i.clients)<=1:
             #if the scalar_op doesn't have a C implementation, we skip its fusion to allow the fusion of the other ops.
             do_fusion=True
             try:
@@ -2162,7 +2165,7 @@ def local_elemwise_fusion(node):
     #if no input is an elemwise, there is nothing to fuse.
     if nb_elemwise==0:
         # print "local_elemwise_fusion: no elemwise in inputs. Nothing to fuse."
         return False
     otype = node.outputs[0].type
@@ -2182,7 +2185,7 @@ def local_elemwise_fusion(node):
     C = scalar.Composite(s_inputs,[s_new_out])
     #create the new node.
-    n=T.Elemwise(C).make_node(*inputs)
+    n=OP(C).make_node(*inputs)
     assert len(n.outputs)==1
     assert node.outputs[0].dtype==n.outputs[0].dtype
@@ -2201,13 +2204,17 @@ def local_elemwise_fusion(node):
         _logger.info('loop fusion failed because Op would exceed kernel argument limit.')
         return False
     # print "local_elemwise_fusion: FUSED",nb_elemwise+1,"elemwise!"
     return n.outputs
+    return local_fuse
+
+local_elemwise_fusion = local_elemwise_fusion_op(T.Elemwise)
+
 class FusionOptimizer(Optimizer):
     """Graph optimizer for Fusion of elemwise operations"""
-    def __init__(self):
+    def __init__(self, local_optimizer):
         Optimizer.__init__(self)
+        self.optimizer = local_optimizer

     def add_requirements(self, env):
         env.extend(toolbox.ReplaceValidate())
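This factory-plus-injection pattern is the heart of the refactor: local_elemwise_fusion_op closes over the Op class and returns a node-level rewrite, which FusionOptimizer now receives instead of hard-coding it. A stripped-down sketch of the same pattern, with toy stand-in classes rather than Theano's real ops:

    class Elemwise(object): pass        # stand-in for theano.tensor.Elemwise
    class GpuElemwise(object): pass     # stand-in for the CUDA variant

    class Node(object):
        def __init__(self, op): self.op = op

    def local_fusion_op(OP):
        def local_fuse(node):
            # Fire only on the Op class this closure was built for.
            if not isinstance(node.op, OP):
                return False
            return True                 # the real code builds a Composite here
        return local_fuse

    cpu_fuse = local_fusion_op(Elemwise)
    gpu_fuse = local_fusion_op(GpuElemwise)
    print cpu_fuse(Node(Elemwise()))    # True
    print gpu_fuse(Node(Elemwise()))    # False: wrong Op class for this pass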
@@ -2219,7 +2226,7 @@ class FusionOptimizer(Optimizer):
         nodelist = list(env.toposort())
         did_something = False
         for node in nodelist:
-            new_outputs = local_elemwise_fusion(node)
+            new_outputs = self.optimizer(node)
             if new_outputs:
                 assert len(new_outputs) == len(node.outputs)
                 try:
@@ -2235,9 +2242,9 @@ class FusionOptimizer(Optimizer):
 if config.tensor.local_elemwise_fusion:
     _logger.debug("enabling optimization fusion elemwise in fast_run")
-    compile.optdb.register('elemwise_fusion', FusionOptimizer(), 71.00, 'fast_run', 'fusion', 'local_elemwise_fusion')
+    compile.optdb.register('elemwise_fusion', FusionOptimizer(local_elemwise_fusion), 71.00, 'fast_run', 'fusion', 'local_elemwise_fusion')
 else:
     _logger.debug("not enabling optimization fusion elemwise in fast_run")
-    compile.optdb.register('elemwise_fusion', FusionOptimizer(), 71.00, 'fusion', 'local_elemwise_fusion')
+    compile.optdb.register('elemwise_fusion', FusionOptimizer(local_elemwise_fusion), 71.00, 'fusion', 'local_elemwise_fusion')
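With the local pass injected through the constructor, one driver class serves both back-ends; this CPU registration and the GPU one earlier differ only in the function they pass in. A condensed sketch of the resulting control flow, simplified from the lines visible in this diff (not the file's exact code; error handling is trimmed, and replace_validate comes from the ReplaceValidate feature that add_requirements installs):

    from theano.gof import toolbox
    from theano.gof.opt import Optimizer

    class FusionOptimizer(Optimizer):
        """Drives one node-level fusion pass over the whole graph."""
        def __init__(self, local_optimizer):
            Optimizer.__init__(self)
            self.optimizer = local_optimizer      # CPU or GPU node rewrite

        def add_requirements(self, env):
            env.extend(toolbox.ReplaceValidate()) # enables env.replace_validate

        def apply(self, env):
            for node in list(env.toposort()):
                new_outputs = self.optimizer(node)  # new outputs, or False
                if new_outputs:
                    for old, new in zip(node.outputs, new_outputs):
                        env.replace_validate(old, new)

The two registrations then read FusionOptimizer(local_elemwise_fusion) on the CPU side and FusionOptimizer(gpu_local_elemwise_fusion) on the GPU side.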