提交 9afcee26 authored 作者: Frederic Bastien's avatar Frederic Bastien

Created a fusion optimizer for the GPU.

上级 f1a3bae9
...@@ -45,6 +45,11 @@ AddConfigVar('tensor.local_elemwise_fusion', ...@@ -45,6 +45,11 @@ AddConfigVar('tensor.local_elemwise_fusion',
"Enable or not in fast_run mode(fast_run optimization) the elemwise fusion optimization", "Enable or not in fast_run mode(fast_run optimization) the elemwise fusion optimization",
BoolParam(True)) BoolParam(True))
# Config flag: enables the GPU elemwise fusion optimization under fast_run
# (mirrors tensor.local_elemwise_fusion, which controls the CPU pass).
AddConfigVar('gpu.local_elemwise_fusion',
        "Enable or not in fast_run mode(fast_run optimization) the gpu elemwise fusion optimization",
        BoolParam(True))
#http://developer.amd.com/CPU/LIBRARIES/LIBM/Pages/default.aspx
AddConfigVar('lib.amdlibm', AddConfigVar('lib.amdlibm',
"Use amd's amdlibm numerical library", "Use amd's amdlibm numerical library",
BoolParam(False)) BoolParam(False))
......
import logging
_logger = logging.getLogger('theano.sandbox.cuda.opt')
import sys import sys
import theano import theano
import numpy import numpy
...@@ -569,3 +572,21 @@ def local_gpu_join(node): ...@@ -569,3 +572,21 @@ def local_gpu_join(node):
# After destroyhandler is in but before we try to make elemwise things inplace
# Try to make gpu gemm inplace
# Also, need to make the gemm optimisation(step 70) happen before the fusion of elemwise(step 71)
#optdb.register('InplaceGpuBlasOpt',
#               EquilibriumOptimizer([local_inplace_gemm], failure_callback=EquilibriumOptimizer.warn_inplace,
#                                    max_use_ratio=5),
#               70.0, 'fast_run', 'inplace')

# GpuElemwise fusion
# Reuse the generic fusion machinery from tensor.opt, parametrized with
# GpuElemwise so chains of GPU elemwise ops are merged into one Composite.
gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op(GpuElemwise)
if config.gpu.local_elemwise_fusion:
    _logger.debug("enabling optimization fusion of gpu elemwise in fast_run")
    compile.optdb.register('gpu_elemwise_fusion', tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion), 71.00, 'fast_run', 'fusion', 'local_elemwise_fusion')
else:
    # Still registered, but without the 'fast_run' tag: the pass only runs
    # when the 'fusion' or 'local_elemwise_fusion' tag is requested explicitly.
    _logger.debug("not enabling optimization fusion of gpu elemwise in fast_run")
    compile.optdb.register('gpu_elemwise_fusion', tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion), 71.00, 'fusion', 'local_elemwise_fusion')
...@@ -7,8 +7,8 @@ import numpy ...@@ -7,8 +7,8 @@ import numpy
# Skip test if cuda_ndarray is not available. # Skip test if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
import theano.sandbox.cuda as cuda_ndarray import theano.sandbox.cuda as cuda
if cuda_ndarray.cuda_available == False: if cuda.cuda_available == False:
raise SkipTest('Optional package cuda disabled') raise SkipTest('Optional package cuda disabled')
import theano.compile.mode import theano.compile.mode
...@@ -76,8 +76,8 @@ def test_opt_gpujoin_onlyajoin(): ...@@ -76,8 +76,8 @@ def test_opt_gpujoin_onlyajoin():
# from a bug in normal sampling # from a bug in normal sampling
_a = numpy.asarray([[1,2],[3,4]],dtype='float32') _a = numpy.asarray([[1,2],[3,4]],dtype='float32')
_b = numpy.asarray([[5,6,7],[8,9,10]],dtype='float32') _b = numpy.asarray([[5,6,7],[8,9,10]],dtype='float32')
a = cuda_ndarray.shared_constructor(_a) a = cuda.shared_constructor(_a)
b = cuda_ndarray.shared_constructor(_b) b = cuda.shared_constructor(_b)
c = tensor.join(1,a,b) c = tensor.join(1,a,b)
...@@ -100,8 +100,8 @@ def test_opt_gpujoin_joinvectors_elemwise_then_minusone(): ...@@ -100,8 +100,8 @@ def test_opt_gpujoin_joinvectors_elemwise_then_minusone():
# from a bug in gpu normal sampling # from a bug in gpu normal sampling
_a = numpy.asarray([1,2,3,4],dtype='float32') _a = numpy.asarray([1,2,3,4],dtype='float32')
_b = numpy.asarray([5,6,7,8],dtype='float32') _b = numpy.asarray([5,6,7,8],dtype='float32')
a = cuda_ndarray.shared_constructor(_a) a = cuda.shared_constructor(_a)
b = cuda_ndarray.shared_constructor(_b) b = cuda.shared_constructor(_b)
a_prime = tensor.cos(a) a_prime = tensor.cos(a)
b_prime = tensor.sin(b) b_prime = tensor.sin(b)
...@@ -125,6 +125,20 @@ def test_opt_gpujoin_joinvectors_elemwise_then_minusone(): ...@@ -125,6 +125,20 @@ def test_opt_gpujoin_joinvectors_elemwise_then_minusone():
assert numpy.allclose(numpy.asarray(f()), concat) assert numpy.allclose(numpy.asarray(f()), concat)
def test_elemwise_fusion():
    """Test that the GpuElemwise fusion works correctly.

    Builds a+b+c with a GPU shared variable and checks that the elemwise
    chain is fused into a single Composite scalar op in the compiled graph.
    """
    shape = (3,4)
    # Shared variable on the GPU forces the computation onto the GPU.
    a = cuda.shared_constructor(theano._asarray(numpy.random.rand(*shape), dtype='float32'), 'a')
    b = tensor.fmatrix()
    c = tensor.fmatrix()
    f = pfunc([b,c], [a+b+c], mode=mode_with_gpu)
    topo = f.maker.env.toposort()
    for i, node in enumerate(topo):
        print >> sys.stdout, i, node
    # NOTE(review): 4 nodes presumably = host->gpu transfers, one fused
    # GpuElemwise, gpu->host transfer -- confirm against the printed topo.
    assert len(topo)==4
    # The fused node must carry a Composite scalar op.
    assert isinstance(topo[2].op.scalar_op,theano.scalar.basic.Composite)
    #let debugmode catch errors
    f(theano._asarray(numpy.random.rand(*shape), dtype='float32'), theano._asarray(numpy.random.rand(*shape), dtype='float32'))
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -2096,118 +2096,125 @@ register_canonicalize(local_transposed_dot, name='local_transposed_dot') ...@@ -2096,118 +2096,125 @@ register_canonicalize(local_transposed_dot, name='local_transposed_dot')
# ############### # ###############
# # Loop fusion # # # Loop fusion #
# ############### # ###############
def local_elemwise_fusion_op(OP):
    """Build a local optimization that fuses chains of elemwise ops.

    Parametrized over the elemwise Op class so the same logic serves both
    the CPU (`T.Elemwise`) and the GPU (`GpuElemwise`) variants.

    :param OP: the elemwise Op class whose nodes should be fused.
    :returns: a local optimization function (node -> new outputs or False)
              suitable for use with FusionOptimizer.
    """
    def local_fuse(node):
        """
        As part of specialization, we fuse two consecutive elemwise ops of
        the same shape.

        For mixed dtypes, we let the Composite op do the cast: it lets the
        C compiler do the cast.

        The number of dimensions is validated at call time by theano itself.
        """
        # META TODO: PUT THESE THINGS IN TRAC, NOT TODO NOTES!!
        # TODO: use broadcast flag?

        # TODO: don't do this optimization as a localOptimizer. Analyze the graph in terms of
        #       elemwise subgraphs, and then replace each subgraph with a Composite version.

        # TODO: use malloc and copy to transfer arguments that don't fit within the parameter space
        #       of 256 bytes
        #
        # TODO: Merge with multiple output to merge when an inputs have multiple clients.
        #       This can't be done with a local optimiser.
        # TODO: Related: Support composites with multiple outputs

        # TODO: Use Composite to combine Elemwise and Reduce operations. We have to loop over the
        #       data anyway... might as well sum it up while we're at it (this can be trickier than
        #       i'm making it sound here. The data-traversal should be done contiguously, and the
        #       summing-up might not be easy or worthwhile if the summation axis doesn't line up
        #       with a contiguous dimension)

        if not isinstance(node.op, OP):
            return False
        nb_elemwise = 0
        inputs = []    # inputs of the new Elemwise op.
        s_inputs = []  # inputs of the new scalar op.
        s_g = []       # graph of scalar, what will be done in the inner loop.
        for i in node.inputs:
            do_fusion = False
            catch = False
            # Only fuse an input that is itself an elemwise of the same Op
            # class and has no other client than this node.
            if i.owner and isinstance(i.owner.op, OP) and len(i.clients) <= 1:
                # If the scalar_op doesn't have a c implementation, we skip
                # its fusion to allow the fusion of the other ops.
                do_fusion = True
                try:
                    s_input = [scalar.Scalar(x.dtype).make_variable() for x in i.owner.inputs]
                    s_op = i.owner.op.scalar_op(*s_input)
                    i.owner.op.scalar_op.c_code(s_op.owner, "test_presence_of_c_code",
                                                ["x" for x in i.owner.inputs],
                                                "z", {})
                except MethodNotDefined:
                    catch = True
                except NotImplementedError:
                    catch = True
                if catch:
                    _logger.info("%s does not implement the c_code function. As well as being potentially slow, this disables loop fusion of this op." % str(i.owner.op.scalar_op))
                    do_fusion = False
            if do_fusion:
                nb_elemwise += 1
                inputs.extend(i.owner.inputs)
                s_inputs.extend(s_input)
                s_g.append(s_op)
            else:
                # Keep the input as-is; it becomes a plain scalar input of
                # the Composite.
                inputs.append(i)
                s = scalar.Scalar(i.dtype).make_variable()
                s_inputs.append(s)
                s_g.append(s)
        # If no input is an elemwise, there is nothing to fuse.
        if nb_elemwise == 0:
            # print "local_elemwise_fusion: no elemwise in inputs. Nothing to fuse."
            return False

        # The fused scalar graph must also have a c implementation.
        s_new_out = node.op.scalar_op(*s_g)
        try:
            s_new_out.owner.op.c_code(s_new_out.owner, "test_presence_of_c_code",
                                      ["x" for x in s_g],
                                      "z", {})
        except MethodNotDefined:
            _logger.info("%s does not implement the c_code function. As well as being potentially slow, this disables loop fusion of this op." % str(s_new_out.owner.op))
            return False
        except NotImplementedError:
            _logger.info("%s does not implement the c_code function. As well as being potentially slow, this disables loop fusion of this op." % str(s_new_out.owner.op))
            return False

        # Create the composite op.
        C = scalar.Composite(s_inputs, [s_new_out])

        # Create the new node.
        n = OP(C).make_node(*inputs)
        assert len(n.outputs) == 1
        assert node.outputs[0].dtype == n.outputs[0].dtype

        # There is a hard limit of 256 bytes for the formal argument list to a GPU kernel function.
        # Here, we estimate how many bytes the new Op will need, and abort if it needs too much.
        if True:
            argument_limit = 240  # 16 bytes are used for block and thread coords etc.
            # TODO: read in from architecture to make this 4 or 8
            int_size = 8
            ptr_size = 8
            argument_size = int_size  # for numels
            argument_size += int_size * inputs[0].type.ndim  # for the shape
            argument_size += sum((ptr_size + int_size * i.type.ndim) for i in n.inputs)
            argument_size += sum((ptr_size + int_size * i.type.ndim) for i in n.outputs)
            if argument_size >= argument_limit:
                _logger.info('loop fusion failed because Op would exceed kernel argument limit.')
                return False

        # print "local_elemwise_fusion: FUSED",nb_elemwise+1,"elemwise!"
        return n.outputs
    return local_fuse

# The CPU specialization, used by the default elemwise_fusion registration.
local_elemwise_fusion = local_elemwise_fusion_op(T.Elemwise)
class FusionOptimizer(Optimizer): class FusionOptimizer(Optimizer):
"""Graph optimizer for Fusion of elemwise operations""" """Graph optimizer for Fusion of elemwise operations"""
def __init__(self): def __init__(self, local_optimizer):
Optimizer.__init__(self) Optimizer.__init__(self)
self.optimizer = local_optimizer
def add_requirements(self, env): def add_requirements(self, env):
env.extend(toolbox.ReplaceValidate()) env.extend(toolbox.ReplaceValidate())
...@@ -2219,7 +2226,7 @@ class FusionOptimizer(Optimizer): ...@@ -2219,7 +2226,7 @@ class FusionOptimizer(Optimizer):
nodelist = list(env.toposort()) nodelist = list(env.toposort())
did_something = False did_something = False
for node in nodelist: for node in nodelist:
new_outputs = local_elemwise_fusion(node) new_outputs = self.optimizer(node)
if new_outputs: if new_outputs:
assert len(new_outputs) == len(node.outputs) assert len(new_outputs) == len(node.outputs)
try: try:
...@@ -2235,9 +2242,9 @@ class FusionOptimizer(Optimizer): ...@@ -2235,9 +2242,9 @@ class FusionOptimizer(Optimizer):
# Register the CPU elemwise fusion pass at step 71.00 (after the inplace
# gemm optimization at step 70 -- see the note above its registration).
if config.tensor.local_elemwise_fusion:
    _logger.debug("enabling optimization fusion elemwise in fast_run")
    compile.optdb.register('elemwise_fusion', FusionOptimizer(local_elemwise_fusion), 71.00, 'fast_run', 'fusion', 'local_elemwise_fusion')
else:
    # Still registered, but without the 'fast_run' tag: the pass only runs
    # when the 'fusion' or 'local_elemwise_fusion' tag is requested explicitly.
    _logger.debug("not enabling optimization fusion elemwise in fast_run")
    compile.optdb.register('elemwise_fusion', FusionOptimizer(local_elemwise_fusion), 71.00, 'fusion', 'local_elemwise_fusion')
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论