Commit 9afcee26 authored by Frederic Bastien

Created a fusion optimizer for the GPU.

Parent f1a3bae9
......@@ -45,6 +45,11 @@ AddConfigVar('tensor.local_elemwise_fusion',
"Enable or not in fast_run mode(fast_run optimization) the elemwise fusion optimization",
BoolParam(True))
AddConfigVar('gpu.local_elemwise_fusion',
"Enable or not in fast_run mode(fast_run optimization) the gpu elemwise fusion optimization",
BoolParam(True))
#http://developer.amd.com/CPU/LIBRARIES/LIBM/Pages/default.aspx
AddConfigVar('lib.amdlibm',
"Use amd's amdlibm numerical library",
BoolParam(False))
......
import logging
_logger = logging.getLogger('theano.sandbox.cuda.opt')
import sys
import theano
import numpy
......@@ -569,3 +572,21 @@ def local_gpu_join(node):
# After destroyhandler is in but before we try to make elemwise things inplace
# Try to make gpu gemm inplace
# Also, need to make the gemm optimisation(step 70) happen before the fusion of elemwise(step 71)
#optdb.register('InplaceGpuBlasOpt',
# EquilibriumOptimizer([local_inplace_gemm], failure_callback=EquilibriumOptimizer.warn_inplace,
# max_use_ratio=5),
# 70.0, 'fast_run', 'inplace')
# GpuElemwise fusion: specialise the generic elemwise fusion optimizer
# for the GPU back-end by parametrising it with GpuElemwise.
gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op(GpuElemwise)
if config.gpu.local_elemwise_fusion:
    _logger.debug("enabling optimization fusion of gpu elemwise in fast_run")
    # Registered with 'fast_run' so it runs under the default fast_run mode.
    fuse_tags = ('fast_run', 'fusion', 'local_elemwise_fusion')
else:
    _logger.debug("not enabling optimization fusion of gpu elemwise in fast_run")
    # Still registered (so it can be requested by tag), but not in fast_run.
    fuse_tags = ('fusion', 'local_elemwise_fusion')
# Step 71.00 keeps this after the gemm optimisation (step 70) — gemm must
# happen before elemwise fusion (see the note above about optimisation order).
compile.optdb.register('gpu_elemwise_fusion',
                       tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion),
                       71.00, *fuse_tags)
......@@ -7,8 +7,8 @@ import numpy
# Skip test if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest
import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available == False:
import theano.sandbox.cuda as cuda
if cuda.cuda_available == False:
raise SkipTest('Optional package cuda disabled')
import theano.compile.mode
......@@ -76,8 +76,8 @@ def test_opt_gpujoin_onlyajoin():
# from a bug in normal sampling
_a = numpy.asarray([[1,2],[3,4]],dtype='float32')
_b = numpy.asarray([[5,6,7],[8,9,10]],dtype='float32')
a = cuda_ndarray.shared_constructor(_a)
b = cuda_ndarray.shared_constructor(_b)
a = cuda.shared_constructor(_a)
b = cuda.shared_constructor(_b)
c = tensor.join(1,a,b)
......@@ -100,8 +100,8 @@ def test_opt_gpujoin_joinvectors_elemwise_then_minusone():
# from a bug in gpu normal sampling
_a = numpy.asarray([1,2,3,4],dtype='float32')
_b = numpy.asarray([5,6,7,8],dtype='float32')
a = cuda_ndarray.shared_constructor(_a)
b = cuda_ndarray.shared_constructor(_b)
a = cuda.shared_constructor(_a)
b = cuda.shared_constructor(_b)
a_prime = tensor.cos(a)
b_prime = tensor.sin(b)
......@@ -125,6 +125,20 @@ def test_opt_gpujoin_joinvectors_elemwise_then_minusone():
assert numpy.allclose(numpy.asarray(f()), concat)
def test_elemwise_fusion():
    """Test that GpuElemwise fusion works correctly.

    Builds a+b+c (two consecutive elemwise adds) and checks that the
    compiled graph contains a single fused GpuElemwise whose scalar_op
    is a Composite, instead of two separate adds.
    """
    shape = (3,4)
    # 'a' lives on the GPU so the whole computation is pulled onto the GPU.
    a = cuda.shared_constructor(theano._asarray(numpy.random.rand(*shape), dtype='float32'), 'a')
    b = tensor.fmatrix()
    c = tensor.fmatrix()
    f = pfunc([b,c], [a+b+c], mode=mode_with_gpu)
    topo = f.maker.env.toposort()
    for i, node in enumerate(topo):
        print >> sys.stdout, i, node
    # Expected: 2 transfers to the GPU, 1 fused GpuElemwise, 1 transfer back.
    # TODO confirm exact node breakdown — assumes the standard transfer opts.
    assert len(topo)==4
    # The fused node's scalar op must be a Composite (proof fusion happened).
    assert isinstance(topo[2].op.scalar_op,theano.scalar.basic.Composite)
    #let debugmode catch errors
    f(theano._asarray(numpy.random.rand(*shape), dtype='float32'), theano._asarray(numpy.random.rand(*shape), dtype='float32'))
if __name__ == '__main__':
......
......@@ -2096,118 +2096,125 @@ register_canonicalize(local_transposed_dot, name='local_transposed_dot')
# ###############
# # Loop fusion #
# ###############
def local_elemwise_fusion_op(OP):
    """Create a local optimizer that fuses consecutive elemwise ops.

    As part of specialisation, we fuse two consecutive elemwise ops of the
    same shape into one op whose scalar computation is a Composite.  For
    mixed dtypes, we let the Composite op do the cast (it lets the C
    compiler do the actual conversion).  The number of dimensions is
    validated at call time by theano itself.

    :param OP: the elemwise op class to match and to build, e.g.
        ``T.Elemwise`` or ``GpuElemwise``.  Parametrising the class makes
        the same code work for both the CPU and the GPU back-end.
    :return: a local optimizer function suitable for ``FusionOptimizer``.
    """
    # META TODO: PUT THESE THINGS IN TRAC, NOT TODO NOTES!!
    # TODO: use broadcast flag?

    # TODO: don't do this optimization as a localOptimizer.  Analyze the
    # graph in terms of elemwise subgraphs, and then replace each subgraph
    # with a Composite version.

    # TODO: use malloc and copy to transfer arguments that don't fit within
    # the parameter space of 256 bytes.

    # TODO: merge with multiple outputs to fuse when an input has multiple
    # clients.  This can't be done with a local optimiser.
    # TODO: related: support composites with multiple outputs.

    # TODO: use Composite to combine Elemwise and Reduce operations.  We
    # have to loop over the data anyway... might as well sum it up while
    # we're at it (this can be trickier than it sounds here.  The
    # data-traversal should be done contiguously, and the summing-up might
    # not be easy or worthwhile if the summation axis doesn't line up with
    # a contiguous dimension).

    def local_fuse(node):
        """Try to fuse `node` with the fusable elemwise ops among its inputs.

        Returns the outputs of the fused replacement node on success,
        ``False`` when nothing could be fused.
        """
        if not isinstance(node.op, OP):
            return False
        nb_elemwise = 0
        inputs = []    # inputs of the new Elemwise op.
        s_inputs = []  # inputs of the new scalar op.
        s_g = []       # graph of scalars: what will be done in the inner loop.
        for i in node.inputs:
            do_fusion = False
            catch = False
            # Only fuse an input produced by the same kind of elemwise op,
            # and only when this node is its sole client (otherwise the
            # producer's work would be duplicated).
            if i.owner and isinstance(i.owner.op, OP) and len(i.clients) <= 1:
                do_fusion = True
                # If the scalar_op doesn't have a C implementation, we skip
                # its fusion to allow the fusion of the other ops.
                try:
                    s_input = [scalar.Scalar(x.dtype).make_variable()
                               for x in i.owner.inputs]
                    s_op = i.owner.op.scalar_op(*s_input)
                    i.owner.op.scalar_op.c_code(s_op.owner,
                                                "test_presence_of_c_code",
                                                ["x" for x in i.owner.inputs],
                                                "z", {})
                except (MethodNotDefined, NotImplementedError):
                    catch = True
                if catch:
                    _logger.info(("%s does not implement the c_code function."
                                  " As well as being potentially slow, this"
                                  " disables loop fusion of this op.")
                                 % str(i.owner.op.scalar_op))
                    do_fusion = False
            if do_fusion:
                nb_elemwise += 1
                inputs.extend(i.owner.inputs)
                s_inputs.extend(s_input)
                s_g.append(s_op)
            else:
                # Not fusable: the input becomes a plain scalar input of the
                # future Composite.
                inputs.append(i)
                s = scalar.Scalar(i.dtype).make_variable()
                s_inputs.append(s)
                s_g.append(s)
        # If no input is an elemwise, there is nothing to fuse.
        if nb_elemwise == 0:
            return False

        s_new_out = node.op.scalar_op(*s_g)
        # The combined scalar op must itself have a C implementation.
        try:
            s_new_out.owner.op.c_code(s_new_out.owner,
                                      "test_presence_of_c_code",
                                      ["x" for x in s_g],
                                      "z", {})
        except (MethodNotDefined, NotImplementedError):
            _logger.info(("%s does not implement the c_code function."
                          " As well as being potentially slow, this"
                          " disables loop fusion of this op.")
                         % str(s_new_out.owner.op))
            return False

        # Create the composite op.
        C = scalar.Composite(s_inputs, [s_new_out])
        # Create the new node.
        n = OP(C).make_node(*inputs)
        assert len(n.outputs) == 1
        assert node.outputs[0].dtype == n.outputs[0].dtype

        # There is a hard limit of 256 bytes for the formal argument list to
        # a GPU kernel function.  Here, we estimate how many bytes the new
        # Op will need, and abort if it needs too much.
        argument_limit = 240  # 16 bytes are used for block and thread coords etc.
        # TODO: read in from architecture to make this 4 or 8
        int_size = 8
        ptr_size = 8
        argument_size = int_size  # for numels
        argument_size += int_size * inputs[0].type.ndim  # for the shape
        argument_size += sum((ptr_size + int_size * i.type.ndim)
                             for i in n.inputs)
        argument_size += sum((ptr_size + int_size * i.type.ndim)
                             for i in n.outputs)
        if argument_size >= argument_limit:
            _logger.info('loop fusion failed because Op would exceed kernel'
                         ' argument limit.')
            return False

        # print "local_elemwise_fusion: FUSED",nb_elemwise+1,"elemwise!"
        return n.outputs
    return local_fuse

local_elemwise_fusion = local_elemwise_fusion_op(T.Elemwise)
class FusionOptimizer(Optimizer):
"""Graph optimizer for Fusion of elemwise operations"""
def __init__(self):
def __init__(self, local_optimizer):
Optimizer.__init__(self)
self.optimizer = local_optimizer
    def add_requirements(self, env):
        # Fusion replaces subgraphs in place, so the env must support
        # validated replacement.
        env.extend(toolbox.ReplaceValidate())
......@@ -2219,7 +2226,7 @@ class FusionOptimizer(Optimizer):
nodelist = list(env.toposort())
did_something = False
for node in nodelist:
new_outputs = local_elemwise_fusion(node)
new_outputs = self.optimizer(node)
if new_outputs:
assert len(new_outputs) == len(node.outputs)
try:
......@@ -2235,9 +2242,9 @@ class FusionOptimizer(Optimizer):
if config.tensor.local_elemwise_fusion:
    _logger.debug("enabling optimization fusion elemwise in fast_run")
    # Registered with 'fast_run' so it runs under the default fast_run mode.
    fuse_tags = ('fast_run', 'fusion', 'local_elemwise_fusion')
else:
    _logger.debug("not enabling optimization fusion elemwise in fast_run")
    # Still registered (so it can be requested by tag), but not in fast_run.
    fuse_tags = ('fusion', 'local_elemwise_fusion')
# Step 71.00: after the gemm optimisation (step 70), which must run first.
compile.optdb.register('elemwise_fusion',
                       FusionOptimizer(local_elemwise_fusion),
                       71.00, *fuse_tags)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论