提交 93dfe094 authored 作者: Frederic Bastien's avatar Frederic Bastien

Refactor fusion and make it fuse one more case.

There was duplicated code to test the maximum number of parameters to a GpuElemwise op. When fusing a GpuElemwise with new inputs, we now check that we don't exceed the maximum number of inputs for that op. In the past, we made the check afterwards, and there were cases where we could still have fused in some of the inputs.
上级 8a45c933
Trunk since last release
------
* Sparse type is now supported by the shape op and the ShapeFeature optimizer work correctly with them.
* Fuse GpuElemwise more often (in the case where there are too many inputs, such that fusing all of them would exceed the 256-byte limit on parameters to a GPU function).
Theano 0.3 (2010-11-23) Theano 0.3 (2010-11-23)
----------------------- -----------------------
......
...@@ -729,8 +729,47 @@ optdb.register('InplaceGpuBlasOpt', ...@@ -729,8 +729,47 @@ optdb.register('InplaceGpuBlasOpt',
max_use_ratio=5), max_use_ratio=5),
70.0, 'fast_run', 'inplace') 70.0, 'fast_run', 'inplace')
def max_inputs_to_GpuElemwise(node):
    """Return the maximum number of inputs this GpuElemwise Apply node can accept.

    This is needed because there is currently a hard limit of 256 bytes for
    the formal parameter list of a GPU kernel function.  We measure the
    number of bytes of parameters that are always passed to the kernel,
    then compute how many inputs fit in the remaining budget.

    :param node: an Apply node whose inputs/outputs carry a ``type.ndim``
        (a GpuElemwise candidate).
    :return: the maximum number of inputs (int).
    """
    # TODO: detect the size of a GPU pointer and of a C int for the target
    # architecture instead of hard-coding 8 bytes for both.
    int_size = 8
    ptr_size = 8
    # NOTE(review): an earlier version used 240 with the note that 16 bytes
    # are used for block and thread coordinates etc. -- confirm which limit
    # the target CUDA version actually enforces.
    argument_limit = 256
    # Mandatory parameters: the element count ...
    size_param_mandatory = int_size
    # ... the shape of the first input ...
    size_param_mandatory += int_size * node.inputs[0].type.ndim
    # ... and, per output, a data pointer plus its strides.
    size_param_mandatory += sum((ptr_size + int_size * i.type.ndim)
                                for i in node.outputs)
    nb_bytes_avail = argument_limit - size_param_mandatory
    # Each extra input costs a data pointer plus one int per dimension.
    nb_bytes_per_input = (node.inputs[0].ndim * int_size) + ptr_size
    max_nb_inputs = nb_bytes_avail // nb_bytes_per_input
    return max_nb_inputs
def split_huge_add_or_mul(node):
    """Split an add/mul GpuElemwise node that has too many inputs.

    With add and mul it can happen that a node accumulates more inputs than
    nvcc can compile into a single kernel (the GPU parameter-size limit).
    We don't want nodes in the graph that can't execute, as that breaks
    DebugMode, so we rebuild the computation as a tree of smaller nodes.

    This should not happen for other GpuElemwise ops: only the fusion
    optimization can generate an op with too many inputs, and it already
    checks for that.

    :param node: an Apply node of a GpuElemwise op.
    :return: an equivalent Apply node whose input count respects the limit.
    """
    if node.op.scalar_op in (scal.add, scal.mul):
        max_nb_inputs = max_inputs_to_GpuElemwise(node)
        # Repeatedly group the inputs into chunks that fit, until the top
        # node itself respects the limit.
        while len(node.inputs) > max_nb_inputs:
            grouped = [
                node.op(*node.inputs[pos:pos + max_nb_inputs])
                for pos in range(0, len(node.inputs), max_nb_inputs)
            ]
            node = node.op(*grouped).owner
    return node
#GpuElemwise fusion #GpuElemwise fusion
gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op(GpuElemwise) gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op(GpuElemwise, max_inputs_to_GpuElemwise)
if config.gpu.local_elemwise_fusion: if config.gpu.local_elemwise_fusion:
_logger.debug("enabling optimization fusion of gpu elemwise in fast_run") _logger.debug("enabling optimization fusion of gpu elemwise in fast_run")
compile.optdb.register('gpu_elemwise_fusion', tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion), 71.00, 'fast_run', 'fusion', 'local_elemwise_fusion') compile.optdb.register('gpu_elemwise_fusion', tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion), 71.00, 'fast_run', 'fusion', 'local_elemwise_fusion')
...@@ -775,42 +814,3 @@ def local_gpualloc(node): ...@@ -775,42 +814,3 @@ def local_gpualloc(node):
#if old_out.type != new_out.type: #if old_out.type != new_out.type:
#import pdb; pdb.set_trace() #import pdb; pdb.set_trace()
return [new_out] return [new_out]
def max_inputs_to_GpuElemwise(node):
    """
    Return the maximum number of inputs this Apply node to a GpuElemwise
    can accept.

    This is needed as there is currently a limit of 256 bytes of parameters
    for a GPU function.  This measures the number of bytes of parameters we
    put in our GPU function and computes the maximum number of inputs that
    respects the 256-byte limit.
    """
    # TODO: detect the size of a GPU pointer and of a C int for the target
    # architecture instead of hard-coding 8 bytes for both.
    int_size = 8
    ptr_size = 8
    argument_limit = 256 # it was 240, with this note: 16 bytes are used for block and thread coords etc.
    size_param_mandatory = int_size #for numels
    size_param_mandatory += int_size * node.inputs[0].type.ndim # for the shape
    # Per output: a data pointer plus one int per dimension (strides).
    size_param_mandatory += sum((ptr_size + int_size * i.type.ndim) for i in node.outputs)
    nb_bytes_avail = argument_limit-size_param_mandatory
    # Each extra input costs a data pointer plus one int per dimension.
    nb_bytes_per_inputs = (node.inputs[0].ndim*int_size)+ptr_size
    max_nb_inputs = nb_bytes_avail//nb_bytes_per_inputs
    return max_nb_inputs
def split_huge_add_or_mul(node):
    """
    For add and mul GpuElemwise nodes, it can happen that we have too many
    inputs; that will make nvcc fail to compile our current code.  We don't
    want nodes in the graph that can't execute, as that breaks DebugMode.

    This should not happen for other GpuElemwise ops, as only the fusion
    optimization can generate an op with too many inputs, and it checks
    for that.
    """
    if node.op.scalar_op in (scal.add, scal.mul):
        max_nb_inputs = max_inputs_to_GpuElemwise(node)
        # Repeatedly group inputs into chunks that fit, until the top node
        # itself respects the limit.
        while len(node.inputs)>max_nb_inputs:
            inner_op = []
            for i in range(0,len(node.inputs),max_nb_inputs):
                inner_op.append(node.op(*node.inputs[i:i+max_nb_inputs]))
            node = node.op(*inner_op).owner
    return node
...@@ -155,6 +155,27 @@ def test_print_op(): ...@@ -155,6 +155,27 @@ def test_print_op():
assert topo[3].op == cuda.host_from_gpu assert topo[3].op == cuda.host_from_gpu
f(numpy.random.random((5,5)).astype('float32')) f(numpy.random.random((5,5)).astype('float32'))
def test_huge_elemwise_fusion():
    """Test that GpuElemwise fusion works correctly.

    We check that we fuse one node with only part of its inputs when there
    are too many inputs, as fusing all of them would exceed the 256-byte
    limit on GPU kernel parameters.
    """
    shape = (3, 4, 5, 6)
    # Renamed from `vars`, which shadowed the builtin.
    vs = [tensor.tanh(tensor.ftensor4()) for x in range(10)]
    f = pfunc(vs, [vs[0] - vs[1] - vs[2] - vs[3] - vs[4] - vs[5] - vs[6]],
              mode=mode_with_gpu)
    topo = f.maker.env.toposort()
    #theano.printing.debugprint(f)
    #for i, node in enumerate(topo):
    #    print >> sys.stdout, i, node
    assert len(topo) == 10
    # Only two fused GpuElemwise nodes: the fusion had to stop before
    # absorbing all inputs.
    assert sum([isinstance(node.op, cuda.GpuElemwise) for node in topo]) == 2
    assert isinstance(topo[7].op.scalar_op, theano.scalar.basic.Composite)
    assert isinstance(topo[8].op.scalar_op, theano.scalar.basic.Composite)
    # Let DebugMode catch errors by actually executing the function.
    gen = lambda: theano._asarray(numpy.random.rand(*shape), dtype='float32')
    f(gen(), gen(), gen(), gen(), gen(), gen(), gen(), gen(), gen(), gen())
def test_elemwise_fusion(): def test_elemwise_fusion():
""" Test the the GpuElemwise fusion work correctly""" """ Test the the GpuElemwise fusion work correctly"""
shape = (3,4) shape = (3,4)
......
...@@ -2919,9 +2919,13 @@ for i in range(1,len(p64)): print i, 64[i]-p64[i-1] ...@@ -2919,9 +2919,13 @@ for i in range(1,len(p64)): print i, 64[i]-p64[i-1]
# ############### # ###############
# # Loop fusion # # # Loop fusion #
# ############### # ###############
def local_elemwise_fusion_op(OP): def local_elemwise_fusion_op(OP, max_input_fct = lambda node: 1024):
""" """
We parametrise it to make it work for Elemwise and GpuElemwise op. We parametrise it to make it work for Elemwise and GpuElemwise op.
:param OP: GpuElemwise or Elemwise class (the one that we want to fuse)
:param max_input_fct: a fct that return the maximum number of input that this elemwise can take(usefull for the GpuElemwise)
""" """
def local_fuse(node): def local_fuse(node):
""" """
...@@ -2951,16 +2955,24 @@ def local_elemwise_fusion_op(OP): ...@@ -2951,16 +2955,24 @@ def local_elemwise_fusion_op(OP):
if not isinstance(node.op, OP): if not isinstance(node.op, OP):
return False return False
nb_elemwise=0
inputs=[]#inputs of the new Elemwise op. inputs=[]#inputs of the new Elemwise op.
s_inputs = []#inputs of the new scalar op. s_inputs = []#inputs of the new scalar op.
s_g=[]#graph of scalar, what will by done in the inner loop. s_g=[]#graph of scalar, what will by done in the inner loop.
# There is a hard limit of 256 bytes for the formal argument list to a GPU kernel function.
max_nb_input = max_input_fct(node)
#print len(node.inputs),max_nb_input
new_nb_input = len(node.inputs)
for i in node.inputs: for i in node.inputs:
do_fusion = False do_fusion = False
catch = False catch = False
tmp_input=[]#used to remove duplicate input. tmp_input=[]#used to remove duplicate input.
tmp_scalar=[] tmp_scalar=[]
if i.owner and isinstance(i.owner.op, OP) and len(i.clients)==1: if ((new_nb_input+1)<=max_nb_input
and i.owner
and isinstance(i.owner.op, OP)
and len(i.clients)==1):
#if the scalar_op don't have a c implementation, we skip its fusion to allow the fusion of the other ops. #if the scalar_op don't have a c implementation, we skip its fusion to allow the fusion of the other ops.
do_fusion=True do_fusion=True
try: try:
...@@ -2988,7 +3000,7 @@ def local_elemwise_fusion_op(OP): ...@@ -2988,7 +3000,7 @@ def local_elemwise_fusion_op(OP):
if do_fusion: if do_fusion:
#we should not put duplicate input into s_inputs and inputs #we should not put duplicate input into s_inputs and inputs
nb_elemwise+=1 new_nb_input+=1
inputs.extend(tmp_input) inputs.extend(tmp_input)
s_inputs.extend(tmp_scalar) s_inputs.extend(tmp_scalar)
s_g.append(s_op) s_g.append(s_op)
...@@ -3002,7 +3014,7 @@ def local_elemwise_fusion_op(OP): ...@@ -3002,7 +3014,7 @@ def local_elemwise_fusion_op(OP):
s_g.append(s) s_g.append(s)
#if no inputs have are an elemwise, there is nothing to fuse. #if no inputs have are an elemwise, there is nothing to fuse.
if nb_elemwise==0: if new_nb_input==len(node.inputs):
# print "local_elemwise_fusion: no elemwise in inputs. Nothing to fuse." # print "local_elemwise_fusion: no elemwise in inputs. Nothing to fuse."
return False return False
...@@ -3029,22 +3041,9 @@ def local_elemwise_fusion_op(OP): ...@@ -3029,22 +3041,9 @@ def local_elemwise_fusion_op(OP):
assert len(n.outputs)==1 assert len(n.outputs)==1
assert node.outputs[0].dtype==n.outputs[0].dtype assert node.outputs[0].dtype==n.outputs[0].dtype
# There is a hard limit of 256 bytes for the formal argument list to a GPU kernel function. if len(n.inputs)>max_nb_input:
# Here, we estimate how many bytes the new Op will need, and abort if it needs too much. _logger.info('loop fusion failed because Op would exceed kernel argument limit.')
if OP != T.Elemwise: return False
argument_limit = 240 # 16 bytes are used for block and thread coords etc.
#TODO: read in from architecture to make this 4 or 8
int_size = 8
ptr_size = 8
argument_size = int_size #for numels
argument_size += int_size * inputs[0].type.ndim # for the shape
argument_size += sum((ptr_size + int_size * i.type.ndim) for i in n.inputs)
argument_size += sum((ptr_size + int_size * i.type.ndim) for i in n.outputs)
if argument_size >= argument_limit:
_logger.info('loop fusion failed because Op would exceed kernel argument limit.')
return False
# print "local_elemwise_fusion: FUSED",nb_elemwise+1,"elemwise!"
#we fuse as many that we can at the same time to make debug mode faster #we fuse as many that we can at the same time to make debug mode faster
#debug mode will be faster as it won't test all intermediate step. #debug mode will be faster as it won't test all intermediate step.
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论