_logger.info("%s does not implement the c_code function. As well as being potentially slow, this disables loop fusion of this op."%str(i.owner.op.scalar_op))
do_fusion=False
ifdo_fusion:
nb_elemwise+=1
inputs.extend(i.owner.inputs)
s_inputs.extend(s_input)
s_g.append(s_op)
else:
inputs.append(i)
s=scalar.Scalar(i.dtype).make_variable()
s_inputs.append(s)
s_g.append(s)
#if none of the inputs is an elemwise, there is nothing to fuse.
ifnb_elemwise==0:
# print "local_elemwise_fusion: no elemwise in inputs. Nothing to fuse."
returnFalse
otype=node.outputs[0].type
# TODO: Use Composite to combine Elemwise and Reduce operations. We have to loop over the
s_new_out=node.op.scalar_op(*s_g)
# data anyway... might as well sum it up while we're at it (this can be trickier than i'm
try:
# making it sound here. The data-traversal should be done contiguously, and the summing-up
# might not be easy or worthwhile if the summation axis doesn't line up with a contiguous
["x"forxins_g],
# dimension)
"z",{})
exceptMethodNotDefined:
_logger.info("%s does not implement the c_code function. As well as being potentially slow, this disables loop fusion of this op."%str(s_new_out.owner.op))
returnFalse
exceptNotImplementedError:
_logger.info("%s does not implement the c_code function. As well as being potentially slow, this disables loop fusion of this op."%str(s_new_out.owner.op))
returnFalse
#create the composite op.
ifnotisinstance(node.op,OP):
C=scalar.Composite(s_inputs,[s_new_out])
returnFalse
nb_elemwise=0
#create the new node.
inputs=[]#inputs of the new Elemwise op.
n=T.Elemwise(C).make_node(*inputs)
s_inputs=[]#inputs of the new scalar op.
assertlen(n.outputs)==1
s_g=[]#graph of scalar, what will by done in the inner loop.
assertnode.outputs[0].dtype==n.outputs[0].dtype
foriinnode.inputs:
do_fusion=False
# There is a hard limit of 256 bytes for the formal argument list to a GPU kernel function.
catch=False
# Here, we estimate how many bytes the new Op will need, and abort if it needs too much.
_logger.info('loop fusion failed because Op would exceed kernel argument limit.')
exceptNotImplementedError:
catch=True
ifcatch:
_logger.info("%s does not implement the c_code function. As well as being potentially slow, this disables loop fusion of this op."%str(i.owner.op.scalar_op))
do_fusion=False
ifdo_fusion:
nb_elemwise+=1
inputs.extend(i.owner.inputs)
s_inputs.extend(s_input)
s_g.append(s_op)
else:
inputs.append(i)
s=scalar.Scalar(i.dtype).make_variable()
s_inputs.append(s)
s_g.append(s)
#if none of the inputs is an elemwise, there is nothing to fuse.
ifnb_elemwise==0:
# print "local_elemwise_fusion: no elemwise in inputs. Nothing to fuse."
_logger.info("%s does not implement the c_code function. As well as being potentially slow, this disables loop fusion of this op."%str(s_new_out.owner.op))
returnFalse
exceptNotImplementedError:
_logger.info("%s does not implement the c_code function. As well as being potentially slow, this disables loop fusion of this op."%str(s_new_out.owner.op))
returnFalse
#create the composite op.
C=scalar.Composite(s_inputs,[s_new_out])
#create the new node.
n=OP(C).make_node(*inputs)
assertlen(n.outputs)==1
assertnode.outputs[0].dtype==n.outputs[0].dtype
# There is a hard limit of 256 bytes for the formal argument list to a GPU kernel function.
# Here, we estimate how many bytes the new Op will need, and abort if it needs too much.
ifTrue:
argument_limit=240# 16 bytes are used for block and thread coords etc.
#TODO: read in from architecture to make this 4 or 8
int_size=8
ptr_size=8
argument_size=int_size#for numels
argument_size+=int_size*inputs[0].type.ndim# for the shape