# NOTE(review): this chunk is a whitespace-mangled fragment of an elemwise-fusion
# graph optimizer (Theano-style local_elemwise_fusion). Keywords are fused to
# their operands ("ifdo_fusion:", "returnFalse", "exceptNotImplementedError:")
# and all indentation is lost, so the lines below are NOT valid Python as-is.
# The enclosing function, its input loop, and the try matching the except at
# the bottom are outside this view — restore from the original file; do not
# attempt to re-indent from here alone.
#
# Log why fusion is being skipped for this input: its scalar op has no C
# implementation, which (per the message) both slows execution and blocks fusion.
_logger.info("%s does not implement the c_code function. As well as being potentially slow, this disables loop fusion of this op."%str(i.owner.op.scalar_op))
# Mark this input as non-fusable.
do_fusion=False
# presumably "if do_fusion:" — fold the input's elemwise graph into the fused op.
ifdo_fusion:
# Count one more elemwise absorbed into the composite.
nb_elemwise+=1
# Splice the producer's own inputs in place of this input.
inputs.extend(i.owner.inputs)
# Collect the matching scalar-level inputs and the scalar subgraph output.
s_inputs.extend(s_input)
s_g.append(s_op)
else:
# Non-fusable input: keep it as-is and represent it by a fresh scalar
# variable of the same dtype in the scalar graph.
inputs.append(i)
s=scalar.Scalar(i.dtype).make_variable()
s_inputs.append(s)
s_g.append(s)
# If no input is an elemwise, there is nothing to fuse.
ifnb_elemwise==0:
# print "local_elemwise_fusion: no elemwise in inputs. Nothing to fuse."
# presumably "return False" — abort this rewrite attempt.
returnFalse
# TODO: use malloc and copy to transfer arguments that don't fit within the parameter space
# of 256 bytes
#
# TODO: Merge with multiple output to merge when an inputs have multiple clients. This can't be done with a local optimiser.
# TODO: Related: Support composites with multiple outputs
# Same no-C-code bailout, but for the composite's output op rather than an input.
_logger.info("%s does not implement the c_code function. As well as being potentially slow, this disables loop fusion of this op."%str(s_new_out.owner.op))
returnFalse
# NOTE(review): orphaned except — its try block is outside this chunk.
# Presumably guards a c_code availability probe on the fused scalar op.
exceptNotImplementedError:
_logger.info("%s does not implement the c_code function. As well as being potentially slow, this disables loop fusion of this op."%str(s_new_out.owner.op))
returnFalse
# NOTE(review): whitespace-mangled fragment — keywords fused to operands
# ("assertlen", "ifTrue:") and indentation lost; NOT valid Python as-is.
# Restore spacing/indentation from the original file.
#
# TODO: Use Composite to combine Elemwise and Reduce operations. We have to loop over the
# data anyway... might as well sum it up while we're at it (this can be trickier than i'm
# making it seound here. The data-traversal should be done contiguously, and the summing-up
# might not be easy or worthwhile if the summation axis doesn't line up with a contiguous
# dimension)
# Create the composite scalar op from the collected scalar inputs/output.
C=scalar.Composite(s_inputs,[s_new_out])
# Create the new fused Elemwise node over the gathered tensor inputs.
n=T.Elemwise(C).make_node(*inputs)
# presumably "assert len(n.outputs) == 1" — fusion currently supports a single output.
assertlen(n.outputs)==1
# presumably "assert node.outputs[0].dtype == n.outputs[0].dtype" — the fused
# node must preserve the replaced node's output dtype ("node" is defined in the
# enclosing optimizer, outside this view).
assertnode.outputs[0].dtype==n.outputs[0].dtype
# There is a hard limit of 256 bytes for the formal argument list to a GPU kernel function.
# Here, we estimate how many bytes the new Op will need, and abort if it needs too much.
# presumably "if True:" — scoping block for the size estimate.
ifTrue:
argument_limit=240# 16 bytes are used for block and thread coords etc.
#TODO: read in from architecture to make this 4 or 8
# Assumed 64-bit sizes for ints and pointers (see TODO above) — verify per target.
int_size=8
ptr_size=8
argument_size=int_size#for numels
argument_size+=int_size*inputs[0].type.ndim# for the shape
# NOTE(review): this chunk is a near-duplicate of an earlier fusion-decision
# fragment and is equally whitespace-mangled (fused keywords, no indentation);
# NOT valid Python as-is. The enclosing loop and the try for the except below
# are outside this view — restore from the original file.
#
# Skip fusion for this input: its scalar op lacks a C implementation.
_logger.info("%s does not implement the c_code function. As well as being potentially slow, this disables loop fusion of this op."%str(i.owner.op.scalar_op))
do_fusion=False
# presumably "if do_fusion:" — absorb this input's elemwise producer.
ifdo_fusion:
nb_elemwise+=1
inputs.extend(i.owner.inputs)
s_inputs.extend(s_input)
s_g.append(s_op)
else:
# Keep the input as-is; mirror it with a fresh scalar variable of the same dtype.
inputs.append(i)
s=scalar.Scalar(i.dtype).make_variable()
s_inputs.append(s)
s_g.append(s)
# If no input is an elemwise, there is nothing to fuse.
ifnb_elemwise==0:
# print "local_elemwise_fusion: no elemwise in inputs. Nothing to fuse."
# Bail out: log the no-C-code condition and abort the rewrite.
_logger.info("%s does not implement the c_code function. As well as being potentially slow, this disables loop fusion of this op."%str(s_new_out.owner.op))
returnFalse
# NOTE(review): orphaned except — its try block is outside this chunk.
exceptNotImplementedError:
_logger.info("%s does not implement the c_code function. As well as being potentially slow, this disables loop fusion of this op."%str(s_new_out.owner.op))
returnFalse
# NOTE(review): near-duplicate of an earlier composite-construction fragment,
# equally whitespace-mangled ("assertlen", "ifTrue:", no indentation); NOT
# valid Python as-is. Here the fused node is built via "OP" (an op factory
# supplied by the enclosing scope, outside this view) rather than T.Elemwise.
#
# Create the composite scalar op from the collected scalar inputs/output.
C=scalar.Composite(s_inputs,[s_new_out])
# Create the new fused node over the gathered tensor inputs.
n=OP(C).make_node(*inputs)
# presumably "assert len(n.outputs) == 1" — single-output fusion only.
assertlen(n.outputs)==1
# presumably "assert node.outputs[0].dtype == n.outputs[0].dtype" — dtype of the
# replaced node must be preserved ("node" comes from the enclosing optimizer).
assertnode.outputs[0].dtype==n.outputs[0].dtype
# There is a hard limit of 256 bytes for the formal argument list to a GPU kernel function.
# Here, we estimate how many bytes the new Op will need, and abort if it needs too much.
# presumably "if True:" — scoping block for the size estimate.
ifTrue:
argument_limit=240# 16 bytes are used for block and thread coords etc.
#TODO: read in from architecture to make this 4 or 8
# Assumed 64-bit sizes for ints and pointers (see TODO above) — verify per target.
int_size=8
ptr_size=8
argument_size=int_size#for numels
argument_size+=int_size*inputs[0].type.ndim# for the shape