提交 dcb5e098 authored 作者: lamblin's avatar lamblin

Merge pull request #1393 from nouiz/fcontig

Elemwise faster when c or f contig inputs
......@@ -11,3 +11,4 @@ Theano Design and Implementation Documentation
tensor
scan
compat
......@@ -18,8 +18,8 @@
:note: see :func:`ultra_fast_sigmoid` or :func:`hard_sigmoid` for faster version.
Speed comparison for 100M float64 element on a Core2 Duo @ 3.16 GHz.
- hard_sigmoid: 1.1s
- ultra_fast_sigmoid: 1.4s
- hard_sigmoid: 1.0s
- ultra_fast_sigmoid: 1.3s
- sigmoid (with amdlibm): 2.3s
- sigmoid (without amdlibm): 3.7s
......
......@@ -1662,7 +1662,8 @@ class Pow(BinaryScalarOp):
if (node.inputs[0].type == node.outputs[0].type and
node.inputs[1].type == node.outputs[0].type and
# amdlibm 3.0 do not have a float64 version of this SIMD function
node.inputs[0].dtype == 'float32'):
node.inputs[0].dtype == 'float32' and
node.inputs[1].dtype == 'float32'):
dtype = 'float'
fct = "amd_vrsa_powf"
return """
......@@ -1677,7 +1678,8 @@ class Pow(BinaryScalarOp):
node.inputs[1].dtype == node.outputs[0].dtype and
all(node.inputs[1].broadcastable) and
# amdlibm 3.0 do not have a float64 version of this SIMD function
node.inputs[0].dtype == 'float32'):
node.inputs[0].dtype == 'float32' and
node.inputs[1].dtype == 'float32'):
dtype = 'float'
fct = "amd_vrsa_powxf"
return """
......
......@@ -1011,6 +1011,17 @@ class Elemwise(Op):
decl = cgen.make_declare(orders, idtypes, sub)
checks = cgen.make_checks(orders, idtypes, sub)
# Check if all inputs (except broadcasted scalar) are fortran.
# In that case, create an fortran output ndarray.
z = zip(inames, inputs)
alloc_fortran = ' && '.join(["PyArray_ISFORTRAN(%s)" % arr
for arr, var in z
if not all(var.broadcastable)])
# If it is a scalar, make it c contig to prevent problem with
# NumPy C and F contig not always set as both of them.
if len(alloc_fortran) == 0:
alloc_fortran = '0'
alloc = ""
# We loop over the "real" outputs, i.e., those that are not
# inplace (must be allocated) and we declare/allocate/check
......@@ -1022,7 +1033,8 @@ class Elemwise(Op):
sub['olv'] = oname
alloc += cgen.make_declare([range(nnested)], [odtype],
dict(sub, lv0=oname))
alloc += cgen.make_alloc(orders, odtype, sub)
alloc += cgen.make_alloc(orders, odtype, sub,
fortran=alloc_fortran)
alloc += cgen.make_checks([range(nnested)], [odtype],
dict(sub, lv0=oname))
olv_index = i # index of the last output
......@@ -1079,7 +1091,10 @@ class Elemwise(Op):
%(undefs)s
}
""" % locals()
if all([o.ndim <= 1 for o in node.outputs]):
if all([o.ndim <= 1 for o in node.outputs] or
# Use simpler code when output ndim == 0 or 1
# or for broadcated scalar.
all(node.outputs[0].broadcastable)):
if nnested:
all_code = [("", "")] * (nnested - 1) + [("", code)] + [""]
else:
......@@ -1100,8 +1115,11 @@ class Elemwise(Op):
# If all inputs and outputs are contiguous
# and the scalar op define optimized code for that case
# use it!
if all([o.ndim >= 1 for o in node.outputs]):
# use it! The scalar_op need to check the broadcast flag himself.
if (all([o.ndim >= 1 for o in node.outputs]) and
# Don't use the contig code for broadcasted scalar.
not all(node.outputs[0].broadcastable)):
contig = None
try:
contig = self.scalar_op.c_code_contiguous(
node,
......@@ -1109,19 +1127,54 @@ class Elemwise(Op):
_inames,
onames,
sub)
# PyArray_ISONESEGMENT(arr)
# return true if arr is fortran or c contiguous.
cond = ' && '.join(["PyArray_ISONESEGMENT(%s)" % arr
for arr in _inames + onames])
except theano.gof.utils.MethodNotDefined:
# Try to make one generic version, this will help the
# compiler to vectorize the code as their won't be as
# many ptr and the stride will be hard coded.
if all([io.broadcastable == node.outputs[0].broadcastable or
all(io.broadcastable)
for io in node.inputs + node.outputs]):
z = onames[0]
contig = """
// All output have the same size
npy_intp n = PyArray_SIZE(%(z)s);
""" % locals()
index = ""
for x, var in zip(inames + onames,
inputs + node.outputs):
if not all(var.broadcastable):
contig += """
dtype_%(x)s * %(x)s_ptr = (dtype_%(x)s*) PyArray_DATA(%(x)s);
""" % locals()
index += """
dtype_%(x)s& %(x)s_i = %(x)s_ptr[i];
""" % locals()
else:
contig += """
dtype_%(x)s& %(x)s_i = ((dtype_%(x)s*) PyArray_DATA(%(x)s))[0];
""" % locals()
contig += """
for(int i=0; i<n; i++){
%(index)s
%(task_code)s;
}
""" % locals()
if contig is not None:
z = zip(inames + onames, inputs + node.outputs)
cond1 = ' && '.join(["PyArray_ISCONTIGUOUS(%s)" % arr
for arr, var in z
if not all(var.broadcastable)])
cond2 = ' && '.join(["PyArray_ISFORTRAN(%s)" % arr
for arr, var in z
if not all(var.broadcastable)])
loop = """
if(%(cond)s){
if((%(cond1)s) || (%(cond2)s)){
%(contig)s
}else{
%(loop)s
}
""" % locals()
except theano.gof.utils.MethodNotDefined:
pass
return decl, checks, alloc, loop
def c_code(self, node, nodename, inames, onames, sub):
......@@ -1140,7 +1193,7 @@ class Elemwise(Op):
return support_code
def c_code_cache_version_apply(self, node):
version = [8] # the version corresponding to the c code in this Op
version = [11] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend...
scalar_node = Apply(self.scalar_op,
......
......@@ -113,9 +113,13 @@ def make_checks(loop_orders, dtypes, sub):
return init % sub + check % sub
def make_alloc(loop_orders, dtype, sub):
"""
Generate C code to allocate outputs.
def make_alloc(loop_orders, dtype, sub, fortran='0'):
"""Generate C code to allocate outputs.
:param fortran: a string included in the generated code. If it
evaludate to non-zero, an ndarray in fortran order will be
created, otherwise it will be c order.
"""
nd = len(loop_orders[0])
......@@ -133,7 +137,6 @@ def make_alloc(loop_orders, dtype, sub):
break
else:
init_dims += "dims[%(i)s] = 1;\n" % locals()
#raise Exception("For each looping dimension, at least one input must have a non-broadcastable dimension.")
# TODO: it would be interesting to allocate the output in such a
# way that its contiguous dimensions match one of the input's
......@@ -146,7 +149,9 @@ def make_alloc(loop_orders, dtype, sub):
//npy_intp* dims = (npy_intp*)malloc(%(nd)s * sizeof(npy_intp));
%(init_dims)s
if (!%(olv)s) {
%(olv)s = (PyArrayObject*)PyArray_EMPTY(%(nd)s, dims, type_num_%(olv)s, 0);
%(olv)s = (PyArrayObject*)PyArray_EMPTY(%(nd)s, dims,
type_num_%(olv)s,
%(fortran)s);
}
else {
PyArray_Dims new_dims;
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论