提交 ba38ab4b authored 作者: Frederic's avatar Frederic

Elemwise output is now fortran when all inputs are fortran.

This speeds up hard_sigmoid and ultra_fast_sigmoid by 2x. We ignore inputs that are broadcasted scalars when checking for Fortran inputs.
上级 f982cbd7
...@@ -18,8 +18,8 @@ ...@@ -18,8 +18,8 @@
:note: see :func:`ultra_fast_sigmoid` or :func:`hard_sigmoid` for faster version. :note: see :func:`ultra_fast_sigmoid` or :func:`hard_sigmoid` for faster version.
Speed comparison for 100M float64 element on a Core2 Duo @ 3.16 GHz. Speed comparison for 100M float64 element on a Core2 Duo @ 3.16 GHz.
- hard_sigmoid: 1.1s - hard_sigmoid: 1.0s
- ultra_fast_sigmoid: 1.4s - ultra_fast_sigmoid: 1.3s
- sigmoid (with amdlibm): 2.3s - sigmoid (with amdlibm): 2.3s
- sigmoid (without amdlibm): 3.7s - sigmoid (without amdlibm): 3.7s
......
...@@ -1011,6 +1011,17 @@ class Elemwise(Op): ...@@ -1011,6 +1011,17 @@ class Elemwise(Op):
decl = cgen.make_declare(orders, idtypes, sub) decl = cgen.make_declare(orders, idtypes, sub)
checks = cgen.make_checks(orders, idtypes, sub) checks = cgen.make_checks(orders, idtypes, sub)
# Check if all inputs (except broadcasted scalar) are fortran.
# In that case, create a Fortran-ordered output ndarray.
z = zip(inames, inputs)
alloc_fortran = ' && '.join(["PyArray_ISFORTRAN(%s)" % arr
for arr, var in z
if not all(var.broadcastable)])
# If every input is a broadcasted scalar, make the output C contiguous to
# avoid problems with NumPy not always setting both the C and F contiguous flags.
if len(alloc_fortran) == 0:
alloc_fortran = '0'
alloc = "" alloc = ""
# We loop over the "real" outputs, i.e., those that are not # We loop over the "real" outputs, i.e., those that are not
# inplace (must be allocated) and we declare/allocate/check # inplace (must be allocated) and we declare/allocate/check
...@@ -1022,7 +1033,8 @@ class Elemwise(Op): ...@@ -1022,7 +1033,8 @@ class Elemwise(Op):
sub['olv'] = oname sub['olv'] = oname
alloc += cgen.make_declare([range(nnested)], [odtype], alloc += cgen.make_declare([range(nnested)], [odtype],
dict(sub, lv0=oname)) dict(sub, lv0=oname))
alloc += cgen.make_alloc(orders, odtype, sub) alloc += cgen.make_alloc(orders, odtype, sub,
fortran=alloc_fortran)
alloc += cgen.make_checks([range(nnested)], [odtype], alloc += cgen.make_checks([range(nnested)], [odtype],
dict(sub, lv0=oname)) dict(sub, lv0=oname))
olv_index = i # index of the last output olv_index = i # index of the last output
...@@ -1176,7 +1188,7 @@ class Elemwise(Op): ...@@ -1176,7 +1188,7 @@ class Elemwise(Op):
return support_code return support_code
def c_code_cache_version_apply(self, node): def c_code_cache_version_apply(self, node):
version = [9] # the version corresponding to the c code in this Op version = [10] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend... # now we insert versions for the ops on which we depend...
scalar_node = Apply(self.scalar_op, scalar_node = Apply(self.scalar_op,
......
...@@ -113,9 +113,12 @@ def make_checks(loop_orders, dtypes, sub): ...@@ -113,9 +113,12 @@ def make_checks(loop_orders, dtypes, sub):
return init % sub + check % sub return init % sub + check % sub
def make_alloc(loop_orders, dtype, sub): def make_alloc(loop_orders, dtype, sub, fortran='0'):
""" """Generate C code to allocate outputs.
Generate C code to allocate outputs.
:param fortran: a C expression given as a string; if it evaluates to
non-zero, the output ndarray is created in Fortran order.
""" """
nd = len(loop_orders[0]) nd = len(loop_orders[0])
...@@ -133,7 +136,6 @@ def make_alloc(loop_orders, dtype, sub): ...@@ -133,7 +136,6 @@ def make_alloc(loop_orders, dtype, sub):
break break
else: else:
init_dims += "dims[%(i)s] = 1;\n" % locals() init_dims += "dims[%(i)s] = 1;\n" % locals()
#raise Exception("For each looping dimension, at least one input must have a non-broadcastable dimension.")
# TODO: it would be interesting to allocate the output in such a # TODO: it would be interesting to allocate the output in such a
# way that its contiguous dimensions match one of the input's # way that its contiguous dimensions match one of the input's
...@@ -146,7 +148,9 @@ def make_alloc(loop_orders, dtype, sub): ...@@ -146,7 +148,9 @@ def make_alloc(loop_orders, dtype, sub):
//npy_intp* dims = (npy_intp*)malloc(%(nd)s * sizeof(npy_intp)); //npy_intp* dims = (npy_intp*)malloc(%(nd)s * sizeof(npy_intp));
%(init_dims)s %(init_dims)s
if (!%(olv)s) { if (!%(olv)s) {
%(olv)s = (PyArrayObject*)PyArray_EMPTY(%(nd)s, dims, type_num_%(olv)s, 0); %(olv)s = (PyArrayObject*)PyArray_EMPTY(%(nd)s, dims,
type_num_%(olv)s,
%(fortran)s);
} }
else { else {
PyArray_Dims new_dims; PyArray_Dims new_dims;
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论