Elemwise output is now fortran when all inputs are fortran.

This speeds up hard_sigmoid and ultra_fast_sigmoid by 2x. We ignore inputs that are broadcasted scalars when checking for fortran inputs.

Elemwise output is now fortran when all inputs are fortran.
ba38ab4b · Frederic · f982cbd7 · ba38ab4b · ba38ab4b · ba38ab4b
--- a/doc/library/tensor/nnet/nnet.txt
+++ b/doc/library/tensor/nnet/nnet.txt
@@ -18,8 +18,8 @@
    :note: see :func:`ultra_fast_sigmoid` or :func:`hard_sigmoid` for faster version.
        Speed comparison for 100M float64 element on a Core2 Duo @ 3.16 GHz.

-          - hard_sigmoid: 1.1s
-          - ultra_fast_sigmoid: 1.4s
+          - hard_sigmoid: 1.0s
+          - ultra_fast_sigmoid: 1.3s
          - sigmoid (with amdlibm): 2.3s
          - sigmoid (without amdlibm): 3.7s


--- a/theano/tensor/elemwise.py
+++ b/theano/tensor/elemwise.py
@@ -1011,6 +1011,17 @@ class Elemwise(Op):
        decl = cgen.make_declare(orders, idtypes, sub)
        checks = cgen.make_checks(orders, idtypes, sub)

+        # Check if all inputs (except broadcasted scalars) are fortran.
+        # In that case, create a fortran output ndarray.
+        z = zip(inames, inputs)
+        alloc_fortran = ' && '.join(["PyArray_ISFORTRAN(%s)" % arr
+                                     for arr, var in z
+                                     if not all(var.broadcastable)])
+        # If every input is a broadcasted scalar, make the output c contig to
+        # avoid problems with NumPy not always setting both C and F flags.
+        if len(alloc_fortran) == 0:
+            alloc_fortran = '0'
+
        alloc = ""
        # We loop over the "real" outputs, i.e., those that are not
        # inplace (must be allocated) and we declare/allocate/check
@@ -1022,7 +1033,8 @@ class Elemwise(Op):
            sub['olv'] = oname
            alloc += cgen.make_declare([range(nnested)], [odtype],
                                       dict(sub, lv0=oname))
-            alloc += cgen.make_alloc(orders, odtype, sub)
+            alloc += cgen.make_alloc(orders, odtype, sub,
+                                     fortran=alloc_fortran)
            alloc += cgen.make_checks([range(nnested)], [odtype],
                                      dict(sub, lv0=oname))
        olv_index = i  # index of the last output
@@ -1176,7 +1188,7 @@ class Elemwise(Op):
        return support_code

    def c_code_cache_version_apply(self, node):
-        version = [9]  # the version corresponding to the c code in this Op
+        version = [10]  # the version corresponding to the c code in this Op

        # now we insert versions for the ops on which we depend...
        scalar_node = Apply(self.scalar_op,

--- a/theano/tensor/elemwise_cgen.py
+++ b/theano/tensor/elemwise_cgen.py
@@ -113,9 +113,12 @@ def make_checks(loop_orders, dtypes, sub):
    return init % sub + check % sub


-def make_alloc(loop_orders, dtype, sub):
-    """
-    Generate C code to allocate outputs.
+def make_alloc(loop_orders, dtype, sub, fortran='0'):
+    """Generate C code to allocate outputs.
+
+    :param fortran: if non-zero, will create an ndarray in fortran
+        order.
+
    """

    nd = len(loop_orders[0])
@@ -133,7 +136,6 @@ def make_alloc(loop_orders, dtype, sub):
                break
        else:
            init_dims += "dims[%(i)s] = 1;\n" % locals()
-            #raise Exception("For each looping dimension, at least one input must have a non-broadcastable dimension.")

    # TODO: it would be interesting to allocate the output in such a
    # way that its contiguous dimensions match one of the input's
@@ -146,7 +148,9 @@ def make_alloc(loop_orders, dtype, sub):
        //npy_intp* dims = (npy_intp*)malloc(%(nd)s * sizeof(npy_intp));
        %(init_dims)s
        if (!%(olv)s) {
-            %(olv)s = (PyArrayObject*)PyArray_EMPTY(%(nd)s, dims, type_num_%(olv)s, 0);
+            %(olv)s = (PyArrayObject*)PyArray_EMPTY(%(nd)s, dims,
+                                                    type_num_%(olv)s,
+                                                    %(fortran)s);
        }
        else {
            PyArray_Dims new_dims;