Merge pull request #1393 from nouiz/fcontig

Elemwise faster when c or f contig inputs

Merge pull request #1393 from nouiz/fcontig
dcb5e098 · lamblin · 0ff016bc · 65bdde40 · dcb5e098 · dcb5e098
--- a/doc/developer/index.txt
+++ b/doc/developer/index.txt
@@ -11,3 +11,4 @@ Theano Design and Implementation Documentation
   tensor
   scan
+   compat
--- a/doc/library/tensor/nnet/nnet.txt
+++ b/doc/library/tensor/nnet/nnet.txt
@@ -18,8 +18,8 @@
    :note: see :func:`ultra_fast_sigmoid` or :func:`hard_sigmoid` for faster version.
        Speed comparison for 100M float64 element on a Core2 Duo @ 3.16 GHz.
-          - hard_sigmoid: 1.1s
+          - hard_sigmoid: 1.0s
-          - ultra_fast_sigmoid: 1.4s
+          - ultra_fast_sigmoid: 1.3s
          - sigmoid (with amdlibm): 2.3s
          - sigmoid (without amdlibm): 3.7s

--- a/theano/scalar/basic.py
+++ b/theano/scalar/basic.py
@@ -1662,7 +1662,8 @@ class Pow(BinaryScalarOp):
        if (node.inputs[0].type == node.outputs[0].type and
            node.inputs[1].type == node.outputs[0].type and
            # amdlibm 3.0 do not have a float64 version of this SIMD function
-            node.inputs[0].dtype == 'float32'):
+            node.inputs[0].dtype == 'float32' and
+            node.inputs[1].dtype == 'float32'):
            dtype = 'float'
            fct = "amd_vrsa_powf"
            return """
@@ -1674,10 +1675,11 @@ class Pow(BinaryScalarOp):
        """ % locals()
        # We compare the dtype and check we broadcast a scalar
        elif (node.inputs[0].type == node.outputs[0].type and
-            node.inputs[1].dtype == node.outputs[0].dtype and
+              node.inputs[1].dtype == node.outputs[0].dtype and
-            all(node.inputs[1].broadcastable) and
+              all(node.inputs[1].broadcastable) and
-            # amdlibm 3.0 do not have a float64 version of this SIMD function
+              # amdlibm 3.0 do not have a float64 version of this SIMD function
-            node.inputs[0].dtype == 'float32'):
+              node.inputs[0].dtype == 'float32' and
+              node.inputs[1].dtype == 'float32'):
            dtype = 'float'
            fct = "amd_vrsa_powxf"
            return """

--- a/theano/tensor/elemwise.py
+++ b/theano/tensor/elemwise.py
@@ -1011,6 +1011,17 @@ class Elemwise(Op):
        decl = cgen.make_declare(orders, idtypes, sub)
        checks = cgen.make_checks(orders, idtypes, sub)
+        # Check if all inputs (except broadcasted scalar) are fortran.
+        # In that case, create an fortran output ndarray.
+        z = zip(inames, inputs)
+        alloc_fortran = ' && '.join(["PyArray_ISFORTRAN(%s)" % arr
+                                     for arr, var in z
+                                     if not all(var.broadcastable)])
+        # If it is a scalar, make it c contig to prevent problem with
+        # NumPy C and F contig not always set as both of them.
+        if len(alloc_fortran) == 0:
+            alloc_fortran = '0'
        alloc = ""
        # We loop over the "real" outputs, i.e., those that are not
        # inplace (must be allocated) and we declare/allocate/check
@@ -1022,7 +1033,8 @@ class Elemwise(Op):
            sub['olv'] = oname
            alloc += cgen.make_declare([range(nnested)], [odtype],
                                       dict(sub, lv0=oname))
-            alloc += cgen.make_alloc(orders, odtype, sub)
+            alloc += cgen.make_alloc(orders, odtype, sub,
+                                     fortran=alloc_fortran)
            alloc += cgen.make_checks([range(nnested)], [odtype],
                                      dict(sub, lv0=oname))
        olv_index = i  # index of the last output
@@ -1079,7 +1091,10 @@ class Elemwise(Op):
            %(undefs)s
        }
        """ % locals()
-        if all([o.ndim <= 1 for o in node.outputs]):
+        if all([o.ndim <= 1 for o in node.outputs] or
+               # Use simpler code when output ndim == 0 or 1
+               # or for broadcated scalar.
+               all(node.outputs[0].broadcastable)):
            if nnested:
                all_code = [("", "")] * (nnested - 1) + [("", code)] + [""]
            else:
@@ -1100,8 +1115,11 @@ class Elemwise(Op):
        # If all inputs and outputs are contiguous
        # and the scalar op define optimized code for that case
-        # use it!
+        # use it! The scalar_op need to check the broadcast flag himself.
-        if all([o.ndim >= 1 for o in node.outputs]):
+        if (all([o.ndim >= 1 for o in node.outputs]) and
+            # Don't use the contig code for broadcasted scalar.
+            not all(node.outputs[0].broadcastable)):
+            contig = None
            try:
                contig = self.scalar_op.c_code_contiguous(
                    node,
@@ -1109,19 +1127,54 @@ class Elemwise(Op):
                    _inames,
                    onames,
                    sub)
-                # PyArray_ISONESEGMENT(arr)
+            except theano.gof.utils.MethodNotDefined:
-                #   return true if arr is fortran or c contiguous.
+                # Try to make one generic version, this will help the
-                cond = ' && '.join(["PyArray_ISONESEGMENT(%s)" % arr
+                # compiler to vectorize the code as their won't be as
-                                    for arr in _inames + onames])
+                # many ptr and the stride will be hard coded.
+                if all([io.broadcastable == node.outputs[0].broadcastable or
+                        all(io.broadcastable)
+                        for io in node.inputs + node.outputs]):
+                    z = onames[0]
+                    contig = """
+                    // All output have the same size
+                    npy_intp n = PyArray_SIZE(%(z)s);
+                    """ % locals()
+                    index = ""
+                    for x, var in zip(inames + onames,
+                                      inputs + node.outputs):
+                        if not all(var.broadcastable):
+                            contig += """
+            dtype_%(x)s * %(x)s_ptr = (dtype_%(x)s*) PyArray_DATA(%(x)s);
+                            """ % locals()
+                            index += """
+            dtype_%(x)s& %(x)s_i = %(x)s_ptr[i];
+                            """ % locals()
+                        else:
+                            contig += """
+            dtype_%(x)s& %(x)s_i = ((dtype_%(x)s*) PyArray_DATA(%(x)s))[0];
+                            """ % locals()
+                    contig += """
+                    for(int i=0; i<n; i++){
+                        %(index)s
+                        %(task_code)s;
+                    }
+                    """ % locals()
+            if contig is not None:
+                z = zip(inames + onames, inputs + node.outputs)
+                cond1 = ' && '.join(["PyArray_ISCONTIGUOUS(%s)" % arr
+                                    for arr, var in z
+                                    if not all(var.broadcastable)])
+                cond2 = ' && '.join(["PyArray_ISFORTRAN(%s)" % arr
+                                    for arr, var in z
+                                    if not all(var.broadcastable)])
                loop = """
-            if(%(cond)s){
+            if((%(cond1)s) || (%(cond2)s)){
                %(contig)s
            }else{
                %(loop)s
            }
            """ % locals()
-            except theano.gof.utils.MethodNotDefined:
-                pass
        return decl, checks, alloc, loop
    def c_code(self, node, nodename, inames, onames, sub):
@@ -1140,7 +1193,7 @@ class Elemwise(Op):
        return support_code
    def c_code_cache_version_apply(self, node):
-        version = [8]  # the version corresponding to the c code in this Op
+        version = [11]  # the version corresponding to the c code in this Op
        # now we insert versions for the ops on which we depend...
        scalar_node = Apply(self.scalar_op,

--- a/theano/tensor/elemwise_cgen.py
+++ b/theano/tensor/elemwise_cgen.py
@@ -113,9 +113,13 @@ def make_checks(loop_orders, dtypes, sub):
    return init % sub + check % sub
-def make_alloc(loop_orders, dtype, sub):
+def make_alloc(loop_orders, dtype, sub, fortran='0'):
-    """
+    """Generate C code to allocate outputs.
-    Generate C code to allocate outputs.
+    :param fortran: a string included in the generated code. If it
+        evaludate to non-zero, an ndarray in fortran order will be
+        created, otherwise it will be c order.
    """
    nd = len(loop_orders[0])
@@ -133,7 +137,6 @@ def make_alloc(loop_orders, dtype, sub):
                break
        else:
            init_dims += "dims[%(i)s] = 1;\n" % locals()
-            #raise Exception("For each looping dimension, at least one input must have a non-broadcastable dimension.")
    # TODO: it would be interesting to allocate the output in such a
    # way that its contiguous dimensions match one of the input's
@@ -146,7 +149,9 @@ def make_alloc(loop_orders, dtype, sub):
        //npy_intp* dims = (npy_intp*)malloc(%(nd)s * sizeof(npy_intp));
        %(init_dims)s
        if (!%(olv)s) {
-            %(olv)s = (PyArrayObject*)PyArray_EMPTY(%(nd)s, dims, type_num_%(olv)s, 0);
+            %(olv)s = (PyArrayObject*)PyArray_EMPTY(%(nd)s, dims,
+                                                    type_num_%(olv)s,
+                                                    %(fortran)s);
        }
        else {
            PyArray_Dims new_dims;