Add ScalarOp.c_code_contiguous interface and use it with exp.

This is done for amdlibm. There is also code for sigmoid, but it is disabled as it is slower than the normal code.

Add ScalarOp.c_code_contiguous interface and use it with exp.
68aad6d2 · Frederic · d1dc2948 · 68aad6d2 · 68aad6d2 · 68aad6d2
--- a/theano/scalar/basic.py
+++ b/theano/scalar/basic.py
@@ -845,10 +845,44 @@ class ScalarOp(Op):
    def c_code_cache_version(self):
        return (4,)

+    def c_code_contiguous(self, node, name, inp, out, sub):
+        """This function is called by Elemwise when all inputs and
+        outputs are c_contiguous. This allows using a SIMD version
+        of this op.
+
+        The inputs are the same as c_code EXCEPT that inp and out MUST
+        be the variable name of the ndarray, not the current element.
+
+        """
+        raise theano.gof.utils.MethodNotDefined()
+

 class UnaryScalarOp(ScalarOp):
    nin = 1
-
+    amd_float32 = None
+    amd_float64 = None
+
+    def c_code_contiguous(self, node, name, (x, ), (z, ), sub):
+        if (not theano.config.lib.amdlibm or
+            # We compare the dtype AND the broadcast flag
+            # as this function does not broadcast
+            node.inputs[0].type != node.outputs[0].type):
+            raise theano.gof.utils.MethodNotDefined()
+
+        if node.inputs[0].type == float32 and self.amd_float32 is not None:
+            dtype = 'float'
+            fct = self.amd_float32
+        elif node.inputs[0].type == float64 and self.amd_float64 is not None:
+            dtype = 'double'
+            fct = self.amd_float64
+        else:
+            raise theano.gof.utils.MethodNotDefined()
+        return """
+        npy_intp n = PyArray_SIZE(%(z)s);
+        %(dtype)s * x = (%(dtype)s*) PyArray_DATA(%(x)s);
+        %(dtype)s * z = (%(dtype)s*) PyArray_DATA(%(z)s);
+        %(fct)s(n, x, z);
+        """ % locals()

 class BinaryScalarOp(ScalarOp):
    # One may define in subclasses the following fields:
@@ -2100,6 +2134,9 @@ log1p = Log1p(upgrade_to_float, name='log1p')


 class Exp(UnaryScalarOp):
+    amd_float32 = "amd_vrsa_expf"
+    amd_float64 = "amd_vrda_exp"
+
    def impl(self, x):
        return numpy.exp(x)


--- a/theano/tensor/elemwise.py
+++ b/theano/tensor/elemwise.py
@@ -1097,6 +1097,31 @@ class Elemwise(Op):
                dtypes=(idtypes + list(real_odtypes)),
                inner_task=code,
                sub=sub)
+
+        # If all inputs and outputs are contiguous
+        # and the scalar op defines optimized code for that case
+        # use it!
+        if all([o.ndim >= 1 for o in node.outputs]):
+            try:
+                contig = self.scalar_op.c_code_contiguous(
+                    node,
+                    nodename + '_scalar_contig_',
+                    _inames,
+                    onames,
+                    sub)
+                # PyArray_ISONESEGMENT(arr)
+                #   returns true if arr is Fortran- or C-contiguous.
+                cond = ' && '.join(["PyArray_ISONESEGMENT(%s)" % arr
+                                    for arr in _inames + onames])
+                loop = """
+            if(%(cond)s){
+                %(contig)s
+            }else{
+                %(loop)s
+            }
+            """ % locals()
+            except theano.gof.utils.MethodNotDefined:
+                pass
        return decl, checks, alloc, loop

    def c_code(self, node, nodename, inames, onames, sub):
@@ -1115,7 +1140,7 @@ class Elemwise(Op):
        return support_code

    def c_code_cache_version_apply(self, node):
-        version = [7]  # the version corresponding to the c code in this Op
+        version = [8]  # the version corresponding to the c code in this Op

        # now we insert versions for the ops on which we depend...
        scalar_node = Apply(self.scalar_op,

--- a/theano/tensor/nnet/sigm.py
+++ b/theano/tensor/nnet/sigm.py
@@ -23,6 +23,9 @@ from theano.tensor import elemwise, opt, NotScalarConstantError
 #

 class ScalarSigmoid(scalar.UnaryScalarOp):
+    """
+    This is only a speed optimization; it is not meant for stability.
+    """
    @staticmethod
    def st_impl(x):
        if x < -30.0:
@@ -64,6 +67,44 @@ class ScalarSigmoid(scalar.UnaryScalarOp):
            return (2,) + v
        else:
            return v
+
+    # This function is disabled as it is slower than the normal code!
+    def c_code_contiguous_disabled(self, node, name, inp, out, sub):
+        x, = inp
+        z, = out
+        if (not theano.config.lib.amdlibm or
+            node.inputs[0].dtype != node.outputs[0].dtype):
+            raise theano.gof.utils.MethodNotDefined()
+        dtype = node.inputs[0].dtype
+        if dtype == 'float32' and self.amd_float32 is not None:
+            dtype = 'float'
+            fct = "amd_vrsa_expf"
+        elif dtype == 'float64' and self.amd_float64 is not None:
+            dtype = 'double'
+            fct = "amd_vrda_exp"
+        else:
+            raise theano.gof.utils.MethodNotDefined()
+        return """
+        npy_intp n = PyArray_SIZE(%(z)s);
+        %(dtype)s * x = (%(dtype)s*) PyArray_DATA(%(x)s);
+        %(dtype)s * z = (%(dtype)s*) PyArray_DATA(%(z)s);
+        // We block to keep the data in l1
+        // normal l1 size = 32k: 32k/2(input + output)/8(nb bytes of double)=2k
+        // We stay bellow the 2k limit to let space for
+        // This is faster then the not blocking version
+        for(int i=0;i<n;i+=2048){
+            npy_intp nb = (n-i<2048)?n-i:2048;
+            for(int j=0;j<nb;j++){
+                z[i+j] = -x[i+j];
+            }
+            %(fct)s(nb, z+i, z+i);
+            for(int j=0;j<nb;j++){
+                z[i+j] = 1.0 /(1.0+z[i+j]);
+            }
+        }
+        """ % locals()
+        raise theano.gof.utils.MethodNotDefined()
+
 scalar_sigmoid = ScalarSigmoid(scalar.upgrade_to_float, name='scalar_sigmoid')
 sigmoid = elemwise.Elemwise(scalar_sigmoid, name='sigmoid')