Commit 413f34ac, authored by lamblin

Merge pull request #1389 from nouiz/sigmoid

Sigmoid
......@@ -2,6 +2,7 @@
TODO: implement Images2Neibs.infer_shape() methods
"""
import theano
from theano import Op, Apply
import theano.tensor as T
from theano.gradient import grad_not_implemented
......@@ -111,6 +112,9 @@ class Images2Neibs(Op):
def perform(self, node, inp, out_):
ten4, neib_shape, neib_step = inp
z, = out_
# GpuImages2Neibs should not run this perform in DebugMode
if type(self) != Images2Neibs:
raise theano.gof.utils.MethodNotDefined()
def CEIL_INTDIV(a, b):
if a % b:
......
......@@ -845,10 +845,48 @@ class ScalarOp(Op):
def c_code_cache_version(self):
    """Version tag used to invalidate previously compiled C code."""
    version = (4,)
    return version
def c_code_contiguous(self, node, name, inp, out, sub):
    """Return C code specialized for fully contiguous arrays.

    Elemwise calls this hook when all inputs and outputs are
    c_contiguous, which makes it possible to emit a SIMD version
    of this op.

    The arguments are the same as for ``c_code`` except:
    - ``inp`` and ``out`` must be the C variable names of the
      ndarrays themselves (not of scalar elements);
    - ``node`` must be the Elemwise node, which is needed to know
      the input/output types.

    The base implementation is deliberately unimplemented; callers
    catch ``MethodNotDefined`` and fall back to the generic loop.
    """
    raise theano.gof.utils.MethodNotDefined()
class UnaryScalarOp(ScalarOp):
    """Base class for scalar ops that take exactly one input.

    Subclasses may set ``amd_float32``/``amd_float64`` to the name of
    an amdlibm vector function; when set, ``c_code_contiguous`` emits
    a single SIMD call over the whole (contiguous) array instead of
    the generic elementwise loop.
    """
    nin = 1
    # Names of the amdlibm SIMD functions for each dtype, or None when
    # no vectorized implementation exists for this op.
    amd_float32 = None
    amd_float64 = None

    def c_code_contiguous(self, node, name, inp, out, sub):
        """Return C code applying the amdlibm SIMD function to the array.

        Raises ``MethodNotDefined`` when amdlibm is disabled, when the
        input and output types differ (this fast path cannot broadcast),
        or when no SIMD function is registered for the dtype, so the
        caller falls back to the generic elementwise loop.
        """
        # PEP 3113: unpack the argument sequences explicitly instead of
        # using Python-2-only tuple parameters; call sites are unchanged.
        x, = inp
        z, = out
        if (not theano.config.lib.amdlibm or
                # We compare the dtype AND the broadcast flag,
                # as this function does not broadcast.
                node.inputs[0].type != node.outputs[0].type):
            raise theano.gof.utils.MethodNotDefined()
        dtype = node.inputs[0].dtype
        if dtype == 'float32' and self.amd_float32 is not None:
            dtype = 'float'
            fct = self.amd_float32
        elif dtype == 'float64' and self.amd_float64 is not None:
            dtype = 'double'
            fct = self.amd_float64
        else:
            raise theano.gof.utils.MethodNotDefined()
        return """
        npy_intp n = PyArray_SIZE(%(z)s);
        %(dtype)s * x = (%(dtype)s*) PyArray_DATA(%(x)s);
        %(dtype)s * z = (%(dtype)s*) PyArray_DATA(%(z)s);
        %(fct)s(n, x, z);
        """ % locals()
class BinaryScalarOp(ScalarOp):
# One may define in subclasses the following fields:
......@@ -1615,6 +1653,44 @@ class Pow(BinaryScalarOp):
return (first_part, second_part)
def c_code_contiguous(self, node, name, inp, out, sub):
    """Return SIMD C code for elementwise power via amdlibm.

    Two fast paths are emitted, both float32-only (amdlibm 3.0 has
    no float64 version of these SIMD functions):
    - ``amd_vrsa_powf``: both inputs are full arrays of the output type;
    - ``amd_vrsa_powxf``: the exponent is a broadcasted scalar.

    Raises ``MethodNotDefined`` when amdlibm is disabled or neither
    pattern matches, so the caller falls back to the generic loop.
    """
    # PEP 3113: explicit unpacking instead of Python-2 tuple parameters.
    x, y = inp
    z, = out
    if not theano.config.lib.amdlibm:
        raise theano.gof.utils.MethodNotDefined()
    # We compare the dtype AND the broadcast flag,
    # as this function does not broadcast.
    if (node.inputs[0].type == node.outputs[0].type and
            node.inputs[1].type == node.outputs[0].type and
            # amdlibm 3.0 does not have a float64 version of this
            # SIMD function.
            node.inputs[0].dtype == 'float32'):
        dtype = 'float'
        fct = "amd_vrsa_powf"
        return """
        npy_intp n = PyArray_SIZE(%(z)s);
        %(dtype)s * x = (%(dtype)s*) PyArray_DATA(%(x)s);
        %(dtype)s * y = (%(dtype)s*) PyArray_DATA(%(y)s);
        %(dtype)s * z = (%(dtype)s*) PyArray_DATA(%(z)s);
        %(fct)s(n, x, y, z);
        """ % locals()
    # We compare the dtype and check that we broadcast a scalar.
    elif (node.inputs[0].type == node.outputs[0].type and
            node.inputs[1].dtype == node.outputs[0].dtype and
            all(node.inputs[1].broadcastable) and
            # amdlibm 3.0 does not have a float64 version of this
            # SIMD function.
            node.inputs[0].dtype == 'float32'):
        dtype = 'float'
        fct = "amd_vrsa_powxf"
        return """
        npy_intp n = PyArray_SIZE(%(z)s);
        %(dtype)s * x = (%(dtype)s*) PyArray_DATA(%(x)s);
        %(dtype)s * y = (%(dtype)s*) PyArray_DATA(%(y)s);
        %(dtype)s * z = (%(dtype)s*) PyArray_DATA(%(z)s);
        %(fct)s(n, x, *y, z);
        """ % locals()
    raise theano.gof.utils.MethodNotDefined()
pow = Pow(upcast_out, name='pow')
......@@ -2019,6 +2095,9 @@ inv = Inv(upgrade_to_float, name='inv')
class Log(UnaryScalarOp):
""" log base e """
amd_float32 = "amd_vrsa_logf"
amd_float64 = "amd_vrda_log"
def impl(self, x):
return numpy.log(x)
......@@ -2042,6 +2121,9 @@ log = Log(upgrade_to_float, name='log')
class Log2(UnaryScalarOp):
""" log base 2 """
amd_float32 = "amd_vrsa_log2f"
amd_float64 = "amd_vrda_log2"
def impl(self, x):
return numpy.log2(x)
......@@ -2062,6 +2144,9 @@ log2 = Log2(upgrade_to_float, name='log2')
class Log10(UnaryScalarOp):
""" log base 10 """
amd_float32 = "amd_vrsa_log10f"
amd_float64 = "amd_vrda_log10"
def impl(self, x):
return numpy.log10(x)
......@@ -2100,6 +2185,9 @@ log1p = Log1p(upgrade_to_float, name='log1p')
class Exp(UnaryScalarOp):
amd_float32 = "amd_vrsa_expf"
amd_float64 = "amd_vrda_exp"
def impl(self, x):
return numpy.exp(x)
......@@ -2231,6 +2319,9 @@ rad2deg = Rad2Deg(upgrade_to_float, name='rad2deg')
class Cos(UnaryScalarOp):
amd_float32 = "amd_vrsa_cosf"
amd_float64 = "amd_vrda_cos"
def impl(self, x):
return numpy.cos(x)
......@@ -2269,6 +2360,9 @@ arccos = ArcCos(upgrade_to_float, name='arccos')
class Sin(UnaryScalarOp):
amd_float32 = "amd_vrsa_sinf"
amd_float64 = "amd_vrda_sin"
def impl(self, x):
return numpy.sin(x)
......
......@@ -1529,7 +1529,18 @@ class _tensor_py_operators:
if self._is_nonzero:
return True
else:
raise TypeError("Variable does not support boolean operations.")
raise TypeError(
"Variable does not support boolean operations. This"
"can happen if you do logical operator (<, <=, >, <=,"
"==, !=) between numpy.ndarray and theano tensor"
"variable. Due NumPy implementation before NumPy 1.8,"
"we can't make the python syntax work when the ndarray"
"is on the left, and this end with this error. To work"
"around that, just call"
"theano.tensor.{lt,le,eq,ne,gt,ge}(ndarray, tensor) or"
"use the python syntax with the theano tensor on the"
"left. Or update to NumPy 1.8 or above."
)
# BITWISE
def __invert__(self):
......
......@@ -1079,13 +1079,49 @@ class Elemwise(Op):
%(undefs)s
}
""" % locals()
if all([o.ndim <= 1 for o in node.outputs]):
if nnested:
all_code = [("", "")] * (nnested - 1) + [("", code)] + [""]
else:
all_code = [code]
loop = cgen.make_reordered_loop(
loop = cgen.make_loop(
loop_orders=orders + [range(nnested)] * len(real_onames),
dtypes=(idtypes + list(real_odtypes)),
loop_tasks=all_code,
sub=sub)
else:
loop = cgen.make_reordered_loop(
init_loop_orders=orders + [range(nnested)] * len(real_onames),
olv_index=olv_index,
dtypes=(idtypes + list(real_odtypes)),
inner_task=code,
sub=sub)
# If all inputs and outputs are contiguous
# and the scalar op define optimized code for that case
# use it!
if all([o.ndim >= 1 for o in node.outputs]):
try:
contig = self.scalar_op.c_code_contiguous(
node,
nodename + '_scalar_contig_',
_inames,
onames,
sub)
# PyArray_ISONESEGMENT(arr)
# return true if arr is fortran or c contiguous.
cond = ' && '.join(["PyArray_ISONESEGMENT(%s)" % arr
for arr in _inames + onames])
loop = """
if(%(cond)s){
%(contig)s
}else{
%(loop)s
}
""" % locals()
except theano.gof.utils.MethodNotDefined:
pass
return decl, checks, alloc, loop
def c_code(self, node, nodename, inames, onames, sub):
......@@ -1104,15 +1140,15 @@ class Elemwise(Op):
return support_code
def c_code_cache_version_apply(self, node):
version = [6] # the version corresponding to the c code in this Op
version = [8] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend...
scalar_node = Apply(self.scalar_op,
[Scalar(dtype=input.type.dtype)() for input in node.inputs],
[Scalar(dtype=output.type.dtype)() for output in node.outputs])
version.extend(self.scalar_op.c_code_cache_version_apply(scalar_node))
version.append(self.scalar_op.c_code_cache_version_apply(scalar_node))
for i in node.inputs + node.outputs:
version.extend(Scalar(dtype=i.type.dtype).c_code_cache_version())
version.append(Scalar(dtype=i.type.dtype).c_code_cache_version())
if all(version):
return tuple(version)
else:
......@@ -1525,9 +1561,9 @@ for(int i=0;i<PyArray_NDIM(%(iname)s);i++){
scalar_node = Apply(self.scalar_op,
[Scalar(dtype=input.type.dtype)() for input in node.inputs],
[Scalar(dtype=output.type.dtype)() for output in node.outputs])
version.extend(self.scalar_op.c_code_cache_version_apply(scalar_node))
version.append(self.scalar_op.c_code_cache_version_apply(scalar_node))
for i in node.inputs + node.outputs:
version.extend(Scalar(dtype=i.type.dtype).c_code_cache_version())
version.append(Scalar(dtype=i.type.dtype).c_code_cache_version())
if all(version):
return tuple(version)
else:
......
......@@ -23,6 +23,9 @@ from theano.tensor import elemwise, opt, NotScalarConstantError
#
class ScalarSigmoid(scalar.UnaryScalarOp):
"""
This is just speed opt. Not for stability.
"""
@staticmethod
def st_impl(x):
if x < -30.0:
......@@ -64,6 +67,44 @@ class ScalarSigmoid(scalar.UnaryScalarOp):
return (2,) + v
else:
return v
# This function is disabled as it is slower than the normal code!
def c_code_contiguous_disabled(self, node, name, inp, out, sub):
    """Disabled amdlibm-based contiguous C code for sigmoid.

    Kept for reference: it computes ``sigmoid(x) = 1 / (1 + exp(-x))``
    using the amdlibm vector ``exp``, blocking the work so that the
    data stays in the L1 cache.  Benchmarks showed it to be slower
    than the generic elementwise loop, hence the ``_disabled`` suffix.

    Raises ``MethodNotDefined`` when amdlibm is disabled, when the
    input/output dtypes differ, or when no amdlibm function is
    registered for the dtype.
    """
    x, = inp
    z, = out
    if (not theano.config.lib.amdlibm or
            node.inputs[0].dtype != node.outputs[0].dtype):
        raise theano.gof.utils.MethodNotDefined()
    dtype = node.inputs[0].dtype
    if dtype == 'float32' and self.amd_float32 is not None:
        dtype = 'float'
        fct = "amd_vrsa_expf"
    elif dtype == 'float64' and self.amd_float64 is not None:
        dtype = 'double'
        fct = "amd_vrda_exp"
    else:
        raise theano.gof.utils.MethodNotDefined()
    # NOTE: the original version had an unreachable ``raise`` after this
    # return; it has been removed as dead code.
    return """
    npy_intp n = PyArray_SIZE(%(z)s);
    %(dtype)s * x = (%(dtype)s*) PyArray_DATA(%(x)s);
    %(dtype)s * z = (%(dtype)s*) PyArray_DATA(%(z)s);
    // We block to keep the data in l1
    // normal l1 size = 32k: 32k/2(input + output)/8(nb bytes of double)=2k
    // We stay bellow the 2k limit to let space for
    // This is faster then the not blocking version
    for(int i=0;i<n;i+=2048){
        npy_intp nb = (n-i<2048)?n-i:2048;
        for(int j=0;j<nb;j++){
            z[i+j] = -x[i+j];
        }
        %(fct)s(nb, z+i, z+i);
        for(int j=0;j<nb;j++){
            z[i+j] = 1.0 /(1.0+z[i+j]);
        }
    }
    """ % locals()
# Instantiate the scalar op, then wrap it in an Elemwise so it can be
# applied elementwise to tensor variables.
scalar_sigmoid = ScalarSigmoid(scalar.upgrade_to_float, name='scalar_sigmoid')
sigmoid = elemwise.Elemwise(scalar_sigmoid, name='sigmoid')
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论