提交 413f34ac authored 作者: lamblin

Merge pull request #1389 from nouiz/sigmoid

Sigmoid
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
TODO: implement Images2Neibs.infer_shape() methods TODO: implement Images2Neibs.infer_shape() methods
""" """
import theano
from theano import Op, Apply from theano import Op, Apply
import theano.tensor as T import theano.tensor as T
from theano.gradient import grad_not_implemented from theano.gradient import grad_not_implemented
...@@ -111,6 +112,9 @@ class Images2Neibs(Op): ...@@ -111,6 +112,9 @@ class Images2Neibs(Op):
def perform(self, node, inp, out_): def perform(self, node, inp, out_):
ten4, neib_shape, neib_step = inp ten4, neib_shape, neib_step = inp
z, = out_ z, = out_
# GpuImages2Neibs should not run this perform in DebugMode
if type(self) != Images2Neibs:
raise theano.gof.utils.MethodNotDefined()
def CEIL_INTDIV(a, b): def CEIL_INTDIV(a, b):
if a % b: if a % b:
......
...@@ -845,10 +845,48 @@ class ScalarOp(Op): ...@@ -845,10 +845,48 @@ class ScalarOp(Op):
def c_code_cache_version(self): def c_code_cache_version(self):
return (4,) return (4,)
def c_code_contiguous(self, node, name, inp, out, sub):
    """Return C code specialized for all-c_contiguous inputs/outputs.

    Elemwise calls this when every input and output array is
    c_contiguous, which makes it possible to emit a SIMD version
    of the op.

    Arguments are the same as for ``c_code`` except:
      - ``inp`` and ``out`` are the variable names of the ndarrays;
      - ``node`` is the Elemwise node, needed to inspect the
        input/output types.

    The base implementation provides no specialization and raises
    ``MethodNotDefined`` so callers fall back to the generic loop.
    """
    raise theano.gof.utils.MethodNotDefined()
class UnaryScalarOp(ScalarOp): class UnaryScalarOp(ScalarOp):
nin = 1 nin = 1
amd_float32 = None
amd_float64 = None
def c_code_contiguous(self, node, name, inp, out, sub):
    """SIMD (amdlibm) C code for a contiguous unary elemwise op.

    Emits a single call to the op's amdlibm vector function
    (``self.amd_float32`` / ``self.amd_float64``) over the whole
    flat array.

    Raises ``MethodNotDefined`` when amdlibm is disabled, when the
    input/output types differ (this code does not broadcast), or
    when no amdlibm function is declared for the dtype, so that
    Elemwise falls back to the generic loop.
    """
    # Unpack in the body rather than the signature: tuple
    # parameters were removed in Python 3 (PEP 3113).
    x, = inp
    z, = out
    if (not theano.config.lib.amdlibm or
            # We compare the dtype AND the broadcast flag
            # as this function does not broadcast.
            node.inputs[0].type != node.outputs[0].type):
        raise theano.gof.utils.MethodNotDefined()
    dtype = node.inputs[0].dtype
    if dtype == 'float32' and self.amd_float32 is not None:
        dtype = 'float'
        fct = self.amd_float32
    elif dtype == 'float64' and self.amd_float64 is not None:
        dtype = 'double'
        fct = self.amd_float64
    else:
        raise theano.gof.utils.MethodNotDefined()
    return """
    npy_intp n = PyArray_SIZE(%(z)s);
    %(dtype)s * x = (%(dtype)s*) PyArray_DATA(%(x)s);
    %(dtype)s * z = (%(dtype)s*) PyArray_DATA(%(z)s);
    %(fct)s(n, x, z);
    """ % locals()
class BinaryScalarOp(ScalarOp): class BinaryScalarOp(ScalarOp):
# One may define in subclasses the following fields: # One may define in subclasses the following fields:
...@@ -1615,6 +1653,44 @@ class Pow(BinaryScalarOp): ...@@ -1615,6 +1653,44 @@ class Pow(BinaryScalarOp):
return (first_part, second_part) return (first_part, second_part)
def c_code_contiguous(self, node, name, inp, out, sub):
    """SIMD (amdlibm) C code for elementwise pow on contiguous data.

    Two specializations are emitted:
      * ``x ** y`` with x, y, z all float32 and identical broadcast
        patterns -> ``amd_vrsa_powf`` (vector ** vector);
      * ``x ** s`` where s is a fully-broadcastable float32 scalar
        -> ``amd_vrsa_powxf`` (vector ** scalar, hence ``*y``).

    Raises ``MethodNotDefined`` when amdlibm is disabled or no
    specialization applies, so Elemwise uses the generic loop.
    """
    # Unpack in the body rather than the signature: tuple
    # parameters were removed in Python 3 (PEP 3113).
    x, y = inp
    z, = out
    if not theano.config.lib.amdlibm:
        raise theano.gof.utils.MethodNotDefined()

    # We compare the dtype AND the broadcast flag
    # as this function does not broadcast.
    if (node.inputs[0].type == node.outputs[0].type and
            node.inputs[1].type == node.outputs[0].type and
            # amdlibm 3.0 does not have a float64 version of this
            # SIMD function.
            node.inputs[0].dtype == 'float32'):
        dtype = 'float'
        fct = "amd_vrsa_powf"
        return """
        npy_intp n = PyArray_SIZE(%(z)s);
        %(dtype)s * x = (%(dtype)s*) PyArray_DATA(%(x)s);
        %(dtype)s * y = (%(dtype)s*) PyArray_DATA(%(y)s);
        %(dtype)s * z = (%(dtype)s*) PyArray_DATA(%(z)s);
        %(fct)s(n, x, y, z);
        """ % locals()
    # We compare the dtype and check that we broadcast a scalar.
    elif (node.inputs[0].type == node.outputs[0].type and
            node.inputs[1].dtype == node.outputs[0].dtype and
            all(node.inputs[1].broadcastable) and
            # amdlibm 3.0 does not have a float64 version of this
            # SIMD function.
            node.inputs[0].dtype == 'float32'):
        dtype = 'float'
        fct = "amd_vrsa_powxf"
        return """
        npy_intp n = PyArray_SIZE(%(z)s);
        %(dtype)s * x = (%(dtype)s*) PyArray_DATA(%(x)s);
        %(dtype)s * y = (%(dtype)s*) PyArray_DATA(%(y)s);
        %(dtype)s * z = (%(dtype)s*) PyArray_DATA(%(z)s);
        %(fct)s(n, x, *y, z);
        """ % locals()
    raise theano.gof.utils.MethodNotDefined()
pow = Pow(upcast_out, name='pow') pow = Pow(upcast_out, name='pow')
...@@ -2019,6 +2095,9 @@ inv = Inv(upgrade_to_float, name='inv') ...@@ -2019,6 +2095,9 @@ inv = Inv(upgrade_to_float, name='inv')
class Log(UnaryScalarOp): class Log(UnaryScalarOp):
""" log base e """ """ log base e """
amd_float32 = "amd_vrsa_logf"
amd_float64 = "amd_vrda_log"
def impl(self, x): def impl(self, x):
return numpy.log(x) return numpy.log(x)
...@@ -2042,6 +2121,9 @@ log = Log(upgrade_to_float, name='log') ...@@ -2042,6 +2121,9 @@ log = Log(upgrade_to_float, name='log')
class Log2(UnaryScalarOp): class Log2(UnaryScalarOp):
""" log base 2 """ """ log base 2 """
amd_float32 = "amd_vrsa_log2f"
amd_float64 = "amd_vrda_log2"
def impl(self, x): def impl(self, x):
return numpy.log2(x) return numpy.log2(x)
...@@ -2062,6 +2144,9 @@ log2 = Log2(upgrade_to_float, name='log2') ...@@ -2062,6 +2144,9 @@ log2 = Log2(upgrade_to_float, name='log2')
class Log10(UnaryScalarOp): class Log10(UnaryScalarOp):
""" log base 10 """ """ log base 10 """
amd_float32 = "amd_vrsa_log10f"
amd_float64 = "amd_vrda_log10"
def impl(self, x): def impl(self, x):
return numpy.log10(x) return numpy.log10(x)
...@@ -2100,6 +2185,9 @@ log1p = Log1p(upgrade_to_float, name='log1p') ...@@ -2100,6 +2185,9 @@ log1p = Log1p(upgrade_to_float, name='log1p')
class Exp(UnaryScalarOp): class Exp(UnaryScalarOp):
amd_float32 = "amd_vrsa_expf"
amd_float64 = "amd_vrda_exp"
def impl(self, x): def impl(self, x):
return numpy.exp(x) return numpy.exp(x)
...@@ -2231,6 +2319,9 @@ rad2deg = Rad2Deg(upgrade_to_float, name='rad2deg') ...@@ -2231,6 +2319,9 @@ rad2deg = Rad2Deg(upgrade_to_float, name='rad2deg')
class Cos(UnaryScalarOp): class Cos(UnaryScalarOp):
amd_float32 = "amd_vrsa_cosf"
amd_float64 = "amd_vrda_cos"
def impl(self, x): def impl(self, x):
return numpy.cos(x) return numpy.cos(x)
...@@ -2269,6 +2360,9 @@ arccos = ArcCos(upgrade_to_float, name='arccos') ...@@ -2269,6 +2360,9 @@ arccos = ArcCos(upgrade_to_float, name='arccos')
class Sin(UnaryScalarOp): class Sin(UnaryScalarOp):
amd_float32 = "amd_vrsa_sinf"
amd_float64 = "amd_vrda_sin"
def impl(self, x): def impl(self, x):
return numpy.sin(x) return numpy.sin(x)
......
...@@ -1529,7 +1529,18 @@ class _tensor_py_operators: ...@@ -1529,7 +1529,18 @@ class _tensor_py_operators:
if self._is_nonzero: if self._is_nonzero:
return True return True
else: else:
raise TypeError("Variable does not support boolean operations.") raise TypeError(
"Variable does not support boolean operations. This"
"can happen if you do logical operator (<, <=, >, <=,"
"==, !=) between numpy.ndarray and theano tensor"
"variable. Due NumPy implementation before NumPy 1.8,"
"we can't make the python syntax work when the ndarray"
"is on the left, and this end with this error. To work"
"around that, just call"
"theano.tensor.{lt,le,eq,ne,gt,ge}(ndarray, tensor) or"
"use the python syntax with the theano tensor on the"
"left. Or update to NumPy 1.8 or above."
)
# BITWISE # BITWISE
def __invert__(self): def __invert__(self):
......
...@@ -1079,13 +1079,49 @@ class Elemwise(Op): ...@@ -1079,13 +1079,49 @@ class Elemwise(Op):
%(undefs)s %(undefs)s
} }
""" % locals() """ % locals()
if all([o.ndim <= 1 for o in node.outputs]):
if nnested:
all_code = [("", "")] * (nnested - 1) + [("", code)] + [""]
else:
all_code = [code]
loop = cgen.make_reordered_loop( loop = cgen.make_loop(
loop_orders=orders + [range(nnested)] * len(real_onames),
dtypes=(idtypes + list(real_odtypes)),
loop_tasks=all_code,
sub=sub)
else:
loop = cgen.make_reordered_loop(
init_loop_orders=orders + [range(nnested)] * len(real_onames), init_loop_orders=orders + [range(nnested)] * len(real_onames),
olv_index=olv_index, olv_index=olv_index,
dtypes=(idtypes + list(real_odtypes)), dtypes=(idtypes + list(real_odtypes)),
inner_task=code, inner_task=code,
sub=sub) sub=sub)
# If all inputs and outputs are contiguous
# and the scalar op define optimized code for that case
# use it!
if all([o.ndim >= 1 for o in node.outputs]):
try:
contig = self.scalar_op.c_code_contiguous(
node,
nodename + '_scalar_contig_',
_inames,
onames,
sub)
# PyArray_ISONESEGMENT(arr)
# return true if arr is fortran or c contiguous.
cond = ' && '.join(["PyArray_ISONESEGMENT(%s)" % arr
for arr in _inames + onames])
loop = """
if(%(cond)s){
%(contig)s
}else{
%(loop)s
}
""" % locals()
except theano.gof.utils.MethodNotDefined:
pass
return decl, checks, alloc, loop return decl, checks, alloc, loop
def c_code(self, node, nodename, inames, onames, sub): def c_code(self, node, nodename, inames, onames, sub):
...@@ -1104,15 +1140,15 @@ class Elemwise(Op): ...@@ -1104,15 +1140,15 @@ class Elemwise(Op):
return support_code return support_code
def c_code_cache_version_apply(self, node): def c_code_cache_version_apply(self, node):
version = [6] # the version corresponding to the c code in this Op version = [8] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend... # now we insert versions for the ops on which we depend...
scalar_node = Apply(self.scalar_op, scalar_node = Apply(self.scalar_op,
[Scalar(dtype=input.type.dtype)() for input in node.inputs], [Scalar(dtype=input.type.dtype)() for input in node.inputs],
[Scalar(dtype=output.type.dtype)() for output in node.outputs]) [Scalar(dtype=output.type.dtype)() for output in node.outputs])
version.extend(self.scalar_op.c_code_cache_version_apply(scalar_node)) version.append(self.scalar_op.c_code_cache_version_apply(scalar_node))
for i in node.inputs + node.outputs: for i in node.inputs + node.outputs:
version.extend(Scalar(dtype=i.type.dtype).c_code_cache_version()) version.append(Scalar(dtype=i.type.dtype).c_code_cache_version())
if all(version): if all(version):
return tuple(version) return tuple(version)
else: else:
...@@ -1525,9 +1561,9 @@ for(int i=0;i<PyArray_NDIM(%(iname)s);i++){ ...@@ -1525,9 +1561,9 @@ for(int i=0;i<PyArray_NDIM(%(iname)s);i++){
scalar_node = Apply(self.scalar_op, scalar_node = Apply(self.scalar_op,
[Scalar(dtype=input.type.dtype)() for input in node.inputs], [Scalar(dtype=input.type.dtype)() for input in node.inputs],
[Scalar(dtype=output.type.dtype)() for output in node.outputs]) [Scalar(dtype=output.type.dtype)() for output in node.outputs])
version.extend(self.scalar_op.c_code_cache_version_apply(scalar_node)) version.append(self.scalar_op.c_code_cache_version_apply(scalar_node))
for i in node.inputs + node.outputs: for i in node.inputs + node.outputs:
version.extend(Scalar(dtype=i.type.dtype).c_code_cache_version()) version.append(Scalar(dtype=i.type.dtype).c_code_cache_version())
if all(version): if all(version):
return tuple(version) return tuple(version)
else: else:
......
...@@ -23,6 +23,9 @@ from theano.tensor import elemwise, opt, NotScalarConstantError ...@@ -23,6 +23,9 @@ from theano.tensor import elemwise, opt, NotScalarConstantError
# #
class ScalarSigmoid(scalar.UnaryScalarOp): class ScalarSigmoid(scalar.UnaryScalarOp):
"""
This is just speed opt. Not for stability.
"""
@staticmethod @staticmethod
def st_impl(x): def st_impl(x):
if x < -30.0: if x < -30.0:
...@@ -64,6 +67,44 @@ class ScalarSigmoid(scalar.UnaryScalarOp): ...@@ -64,6 +67,44 @@ class ScalarSigmoid(scalar.UnaryScalarOp):
return (2,) + v return (2,) + v
else: else:
return v return v
# This fct is disabled as it is slower than the normal code!
def c_code_contiguous_disabled(self, node, name, inp, out, sub):
    """Disabled amdlibm-based contiguous implementation of sigmoid.

    Computes z = 1 / (1 + exp(-x)) in 2048-element blocks so the
    working set (input slice + output slice) stays within L1 cache
    while the vectorized exp (amd_vrsa_expf / amd_vrda_exp) runs.

    Kept under a ``_disabled`` suffix because it benchmarked slower
    than the generic Elemwise loop; rename to ``c_code_contiguous``
    to re-enable.

    Raises ``MethodNotDefined`` when amdlibm is disabled, dtypes
    differ, or no amdlibm exp function is declared for the dtype.
    """
    x, = inp
    z, = out
    if (not theano.config.lib.amdlibm or
            node.inputs[0].dtype != node.outputs[0].dtype):
        raise theano.gof.utils.MethodNotDefined()
    dtype = node.inputs[0].dtype
    if dtype == 'float32' and self.amd_float32 is not None:
        dtype = 'float'
        fct = "amd_vrsa_expf"
    elif dtype == 'float64' and self.amd_float64 is not None:
        dtype = 'double'
        fct = "amd_vrda_exp"
    else:
        raise theano.gof.utils.MethodNotDefined()
    # NOTE: a trailing `raise MethodNotDefined()` after this return
    # was unreachable dead code (every path above either raises or
    # reaches this return) and has been removed.
    return """
    npy_intp n = PyArray_SIZE(%(z)s);
    %(dtype)s * x = (%(dtype)s*) PyArray_DATA(%(x)s);
    %(dtype)s * z = (%(dtype)s*) PyArray_DATA(%(z)s);
    // We block to keep the data in l1
    // normal l1 size = 32k: 32k/2(input + output)/8(nb bytes of double)=2k
    // We stay bellow the 2k limit to let space for
    // This is faster then the not blocking version
    for(int i=0;i<n;i+=2048){
        npy_intp nb = (n-i<2048)?n-i:2048;
        for(int j=0;j<nb;j++){
            z[i+j] = -x[i+j];
        }
        %(fct)s(nb, z+i, z+i);
        for(int j=0;j<nb;j++){
            z[i+j] = 1.0 /(1.0+z[i+j]);
        }
    }
    """ % locals()
scalar_sigmoid = ScalarSigmoid(scalar.upgrade_to_float, name='scalar_sigmoid') scalar_sigmoid = ScalarSigmoid(scalar.upgrade_to_float, name='scalar_sigmoid')
sigmoid = elemwise.Elemwise(scalar_sigmoid, name='sigmoid') sigmoid = elemwise.Elemwise(scalar_sigmoid, name='sigmoid')
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论