提交 68aad6d2 authored 作者: Frederic's avatar Frederic

Add ScalarOp.c_code_contiguous interface and use it with exp.

Do this for amdlibm. There is code for sigmoid, but it is disabled as it is slower.
上级 d1dc2948
......@@ -845,10 +845,44 @@ class ScalarOp(Op):
def c_code_cache_version(self):
    """Return the C code cache version tuple for this op, ``(4,)``."""
    return (4,)
def c_code_contiguous(self, node, name, inp, out, sub):
    """Return C code that processes whole contiguous arrays at once.

    This function is called by Elemwise when all inputs and outputs
    are contiguous. This allows a SIMD version of this op to be used.

    The arguments are the same as for c_code EXCEPT that inp and out
    MUST be the variable names of the ndarrays, not of the current
    element.

    This base implementation always raises
    theano.gof.utils.MethodNotDefined.
    """
    raise theano.gof.utils.MethodNotDefined()
class UnaryScalarOp(ScalarOp):
    """Scalar op with exactly one input.

    Subclasses may set ``amd_float32`` and/or ``amd_float64`` to the name
    of a C function that is called as ``fct(n, x, z)`` on whole arrays
    by the code generated in ``c_code_contiguous``.
    """
    nin = 1
    # Name of the vectorized C routine for each dtype; None means that
    # no such routine is available for this op.
    amd_float32 = None
    amd_float64 = None

    def c_code_contiguous(self, node, name, inp, out, sub):
        """Return C code applying the amdlibm routine to a whole array.

        :param node: the node being compiled; only ``inputs[0]`` and
            ``outputs[0]`` are inspected here.
        :param name: unused here.
        :param inp: 1-tuple with the C variable name of the input ndarray.
        :param out: 1-tuple with the C variable name of the output ndarray.
        :param sub: unused here.

        :raises theano.gof.utils.MethodNotDefined: if
            ``theano.config.lib.amdlibm`` is false, if the input and
            output types differ, or if no routine is declared for the
            input dtype.
        """
        # Unpack in the body rather than in the signature: tuple
        # parameters are Python-2-only syntax, and this keeps the
        # signature identical to ScalarOp.c_code_contiguous.
        x, = inp
        z, = out
        if (not theano.config.lib.amdlibm or
                # We compare the dtype AND the broadcast flag
                # as this function does not broadcast
                node.inputs[0].type != node.outputs[0].type):
            raise theano.gof.utils.MethodNotDefined()

        # Dispatch on the dtype string. Comparing the variable's type
        # object against the names float32/float64 looks like a tensor
        # type compared with scalar type objects, which would never be
        # equal and would silently disable this code path.
        # NOTE(review): confirm against the definitions of float32/float64.
        dtype = node.inputs[0].dtype
        if dtype == 'float32' and self.amd_float32 is not None:
            ctype = 'float'
            fct = self.amd_float32
        elif dtype == 'float64' and self.amd_float64 is not None:
            ctype = 'double'
            fct = self.amd_float64
        else:
            raise theano.gof.utils.MethodNotDefined()

        return """
        npy_intp n = PyArray_SIZE(%(z)s);
        %(dtype)s * x = (%(dtype)s*) PyArray_DATA(%(x)s);
        %(dtype)s * z = (%(dtype)s*) PyArray_DATA(%(z)s);
        %(fct)s(n, x, z);
        """ % dict(x=x, z=z, dtype=ctype, fct=fct)
class BinaryScalarOp(ScalarOp):
# One may define in subclasses the following fields:
......@@ -2100,6 +2134,9 @@ log1p = Log1p(upgrade_to_float, name='log1p')
class Exp(UnaryScalarOp):
amd_float32 = "amd_vrsa_expf"
amd_float64 = "amd_vrda_exp"
def impl(self, x):
return numpy.exp(x)
......
......@@ -1097,6 +1097,31 @@ class Elemwise(Op):
dtypes=(idtypes + list(real_odtypes)),
inner_task=code,
sub=sub)
# If all inputs and outputs are contiguous
# and the scalar op define optimized code for that case
# use it!
if all([o.ndim >= 1 for o in node.outputs]):
try:
contig = self.scalar_op.c_code_contiguous(
node,
nodename + '_scalar_contig_',
_inames,
onames,
sub)
# PyArray_ISONESEGMENT(arr)
# return true if arr is fortran or c contiguous.
cond = ' && '.join(["PyArray_ISONESEGMENT(%s)" % arr
for arr in _inames + onames])
loop = """
if(%(cond)s){
%(contig)s
}else{
%(loop)s
}
""" % locals()
except theano.gof.utils.MethodNotDefined:
pass
return decl, checks, alloc, loop
def c_code(self, node, nodename, inames, onames, sub):
......@@ -1115,7 +1140,7 @@ class Elemwise(Op):
return support_code
def c_code_cache_version_apply(self, node):
version = [7] # the version corresponding to the c code in this Op
version = [8] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend...
scalar_node = Apply(self.scalar_op,
......
......@@ -23,6 +23,9 @@ from theano.tensor import elemwise, opt, NotScalarConstantError
#
class ScalarSigmoid(scalar.UnaryScalarOp):
"""
This is just speed opt. Not for stability.
"""
@staticmethod
def st_impl(x):
if x < -30.0:
......@@ -64,6 +67,44 @@ class ScalarSigmoid(scalar.UnaryScalarOp):
return (2,) + v
else:
return v
# This function is disabled because it is slower than the normal code!
def c_code_contiguous_disabled(self, node, name, inp, out, sub):
    """Return blocked C code computing ``1 / (1 + exp(-x))`` on an array.

    The method is named ``c_code_contiguous_disabled`` (not
    ``c_code_contiguous``), so it does not override the contiguous hook.

    :param node: the node being compiled; only the dtype of
        ``inputs[0]`` and ``outputs[0]`` is inspected.
    :param name: unused here.
    :param inp: 1-tuple with the C variable name of the input ndarray.
    :param out: 1-tuple with the C variable name of the output ndarray.
    :param sub: unused here.

    :raises theano.gof.utils.MethodNotDefined: if
        ``theano.config.lib.amdlibm`` is false, if input and output
        dtypes differ, or if the dtype is not handled.
    """
    x, = inp
    z, = out
    if (not theano.config.lib.amdlibm or
            node.inputs[0].dtype != node.outputs[0].dtype):
        raise theano.gof.utils.MethodNotDefined()
    dtype = node.inputs[0].dtype
    # NOTE(review): the guards below read self.amd_float32/amd_float64
    # although the routine names are hard-coded. If this class leaves
    # those attributes at None, this method always raises
    # MethodNotDefined -- confirm before enabling it.
    if dtype == 'float32' and self.amd_float32 is not None:
        dtype = 'float'
        fct = "amd_vrsa_expf"
    elif dtype == 'float64' and self.amd_float64 is not None:
        dtype = 'double'
        fct = "amd_vrda_exp"
    else:
        raise theano.gof.utils.MethodNotDefined()
    # The loop counters are npy_intp, like n, so that indexing does not
    # overflow an int on very large arrays. The template is filled from
    # the locals x, z, dtype and fct.
    return """
    npy_intp n = PyArray_SIZE(%(z)s);
    %(dtype)s * x = (%(dtype)s*) PyArray_DATA(%(x)s);
    %(dtype)s * z = (%(dtype)s*) PyArray_DATA(%(z)s);
    // We block to keep the data in l1
    // normal l1 size = 32k: 32k/2(input + output)/8(nb bytes of double)=2k
    // We stay bellow the 2k limit to let space for
    // This is faster then the not blocking version
    for(npy_intp i=0;i<n;i+=2048){
        npy_intp nb = (n-i<2048)?n-i:2048;
        for(npy_intp j=0;j<nb;j++){
            z[i+j] = -x[i+j];
        }
        %(fct)s(nb, z+i, z+i);
        for(npy_intp j=0;j<nb;j++){
            z[i+j] = 1.0 /(1.0+z[i+j]);
        }
    }
    """ % locals()
scalar_sigmoid = ScalarSigmoid(scalar.upgrade_to_float, name='scalar_sigmoid')
sigmoid = elemwise.Elemwise(scalar_sigmoid, name='sigmoid')
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论