提交 dcb5e098 authored 作者: lamblin's avatar lamblin

Merge pull request #1393 from nouiz/fcontig

Elemwise faster when c or f contig inputs
...@@ -11,3 +11,4 @@ Theano Design and Implementation Documentation ...@@ -11,3 +11,4 @@ Theano Design and Implementation Documentation
tensor tensor
scan scan
compat
...@@ -18,8 +18,8 @@ ...@@ -18,8 +18,8 @@
:note: see :func:`ultra_fast_sigmoid` or :func:`hard_sigmoid` for faster version. :note: see :func:`ultra_fast_sigmoid` or :func:`hard_sigmoid` for faster version.
Speed comparison for 100M float64 element on a Core2 Duo @ 3.16 GHz. Speed comparison for 100M float64 element on a Core2 Duo @ 3.16 GHz.
- hard_sigmoid: 1.1s - hard_sigmoid: 1.0s
- ultra_fast_sigmoid: 1.4s - ultra_fast_sigmoid: 1.3s
- sigmoid (with amdlibm): 2.3s - sigmoid (with amdlibm): 2.3s
- sigmoid (without amdlibm): 3.7s - sigmoid (without amdlibm): 3.7s
......
...@@ -1662,7 +1662,8 @@ class Pow(BinaryScalarOp): ...@@ -1662,7 +1662,8 @@ class Pow(BinaryScalarOp):
if (node.inputs[0].type == node.outputs[0].type and if (node.inputs[0].type == node.outputs[0].type and
node.inputs[1].type == node.outputs[0].type and node.inputs[1].type == node.outputs[0].type and
# amdlibm 3.0 do not have a float64 version of this SIMD function # amdlibm 3.0 do not have a float64 version of this SIMD function
node.inputs[0].dtype == 'float32'): node.inputs[0].dtype == 'float32' and
node.inputs[1].dtype == 'float32'):
dtype = 'float' dtype = 'float'
fct = "amd_vrsa_powf" fct = "amd_vrsa_powf"
return """ return """
...@@ -1674,10 +1675,11 @@ class Pow(BinaryScalarOp): ...@@ -1674,10 +1675,11 @@ class Pow(BinaryScalarOp):
""" % locals() """ % locals()
# We compare the dtype and check we broadcast a scalar # We compare the dtype and check we broadcast a scalar
elif (node.inputs[0].type == node.outputs[0].type and elif (node.inputs[0].type == node.outputs[0].type and
node.inputs[1].dtype == node.outputs[0].dtype and node.inputs[1].dtype == node.outputs[0].dtype and
all(node.inputs[1].broadcastable) and all(node.inputs[1].broadcastable) and
# amdlibm 3.0 do not have a float64 version of this SIMD function # amdlibm 3.0 do not have a float64 version of this SIMD function
node.inputs[0].dtype == 'float32'): node.inputs[0].dtype == 'float32' and
node.inputs[1].dtype == 'float32'):
dtype = 'float' dtype = 'float'
fct = "amd_vrsa_powxf" fct = "amd_vrsa_powxf"
return """ return """
......
...@@ -1011,6 +1011,17 @@ class Elemwise(Op): ...@@ -1011,6 +1011,17 @@ class Elemwise(Op):
decl = cgen.make_declare(orders, idtypes, sub) decl = cgen.make_declare(orders, idtypes, sub)
checks = cgen.make_checks(orders, idtypes, sub) checks = cgen.make_checks(orders, idtypes, sub)
# Check if all inputs (except broadcasted scalar) are fortran.
# In that case, create an fortran output ndarray.
z = zip(inames, inputs)
alloc_fortran = ' && '.join(["PyArray_ISFORTRAN(%s)" % arr
for arr, var in z
if not all(var.broadcastable)])
# If it is a scalar, make it c contig to prevent problem with
# NumPy C and F contig not always set as both of them.
if len(alloc_fortran) == 0:
alloc_fortran = '0'
alloc = "" alloc = ""
# We loop over the "real" outputs, i.e., those that are not # We loop over the "real" outputs, i.e., those that are not
# inplace (must be allocated) and we declare/allocate/check # inplace (must be allocated) and we declare/allocate/check
...@@ -1022,7 +1033,8 @@ class Elemwise(Op): ...@@ -1022,7 +1033,8 @@ class Elemwise(Op):
sub['olv'] = oname sub['olv'] = oname
alloc += cgen.make_declare([range(nnested)], [odtype], alloc += cgen.make_declare([range(nnested)], [odtype],
dict(sub, lv0=oname)) dict(sub, lv0=oname))
alloc += cgen.make_alloc(orders, odtype, sub) alloc += cgen.make_alloc(orders, odtype, sub,
fortran=alloc_fortran)
alloc += cgen.make_checks([range(nnested)], [odtype], alloc += cgen.make_checks([range(nnested)], [odtype],
dict(sub, lv0=oname)) dict(sub, lv0=oname))
olv_index = i # index of the last output olv_index = i # index of the last output
...@@ -1079,7 +1091,10 @@ class Elemwise(Op): ...@@ -1079,7 +1091,10 @@ class Elemwise(Op):
%(undefs)s %(undefs)s
} }
""" % locals() """ % locals()
if all([o.ndim <= 1 for o in node.outputs]): if all([o.ndim <= 1 for o in node.outputs] or
# Use simpler code when output ndim == 0 or 1
# or for broadcated scalar.
all(node.outputs[0].broadcastable)):
if nnested: if nnested:
all_code = [("", "")] * (nnested - 1) + [("", code)] + [""] all_code = [("", "")] * (nnested - 1) + [("", code)] + [""]
else: else:
...@@ -1100,8 +1115,11 @@ class Elemwise(Op): ...@@ -1100,8 +1115,11 @@ class Elemwise(Op):
# If all inputs and outputs are contiguous # If all inputs and outputs are contiguous
# and the scalar op define optimized code for that case # and the scalar op define optimized code for that case
# use it! # use it! The scalar_op need to check the broadcast flag himself.
if all([o.ndim >= 1 for o in node.outputs]): if (all([o.ndim >= 1 for o in node.outputs]) and
# Don't use the contig code for broadcasted scalar.
not all(node.outputs[0].broadcastable)):
contig = None
try: try:
contig = self.scalar_op.c_code_contiguous( contig = self.scalar_op.c_code_contiguous(
node, node,
...@@ -1109,19 +1127,54 @@ class Elemwise(Op): ...@@ -1109,19 +1127,54 @@ class Elemwise(Op):
_inames, _inames,
onames, onames,
sub) sub)
# PyArray_ISONESEGMENT(arr) except theano.gof.utils.MethodNotDefined:
# return true if arr is fortran or c contiguous. # Try to make one generic version, this will help the
cond = ' && '.join(["PyArray_ISONESEGMENT(%s)" % arr # compiler to vectorize the code as their won't be as
for arr in _inames + onames]) # many ptr and the stride will be hard coded.
if all([io.broadcastable == node.outputs[0].broadcastable or
all(io.broadcastable)
for io in node.inputs + node.outputs]):
z = onames[0]
contig = """
// All output have the same size
npy_intp n = PyArray_SIZE(%(z)s);
""" % locals()
index = ""
for x, var in zip(inames + onames,
inputs + node.outputs):
if not all(var.broadcastable):
contig += """
dtype_%(x)s * %(x)s_ptr = (dtype_%(x)s*) PyArray_DATA(%(x)s);
""" % locals()
index += """
dtype_%(x)s& %(x)s_i = %(x)s_ptr[i];
""" % locals()
else:
contig += """
dtype_%(x)s& %(x)s_i = ((dtype_%(x)s*) PyArray_DATA(%(x)s))[0];
""" % locals()
contig += """
for(int i=0; i<n; i++){
%(index)s
%(task_code)s;
}
""" % locals()
if contig is not None:
z = zip(inames + onames, inputs + node.outputs)
cond1 = ' && '.join(["PyArray_ISCONTIGUOUS(%s)" % arr
for arr, var in z
if not all(var.broadcastable)])
cond2 = ' && '.join(["PyArray_ISFORTRAN(%s)" % arr
for arr, var in z
if not all(var.broadcastable)])
loop = """ loop = """
if(%(cond)s){ if((%(cond1)s) || (%(cond2)s)){
%(contig)s %(contig)s
}else{ }else{
%(loop)s %(loop)s
} }
""" % locals() """ % locals()
except theano.gof.utils.MethodNotDefined:
pass
return decl, checks, alloc, loop return decl, checks, alloc, loop
def c_code(self, node, nodename, inames, onames, sub): def c_code(self, node, nodename, inames, onames, sub):
...@@ -1140,7 +1193,7 @@ class Elemwise(Op): ...@@ -1140,7 +1193,7 @@ class Elemwise(Op):
return support_code return support_code
def c_code_cache_version_apply(self, node): def c_code_cache_version_apply(self, node):
version = [8] # the version corresponding to the c code in this Op version = [11] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend... # now we insert versions for the ops on which we depend...
scalar_node = Apply(self.scalar_op, scalar_node = Apply(self.scalar_op,
......
...@@ -113,9 +113,13 @@ def make_checks(loop_orders, dtypes, sub): ...@@ -113,9 +113,13 @@ def make_checks(loop_orders, dtypes, sub):
return init % sub + check % sub return init % sub + check % sub
def make_alloc(loop_orders, dtype, sub): def make_alloc(loop_orders, dtype, sub, fortran='0'):
""" """Generate C code to allocate outputs.
Generate C code to allocate outputs.
:param fortran: a string included in the generated code. If it
evaludate to non-zero, an ndarray in fortran order will be
created, otherwise it will be c order.
""" """
nd = len(loop_orders[0]) nd = len(loop_orders[0])
...@@ -133,7 +137,6 @@ def make_alloc(loop_orders, dtype, sub): ...@@ -133,7 +137,6 @@ def make_alloc(loop_orders, dtype, sub):
break break
else: else:
init_dims += "dims[%(i)s] = 1;\n" % locals() init_dims += "dims[%(i)s] = 1;\n" % locals()
#raise Exception("For each looping dimension, at least one input must have a non-broadcastable dimension.")
# TODO: it would be interesting to allocate the output in such a # TODO: it would be interesting to allocate the output in such a
# way that its contiguous dimensions match one of the input's # way that its contiguous dimensions match one of the input's
...@@ -146,7 +149,9 @@ def make_alloc(loop_orders, dtype, sub): ...@@ -146,7 +149,9 @@ def make_alloc(loop_orders, dtype, sub):
//npy_intp* dims = (npy_intp*)malloc(%(nd)s * sizeof(npy_intp)); //npy_intp* dims = (npy_intp*)malloc(%(nd)s * sizeof(npy_intp));
%(init_dims)s %(init_dims)s
if (!%(olv)s) { if (!%(olv)s) {
%(olv)s = (PyArrayObject*)PyArray_EMPTY(%(nd)s, dims, type_num_%(olv)s, 0); %(olv)s = (PyArrayObject*)PyArray_EMPTY(%(nd)s, dims,
type_num_%(olv)s,
%(fortran)s);
} }
else { else {
PyArray_Dims new_dims; PyArray_Dims new_dims;
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论