提交 40ca10ee authored 作者: Frederic's avatar Frederic

add amdlibm code for powx (pow(tensor, scalar))

上级 c77a8a1b
...@@ -850,8 +850,11 @@ class ScalarOp(Op): ...@@ -850,8 +850,11 @@ class ScalarOp(Op):
outputs are c_contiguous. This allows using the SIMD version outputs are c_contiguous. This allows using the SIMD version
of this op. of this op.
The inputs are the same as c_code EXCEPT that inp and out MUST The inputs are the same as c_code except:
be the variable name of the ndarray, not the current element.
- inp and out must be the variable name of the ndarray
- node must be the elemwise node. This is needed to know
the inputs/outputs type.
""" """
raise theano.gof.utils.MethodNotDefined() raise theano.gof.utils.MethodNotDefined()
...@@ -869,10 +872,11 @@ class UnaryScalarOp(ScalarOp): ...@@ -869,10 +872,11 @@ class UnaryScalarOp(ScalarOp):
node.inputs[0].type != node.outputs[0].type): node.inputs[0].type != node.outputs[0].type):
raise theano.gof.utils.MethodNotDefined() raise theano.gof.utils.MethodNotDefined()
if node.inputs[0].type == float32 and self.amd_float32 is not None: dtype = node.inputs[0].dtype
if dtype == 'float32' and self.amd_float32 is not None:
dtype = 'float' dtype = 'float'
fct = self.amd_float32 fct = self.amd_float32
elif node.inputs[0].type == float64 and self.amd_float64 is not None: elif dtype == 'float64' and self.amd_float64 is not None:
dtype = 'double' dtype = 'double'
fct = self.amd_float64 fct = self.amd_float64
else: else:
...@@ -1650,29 +1654,42 @@ class Pow(BinaryScalarOp): ...@@ -1650,29 +1654,42 @@ class Pow(BinaryScalarOp):
return (first_part, second_part) return (first_part, second_part)
def c_code_contiguous(self, node, name, (x, y), (z, ), sub): def c_code_contiguous(self, node, name, (x, y), (z, ), sub):
if (not theano.config.lib.amdlibm or if not theano.config.lib.amdlibm:
# We compare the dtype AND the broadcast flag
# as this function does not broadcast
node.inputs[0].type != node.outputs[0].type or
node.inputs[1].type != node.outputs[0].type):
raise theano.gof.utils.MethodNotDefined() raise theano.gof.utils.MethodNotDefined()
if node.inputs[0].type == float32 and self.amd_float32 is not None: # We compare the dtype AND the broadcast flag
# as this function does not broadcast
if (node.inputs[0].type == node.outputs[0].type and
node.inputs[1].type == node.outputs[0].type and
# amdlibm 3.0 does not have a float64 version of this SIMD function
node.inputs[0].dtype == 'float32'):
dtype = 'float' dtype = 'float'
fct = "amd_vrsa_powf" fct = "amd_vrsa_powf"
# amdlibm 3.0 do not have a float64 version of this SIMD function return """
#elif node.inputs[0].type == float64 and self.amd_float64 is not None:
# dtype = 'double'
# fct = self.amd_float64
else:
raise theano.gof.utils.MethodNotDefined()
return """
npy_intp n = PyArray_SIZE(%(z)s); npy_intp n = PyArray_SIZE(%(z)s);
%(dtype)s * x = (%(dtype)s*) PyArray_DATA(%(x)s); %(dtype)s * x = (%(dtype)s*) PyArray_DATA(%(x)s);
%(dtype)s * y = (%(dtype)s*) PyArray_DATA(%(y)s); %(dtype)s * y = (%(dtype)s*) PyArray_DATA(%(y)s);
%(dtype)s * z = (%(dtype)s*) PyArray_DATA(%(z)s); %(dtype)s * z = (%(dtype)s*) PyArray_DATA(%(z)s);
%(fct)s(n, x, y, z); %(fct)s(n, x, y, z);
""" % locals() """ % locals()
# We compare the dtype and check we broadcast a scalar
elif (node.inputs[0].type == node.outputs[0].type and
node.inputs[1].dtype == node.outputs[0].dtype and
all(node.inputs[1].broadcastable) and
# amdlibm 3.0 does not have a float64 version of this SIMD function
node.inputs[0].dtype == 'float32'):
dtype = 'float'
fct = "amd_vrsa_powxf"
return """
npy_intp n = PyArray_SIZE(%(z)s);
%(dtype)s * x = (%(dtype)s*) PyArray_DATA(%(x)s);
%(dtype)s * y = (%(dtype)s*) PyArray_DATA(%(y)s);
%(dtype)s * z = (%(dtype)s*) PyArray_DATA(%(z)s);
%(fct)s(n, x, *y, z);
""" % locals()
raise theano.gof.utils.MethodNotDefined()
pow = Pow(upcast_out, name='pow') pow = Pow(upcast_out, name='pow')
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论