提交 c5bae0fb authored 作者: Frederic's avatar Frederic

Add an optimization that makes GpuCAReduceCuda use its pre_scalar_op for the sqr op.

上级 daf196e6
...@@ -22,6 +22,15 @@ from type import GpuArrayType ...@@ -22,6 +22,15 @@ from type import GpuArrayType
def as_gpuarray_variable(x): def as_gpuarray_variable(x):
# This is needed to lower the number of useless transfers
# introduced during optimization. This speeds up optimization and
# "canonicalizes" the graph, which makes some optimizations
# easier.
if (hasattr(x, 'fgraph') and
len(x.clients) == 1 and
x.owner and
isinstance(x.owner.op, HostFromGpu)):
return x.owner.inputs[0]
if hasattr(x, '_as_GpuArrayVariable'): if hasattr(x, '_as_GpuArrayVariable'):
return x._as_GpuArrayVariable() return x._as_GpuArrayVariable()
# TODO we need to have the cuda -> gpu path taken care of. # TODO we need to have the cuda -> gpu path taken care of.
......
...@@ -563,6 +563,27 @@ def local_gpu_conv(node): ...@@ -563,6 +563,27 @@ def local_gpu_conv(node):
return [out] return [out]
@register_opt("low_memory")
@local_optimizer([GpuCAReduceCuda])
def local_gpu_elemwise_careduce(node):
    """Fold a squaring GpuElemwise into the GpuCAReduceCuda that reads it.

    If `node` applies a GpuCAReduceCuda that has no `pre_scalar_op` and
    whose first input is produced by a GpuElemwise whose scalar op is
    `Sqr`, return a one-element list holding the output of a new
    GpuCAReduceCuda. The new op keeps the original `scalar_op` and
    `reduce_mask`, uses `scalar.basic.sqr` as `pre_scalar_op`, and is
    applied directly to the elemwise's first input.

    In every other case the function returns None.
    """
    reduce_op = node.op
    if not isinstance(reduce_op, GpuCAReduceCuda):
        return None
    if reduce_op.pre_scalar_op is not None:
        return None

    producer = node.inputs[0].owner
    if not producer:
        return None
    if not isinstance(producer.op, GpuElemwise):
        return None

    # The reduction op supports every scalar op with one input as a
    # pre_scalar_op. More cases are deliberately not matched
    # automatically: some of them, like trigonometric operations
    # combined with some reduction patterns, would probably result in
    # a slowdown.
    if not isinstance(producer.op.scalar_op, scalar.basic.Sqr):
        return None

    fused_op = GpuCAReduceCuda(
        scalar_op=reduce_op.scalar_op,
        reduce_mask=reduce_op.reduce_mask,
        pre_scalar_op=scalar.basic.sqr,
    )
    return [fused_op(producer.inputs[0])]
def tensor_to_gpu(x): def tensor_to_gpu(x):
if isinstance(x.type, tensor.TensorType): if isinstance(x.type, tensor.TensorType):
y = GpuArrayType(broadcastable=x.type.broadcastable, y = GpuArrayType(broadcastable=x.type.broadcastable,
......
...@@ -133,3 +133,13 @@ def test_print_op(): ...@@ -133,3 +133,13 @@ def test_print_op():
assert isinstance(topo[2].op, GpuElemwise) assert isinstance(topo[2].op, GpuElemwise)
assert topo[3].op == host_from_gpu assert topo[3].op == host_from_gpu
f(numpy.random.random((5, 5)).astype('float32')) f(numpy.random.random((5, 5)).astype('float32'))
def test_local_gpu_elemwise_careduce():
    # Build sum(x * x) on a matrix and compile it with the GPU mode.
    mat = theano.tensor.matrix()
    sum_of_squares = (mat * mat).sum()
    fn = theano.function([mat], sum_of_squares, mode=mode_with_gpu)

    # The compiled graph must contain exactly three nodes, and the middle
    # one must carry sqr as its pre_scalar_op (i.e. the squaring elemwise
    # was merged into the reduction rather than kept as its own node).
    nodes = fn.maker.fgraph.toposort()
    assert len(nodes) == 3
    assert nodes[1].op.pre_scalar_op == theano.scalar.sqr

    # Run the compiled function once on random data to check it executes.
    data = numpy.random.rand(3, 4).astype(theano.config.floatX)
    fn(data)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论