提交 a8fe1069 作者: Arnaud Bergeron

Enable support for inplace Elemwise (and optimizations to use it).

上级 644dbb6c
import copy
from itertools import izip from itertools import izip
import numpy import numpy
...@@ -45,16 +46,11 @@ class GpuElemwise(Elemwise): ...@@ -45,16 +46,11 @@ class GpuElemwise(Elemwise):
nin = property(lambda self: self.scalar_op.nin) nin = property(lambda self: self.scalar_op.nin)
nout = property(lambda self: self.scalar_op.nout) nout = property(lambda self: self.scalar_op.nout)
def __init__(self, scalar_op, name=None, nfunc_spec=None):
# We do not support inplace since it is a lie anyway
# (the scalar_op code will never modify anything inplace)
Elemwise.__init__(self, scalar_op, inplace_pattern=None, name=name,
nfunc_spec=nfunc_spec)
def __str__(self): def __str__(self):
if self.name is not None: if self.name is not None:
return self.name return self.name
return "GpuElemwise{%s}<gpuarray>" % (self.scalar_op,) items = str(sorted(self.inplace_pattern.items()))
return "GpuElemwise{%s}%s<gpuarray>" % (self.scalar_op, items)
def make_node(self, *inputs): def make_node(self, *inputs):
res = Elemwise.make_node(self, *inputs) res = Elemwise.make_node(self, *inputs)
...@@ -72,7 +68,7 @@ class GpuElemwise(Elemwise): ...@@ -72,7 +68,7 @@ class GpuElemwise(Elemwise):
scal_ins = [scalar.Scalar(i.dtype) for i in node.inputs] scal_ins = [scalar.Scalar(i.dtype) for i in node.inputs]
outs = [make_argument(o, 'o%d' % (n,)) for n, o in outs = [make_argument(o, 'o%d' % (n,)) for n, o in
enumerate(node.outputs)] enumerate(node.outputs) if not n in self.inplace_pattern]
scal_out = [scalar.Scalar(o.dtype) for o in node.outputs] scal_out = [scalar.Scalar(o.dtype) for o in node.outputs]
fake_node = Apply(self.scalar_op, [i() for i in scal_ins], fake_node = Apply(self.scalar_op, [i() for i in scal_ins],
...@@ -96,9 +92,18 @@ class GpuElemwise(Elemwise): ...@@ -96,9 +92,18 @@ class GpuElemwise(Elemwise):
# The macro is fine, the C++ struct is not. # The macro is fine, the C++ struct is not.
raise SupportCodeError(support_code) raise SupportCodeError(support_code)
scal_out = []
oi = 0
for n in range(len(fake_node.outputs)):
if n in self.inplace_pattern:
scal_out.append(inps[self.inplace_pattern[n]].name+'[i]')
else:
scal_out.append(outs[oi].name+'[i]')
oi += 1
kop = self.scalar_op.c_code(fake_node, nodename+'_scalar', kop = self.scalar_op.c_code(fake_node, nodename+'_scalar',
[i.name+'[i]' for i in inps], [i.name+'[i]' for i in inps],
[o.name+'[i]' for o in outs], scal_out,
dict(fail='return;')) dict(fail='return;'))
# Translate types for scalar composite ops (except complex). # Translate types for scalar composite ops (except complex).
...@@ -153,11 +158,15 @@ class GpuElemwise(Elemwise): ...@@ -153,11 +158,15 @@ class GpuElemwise(Elemwise):
out_shape.append(max(values)) out_shape.append(max(values))
out_shape = tuple(out_shape) out_shape = tuple(out_shape)
outs = [ensure_allocated(storage, out_shape, output.type.dtype) args = copy.copy(inputs)
for output, storage in izip(node.outputs, output_storage)] for n, (stor, out) in enumerate(izip(output_storage, node.outputs)):
if n in self.inplace_pattern:
stor[0] = inputs[self.inplace_pattern[n]]
else:
args.append(ensure_allocated(stor, out_shape, out.type.dtype))
# the dict call is there to avoid a syntax error in python < 2.6 # the dict call is there to avoid a syntax error in python < 2.6
node._cache_elemwise_k(*(inputs+outs), **dict(broadcast=True)) node._cache_elemwise_k(*args, **dict(broadcast=True))
class SupportCodeError(Exception): class SupportCodeError(Exception):
......
import copy
import theano, numpy import theano, numpy
from theano import tensor from theano import tensor
from theano.compile import optdb from theano.compile import optdb
...@@ -134,7 +135,10 @@ def local_gpu_elemwise(node): ...@@ -134,7 +135,10 @@ def local_gpu_elemwise(node):
do_replace = False do_replace = False
if do_replace: if do_replace:
new_op = GpuElemwise(node.op.scalar_op) op = node.op
new_op = GpuElemwise(op.scalar_op, name=op.name,
inplace_pattern=copy.copy(op.inplace_pattern),
nfunc_spec=op.nfunc_spec)
gpu_elemwise = new_op(*(gpu_from_host(i) for i in node.inputs)) gpu_elemwise = new_op(*(gpu_from_host(i) for i in node.inputs))
if gpu_out: if gpu_out:
return [gpu_elemwise] return [gpu_elemwise]
...@@ -166,3 +170,8 @@ gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op( ...@@ -166,3 +170,8 @@ gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op(
optdb.register('gpu_elemwise_fusion', optdb.register('gpu_elemwise_fusion',
tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion), tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion),
71.00, 'fast_run', 'fusion', 'local_elemwise_fusion', 'gpu') 71.00, 'fast_run', 'fusion', 'local_elemwise_fusion', 'gpu')
inplace_gpu_elemwise_opt = tensor.opt.inplace_elemwise_optimizer_op(
GpuElemwise)
optdb.register('gpua_inplace_opt', inplace_gpu_elemwise_opt, 75,
'inplace_elemwise_optimizer', 'fast_run', 'inplace')
Markdown 格式
0%
您即将添加 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
请 注册 或者 登录 后发表评论