Enable support for in-place operation in GpuElemwise (and the optimizations that use it).

a8fe1069 · Arnaud Bergeron · 644dbb6c · a8fe1069 · a8fe1069
--- a/theano/sandbox/gpuarray/elemwise.py
+++ b/theano/sandbox/gpuarray/elemwise.py
+import copy
 from itertools import izip

 import numpy
@@ -45,16 +46,11 @@ class GpuElemwise(Elemwise):
    nin = property(lambda self: self.scalar_op.nin)
    nout = property(lambda self: self.scalar_op.nout)

-    def __init__(self, scalar_op, name=None, nfunc_spec=None):
-        # We do not support inplace since it is a lie anyway
-        # (the scalar_op code will never modify anything inplace)
-        Elemwise.__init__(self, scalar_op, inplace_pattern=None, name=name,
-                          nfunc_spec=nfunc_spec)
-
    def __str__(self):
        if self.name is not None:
            return self.name
-        return "GpuElemwise{%s}<gpuarray>" % (self.scalar_op,)
+        items = str(sorted(self.inplace_pattern.items()))
+        return "GpuElemwise{%s}%s<gpuarray>" % (self.scalar_op, items)

    def make_node(self, *inputs):
        res = Elemwise.make_node(self, *inputs)
@@ -72,7 +68,7 @@ class GpuElemwise(Elemwise):
        scal_ins = [scalar.Scalar(i.dtype) for i in node.inputs]

        outs = [make_argument(o, 'o%d' % (n,)) for n, o in
-                enumerate(node.outputs)]
+                enumerate(node.outputs) if not n in self.inplace_pattern]
        scal_out = [scalar.Scalar(o.dtype) for o in node.outputs]

        fake_node = Apply(self.scalar_op, [i() for i in scal_ins],
@@ -96,9 +92,18 @@ class GpuElemwise(Elemwise):
            # The macro is fine, the C++ struct is not.
            raise SupportCodeError(support_code)

+        scal_out = []
+        oi = 0
+        for n in range(len(fake_node.outputs)):
+            if n in self.inplace_pattern:
+                scal_out.append(inps[self.inplace_pattern[n]].name+'[i]')
+            else:
+                scal_out.append(outs[oi].name+'[i]')
+                oi += 1
+
        kop = self.scalar_op.c_code(fake_node, nodename+'_scalar',
                                    [i.name+'[i]' for i in inps],
-                                    [o.name+'[i]' for o in outs],
+                                    scal_out,
                                    dict(fail='return;'))

        # Translate types for scalar composite ops (except complex).
@@ -153,11 +158,15 @@ class GpuElemwise(Elemwise):
                out_shape.append(max(values))
        out_shape = tuple(out_shape)

-        outs = [ensure_allocated(storage, out_shape, output.type.dtype)
-                for output, storage in izip(node.outputs, output_storage)]
+        args = copy.copy(inputs)
+        for n, (stor, out) in enumerate(izip(output_storage, node.outputs)):
+            if n in self.inplace_pattern:
+                stor[0] = inputs[self.inplace_pattern[n]]
+            else:
+                args.append(ensure_allocated(stor, out_shape, out.type.dtype))

        # the dict call is there to avoid a syntax error in python < 2.6
-        node._cache_elemwise_k(*(inputs+outs), **dict(broadcast=True))
+        node._cache_elemwise_k(*args, **dict(broadcast=True))


 class SupportCodeError(Exception):

--- a/theano/sandbox/gpuarray/opt.py
+++ b/theano/sandbox/gpuarray/opt.py
+import copy
 import theano, numpy
 from theano import tensor
 from theano.compile import optdb
@@ -134,7 +135,10 @@ def local_gpu_elemwise(node):
            do_replace = False

    if do_replace:
-        new_op = GpuElemwise(node.op.scalar_op)
+        op = node.op
+        new_op = GpuElemwise(op.scalar_op, name=op.name,
+                             inplace_pattern=copy.copy(op.inplace_pattern),
+                             nfunc_spec=op.nfunc_spec)
        gpu_elemwise = new_op(*(gpu_from_host(i) for i in node.inputs))
        if gpu_out:
            return [gpu_elemwise]
@@ -166,3 +170,8 @@ gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op(
 optdb.register('gpu_elemwise_fusion',
               tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion),
               71.00, 'fast_run', 'fusion', 'local_elemwise_fusion', 'gpu')
+
+inplace_gpu_elemwise_opt = tensor.opt.inplace_elemwise_optimizer_op(
+    GpuElemwise)
+optdb.register('gpua_inplace_opt', inplace_gpu_elemwise_opt, 75,
+               'inplace_elemwise_optimizer', 'fast_run', 'inplace')