Make GpuElemwise work inplace, using the same inplace optimization as Elemwise.

a7ca0c4f · Frederic Bastien · 44af7bc1 · a7ca0c4f · a7ca0c4f · a7ca0c4f
--- a/theano/sandbox/cuda/basic_ops.py
+++ b/theano/sandbox/cuda/basic_ops.py
@@ -85,13 +85,9 @@ class GpuElemwise(Op):
        #
        sync = config.gpuelemwise.sync
        self.scalar_op = scalar_op
-        if 0:
-            #we don't put them their as this cause trouble with the local_cut_gpu_host_gpu optimizer.
+        self.inplace_pattern = inplace_pattern
-            #and the gpu don't implement any inplace pattern for now.
+        self.destroy_map = dict((o, [i]) for o, i in inplace_pattern.items())
-            self.inplace_pattern = inplace_pattern
-            self.destroy_map = dict((o, [i]) for o, i in inplace_pattern.items())
-        else:
-            self.inplace_pattern = {}
        self.sync = sync

--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -89,7 +89,8 @@ def local_gpu_elemwise_0(node):
    if isinstance(node.op, tensor.Elemwise):
        if numpy.any([i.owner and isinstance(i.owner.op, HostFromGpu) for i in node.inputs]):
            if numpy.all([o.type.dtype == 'float32' for o in node.outputs]):
-                new_op = GpuElemwise(node.op.scalar_op, node.op.inplace_pattern)
+                #don't set any inplace pattern. gpu_insert_inplace_optimizer will do it later
+                new_op = GpuElemwise(node.op.scalar_op)
                # case 1 - all inputs are already float32
                if numpy.all([i.type.dtype == 'float32' for i in node.inputs]):
@@ -120,7 +121,8 @@ def local_gpu_elemwise_1(node):
        host_i, = node.inputs
        if host_i.owner and isinstance(host_i.owner.op, tensor.Elemwise) and len(host_i.clients)==1:
            elemwise_node = host_i.owner
-            new_op = GpuElemwise(elemwise_node.op.scalar_op, elemwise_node.op.inplace_pattern)
+            #don't set any inplace pattern. gpu_insert_inplace_optimizer will do it later
+            new_op = GpuElemwise(elemwise_node.op.scalar_op)
            if all([i.dtype=='float32' for i in elemwise_node.inputs]):
                return [new_op(*[gpu_from_host(i) for i in elemwise_node.inputs])]
    return False
@@ -629,6 +631,9 @@ else:
    _logger.debug("not enabling optimization fusion of gpu elemwise in fast_run")
    compile.optdb.register('gpu_elemwise_fusion', tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion), 71.00, 'fusion', 'local_elemwise_fusion')
+#GpuElemwise inplace
+gpu_insert_inplace_optimizer = tensor.opt.insert_inplace_optimizer_op(GpuElemwise)
+compile.optdb.register('gpu_inplace_opt', gpu_insert_inplace_optimizer, 75, 'fast_run', 'inplace','gpu_inplace') 
 @register_opt()
 @local_optimizer([tensor.Alloc])

--- a/theano/sandbox/cuda/tests/test_basic_ops.py
+++ b/theano/sandbox/cuda/tests/test_basic_ops.py
@@ -217,6 +217,9 @@ def test_elemwise0():
    f = pfunc([b], [], updates=[(a, a+b)], mode=mode_with_gpu)
+    #check that we work inplace.
+    assert f.maker.env.toposort()[1].op.destroy_map.items()==[(0,[0])]
    a0 = a.value * 1.0
    print 'BEFORE ADD', a.value
    for i, node in enumerate(f.maker.env.toposort()):

--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
@@ -98,112 +98,120 @@ theano.configparser.AddConfigVar('tensor.insert_inplace_optimizer_validate_nb',
        "-1: auto, if graph have less then 500 nodes 1, else 10",
        theano.configparser.IntParam(-1))
-@gof.optimizer
+def insert_inplace_optimizer_op(OP):
-def insert_inplace_optimizer(env):
    """
-    Usage: inplace_optimizer.optimize(env)
+    Build the inplace optimizer for the op class OP, so that the same code works for both Elemwise and GpuElemwise.
+    """
-    Attempts to replace all Broadcast ops by versions of them
+    @gof.optimizer
-    that operate inplace. It operates greedily: for each Broadcast
+    def insert_inplace_optimizer(env):
-    Op that is encountered, for each output, tries each input to
+        """
-    see if it can operate inplace on that input. If so, makes the
+        Usage: inplace_optimizer.optimize(env)
-    change and go to the next output or Broadcast Op.
-    Examples:
+        Attempts to replace all Broadcast ops by versions of them
-      x + y + z -> x += y += z
+        that operate inplace. It operates greedily: for each Broadcast
-      (x + y) * (x * y) -> (x += y) *= (x * y) or (x + y) *= (x *= y)
+        Op that is encountered, for each output, tries each input to
-    """
+        see if it can operate inplace on that input. If so, makes the
-    #we should not validate too often as this take too much time to execute!
+        change and go to the next output or Broadcast Op.
-    #It is the _dfs_toposort() fct in theano/gof/destroyhandler.py
-    #that take so much time. 
-    #Should we try to use another lib that do toposort? 
-    #   igraph: http://igraph.sourceforge.net/
-    #   networkx: https://networkx.lanl.gov/
-    #Should we try to use cython? 
-    #   compiling only that fct is not enought, should we try to add the deque class too?
-    #   and init the deque and other list to an upper bound number of element?
-    #Should Theano do online toposort as in http://code.google.com/p/acyclic/?
-    #
-    #The next longuest optimizer is the canonizer phase
-    #Then I think it is the [io_?]toposort(need to validate) so check if the solution is also applicable their.
-    #we execute validate after this number of change.
-    validate_each_change = config.tensor.insert_inplace_optimizer_validate_nb
-    if validate_each_change==-1:
-        if len(env.nodes)>500:
-            validate_each_change = 10
-        else: validate_each_change = 1
-    nb_change_no_validate = 0
-    chk = env.checkpoint()
-    for node in list(graph.io_toposort(env.inputs, env.outputs)):
-        op = node.op
-        if not isinstance(op, Elemwise):
-            continue
-        baseline = op.inplace_pattern
-        protected_inputs = [f.protected for f in node.env._features if isinstance(f,theano.compile.function_module.Supervisor)]
-        protected_inputs = sum(protected_inputs,[])#flatten the list
-        protected_inputs.extend(env.outputs)
-        candidate_outputs = [i for i in xrange(len(node.outputs)) if i not in baseline]
-        #node inputs that are Constant, already destroyed,
-        # env protected inputs and env outputs can't be used as inplace target.
-        # Remove here as faster.
-        candidate_inputs = [i for i in xrange(len(node.inputs)) if i not in baseline.values() \
-                                and not isinstance(node.inputs[i],Constant)\
-                                and not env.destroyers(node.inputs[i])\
-                                and node.inputs[i] not in protected_inputs]
-        verbose = False
-        raised_warning = not verbose
-        for candidate_output in candidate_outputs:
-            for candidate_input in candidate_inputs:
-                #remove inputs that don't have the same dtype as the output.
-                if node.inputs[candidate_input].type!=node.outputs[candidate_output].type:
-                    continue
-                inplace_pattern = dict(baseline, **{candidate_output: candidate_input})
-                try:
-                    if hasattr(op.scalar_op,"make_new_inplace"):
-                        new_scal = op.scalar_op.make_new_inplace(
-                            scalar.transfer_type(
-                                *[inplace_pattern.get(i, None) \
-                                      for i in xrange(len(node.outputs))]))
-                    else:
-                        new_scal = op.scalar_op.__class__(
-                            scalar.transfer_type(
-                                *[inplace_pattern.get(i, None) \
-                                      for i in xrange(len(node.outputs))]))
-                    new = Elemwise(new_scal,inplace_pattern).make_node(*node.inputs)
-                    for r,new_r in zip(node.outputs,new.outputs):
-                        env.replace(r,new_r,
-                                    reason="insert_inplace_optimizer")
-                    nb_change_no_validate +=1
-                    if nb_change_no_validate >= validate_each_change:
-                        env.validate()
-                        chk = env.checkpoint()
-                        nb_change_no_validate = 0
-                except (ValueError, TypeError, InconsistencyError), e:
-                    if validate_each_change!=1 and not raised_warning:
-                        print >> sys.stderr, "Their was some inplace optimization that was not done due to unexpected error:"
-                        print >> sys.stderr, e
-                        raised_warning = True
-                    env.revert(chk)
-                    continue
-                candidate_inputs.remove(candidate_input)
-                node = new
-                baseline = inplace_pattern
-                break
-    if nb_change_no_validate>0:
+        Examples:
-        try:
+          x + y + z -> x += y += z
-            env.validate()
+          (x + y) * (x * y) -> (x += y) *= (x * y) or (x + y) *= (x *= y)
-        except Exception, e:
+        """
-            if not raised_warning:
+        #we should not validate too often as this take too much time to execute!
-                print >> sys.stderr, "Their was some inplace optimization that was not done due to unexpected error"
+        #It is the _dfs_toposort() fct in theano/gof/destroyhandler.py
-            env.revert(chk)
+        #that take so much time. 
+        #Should we try to use another lib that do toposort? 
+        #   igraph: http://igraph.sourceforge.net/
+        #   networkx: https://networkx.lanl.gov/
+        #Should we try to use cython? 
+        #   compiling only that fct is not enough, should we try to add the deque class too?
+        #   and init the deque and other lists to an upper bound on the number of elements?
+        #Should Theano do online toposort as in http://code.google.com/p/acyclic/?
+        #
+        #The next longest optimizer is the canonizer phase
+        #Then I think it is the [io_?]toposort (needs to be validated), so check if the solution is also applicable there.
+        #we execute validate after this number of changes.
+        validate_each_change = config.tensor.insert_inplace_optimizer_validate_nb
+        if validate_each_change==-1:
+            if len(env.nodes)>500:
+                validate_each_change = 10
+            else: validate_each_change = 1
+        nb_change_no_validate = 0
+        chk = env.checkpoint()
+        for node in list(graph.io_toposort(env.inputs, env.outputs)):
+            op = node.op
+            if not isinstance(op, OP):
+                continue
+            baseline = op.inplace_pattern
+            protected_inputs = [f.protected for f in node.env._features if isinstance(f,theano.compile.function_module.Supervisor)]
+            protected_inputs = sum(protected_inputs,[])#flatten the list
+            protected_inputs.extend(env.outputs)
+            candidate_outputs = [i for i in xrange(len(node.outputs)) if i not in baseline]
+            #node inputs that are Constant, already destroyed,
+            # env protected inputs and env outputs can't be used as inplace target.
+            # Remove here as faster.
+            candidate_inputs = [i for i in xrange(len(node.inputs)) if i not in baseline.values() \
+                                    and not isinstance(node.inputs[i],Constant)\
+                                    and not env.destroyers(node.inputs[i])\
+                                    and node.inputs[i] not in protected_inputs]
+            verbose = False
+            raised_warning = not verbose
+            for candidate_output in candidate_outputs:
+                for candidate_input in candidate_inputs:
+                    #remove inputs that don't have the same dtype as the output.
+                    if node.inputs[candidate_input].type!=node.outputs[candidate_output].type:
+                        continue
+                    inplace_pattern = dict(baseline, **{candidate_output: candidate_input})
+                    try:
+                        if hasattr(op.scalar_op,"make_new_inplace"):
+                            new_scal = op.scalar_op.make_new_inplace(
+                                scalar.transfer_type(
+                                    *[inplace_pattern.get(i, None) \
+                                          for i in xrange(len(node.outputs))]))
+                        else:
+                            new_scal = op.scalar_op.__class__(
+                                scalar.transfer_type(
+                                    *[inplace_pattern.get(i, None) \
+                                          for i in xrange(len(node.outputs))]))
+                        new = OP(new_scal,inplace_pattern).make_node(*node.inputs)
+                        for r,new_r in zip(node.outputs,new.outputs):
+                            env.replace(r,new_r,
+                                        reason="insert_inplace_optimizer")
+                        nb_change_no_validate +=1
+                        if nb_change_no_validate >= validate_each_change:
+                            env.validate()
+                            chk = env.checkpoint()
+                            nb_change_no_validate = 0
+                    except (ValueError, TypeError, InconsistencyError), e:
+                        if validate_each_change!=1 and not raised_warning:
+                            print >> sys.stderr, "Their was some inplace optimization that was not done due to unexpected error:"
+                            print >> sys.stderr, e
+                            raised_warning = True
+                        env.revert(chk)
+                        continue
+                    candidate_inputs.remove(candidate_input)
+                    node = new
+                    baseline = inplace_pattern
+                    break
+        if nb_change_no_validate>0:
+            try:
+                env.validate()
+            except Exception, e:
+                if not raised_warning:
+                    print >> sys.stderr, "Their was some inplace optimization that was not done due to unexpected error"
+                env.revert(chk)
+    return insert_inplace_optimizer
+insert_inplace_optimizer = insert_inplace_optimizer_op(T.Elemwise)
 compile.optdb.register('inplace_opt', insert_inplace_optimizer, 75, 'fast_run', 'inplace')