More sane inplace optimization

The previous optimization was all-or-nothing, which is a bad compromise for scan. This new optimization tries to make each output work in place individually, and keeps only those for which it can. This way, the entire op is not forced to be non-inplace because of a single output that cannot be computed inplace.

More sane inplace optimization
ed415454 · Razvan Pascanu · Razvan Pascanu · f57b7b77 · ed415454 · ed415454
--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
 import logging
 _logger = logging.getLogger('theano.sandbox.cuda.opt')

+import copy
 import sys
 import warnings

 import numpy

 import theano
+from theano.scan_module import scan_utils, scan_op, scan_opt
 from theano import scalar as scal
 from theano import tensor, compile, gof

 from theano.compile import optdb
 from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB,
                        Optimizer, toolbox, DestroyHandler,
-                        EquilibriumOptimizer)
+                        InconsistencyError, EquilibriumOptimizer)
 from theano.gof.python25 import all, any
 from theano.sandbox.cuda.basic_ops import *
 from theano.sandbox.cuda.type import CudaNdarrayType
@@ -1431,7 +1433,7 @@ def gpuScanOptimization(node):
            # merged or implement this optimization as a global
            # optimization
            thescan = host_input.owner.op
-            info = thescan.info.copy()
+            info = copy.deepcopy(thescan.info)
            info['gpu'] = True
            inputs = host_input.owner.inputs
            nw_ins = [inputs[0]]
@@ -1478,7 +1480,7 @@ def gpuScanOptimization(node):
                      for i in node.inputs]):

            thescan = node.op
-            info = thescan.info.copy()
+            info = copy.deepcopy(thescan.info)
            info['gpu'] = True
            inputs = node.inputs
            nw_ins = [inputs[0]]
@@ -1527,41 +1529,9 @@ def gpuScanOptimization(node):
    return False


-@gof.local_optimizer([None])
-def gpu_scan_make_inplace(node):
-    op = node.op
-    if (isinstance(op, scan_op.Scan) and
-        (not op.info['inplace']) and
-        (op.info['gpu'])):
-        info = op.info.copy()
-        info['inplace'] = True
-        # inputs corresponding to sequences and n_steps
-        ls_begin = node.inputs[:1 + op.n_seqs]
-        ls = op.outer_mitmot(node)
-        ls += op.outer_mitsot(node)
-        ls += op.outer_sitsot(node)
-        ls_end = op.outer_shared(node)
-        ls_end += op.outer_nitsot(node)
-        ls_end += op.outer_non_seqs(node)
-        n_outs = len(ls)
-        for idx in xrange(n_outs):
-            if ls[idx] in ls[:idx]:
-                ls[idx] = compile.function_module.deep_copy_op(ls[idx])
-
-        inputs = ls_begin + ls + ls_end
-
-        typeConstructor = lambda broadcastable, dtype: CudaNdarrayType(
-                broadcastable=broadcastable)
-        new_op = scan_op.Scan(op.inputs,
-                              op.outputs,
-                              info,
-                              typeConstructor=typeConstructor)
-        return new_op.make_node(*inputs).outputs
-    return False
-
 optdb.register('gpu_scanOp_make_inplace',
-               theano.tensor.opt.in2out(
-                   gpu_scan_make_inplace, ignore_newtrees=True),
+               scan_opt.ScanInplaceOptimizer(typeConstructor=CudaNdarrayType,
+                                            gpu_flag=True),
               75,
               'gpu',
               'fast_run',

--- a/theano/sandbox/raw_scan.py
+++ b/theano/sandbox/raw_scan.py
--- a/theano/scan_module/scan.py
+++ b/theano/scan_module/scan.py
@@ -949,7 +949,8 @@ def scan(fn,
    info['truncate_gradient'] = truncate_gradient
    info['name'] = name
    info['mode'] = mode
-    info['inplace'] = False
+    info['inplace'] = -1
+    info['destroy_map'] = {}
    info['gpu'] = False
    info['as_while'] = as_while
    info['profile'] = profile

--- a/theano/scan_module/scan_op.py
+++ b/theano/scan_module/scan_op.py
@@ -94,12 +94,6 @@ class Scan(PureOp):

        if self.as_while:
            self.output_types = self.output_types[:-1]
-        self.destroy_map = {}
-
-        if hasattr(self, 'inplace') and self.inplace:
-            for idx in xrange(self.n_mit_mot + self.n_mit_sot +
-                              self.n_sit_sot):
-                self.destroy_map[idx] = [idx + 1 + self.n_seqs]

        mode_instance = compile.mode.get_mode(self.mode)
        # if the default mode is used, and that mode is ProfileMode
@@ -411,12 +405,22 @@ class Scan(PureOp):
            name = 'do_while'
        else:
            name = 'for'
-
-        if self.inplace:
-            aux_txt = '%s{inplace,%s,%s}' % (name, gpu_str, str(self.name))
+        aux_txt = '%s'
+        if len(self.destroy_map.keys()) > 0:
+            # Check if all outputs are inplace
+            if (sorted(self.destroy_map.keys()) == \
+               sorted(range(self.n_mit_mot +
+                            self.n_mit_sot +
+                            self.n_sit_sot))):
+                aux_txt += 'all_inplace,%s,%s}'
+            else:
+                aux_txt += '{inplace{'
+                for k in self.destroy_map.keys():
+                    aux_txt += str(k) + ','
+                aux_txt += '},%s,%s}'
        else:
-            aux_txt = '%s{%s,%s}' % (name, gpu_str, str(self.name))
-
+            aux_txt +='{%s,%s}'
+        aux_txt = aux_txt % (name, gpu_str, str(self.name))
        return aux_txt

    def __hash__(self):

--- a/theano/scan_module/scan_opt.py
+++ b/theano/scan_module/scan_opt.py
@@ -13,6 +13,7 @@ __copyright__ = "(c) 2010, Universite de Montreal"
 __contact__ = "Razvan Pascanu <r.pascanu@gmail>"

 import logging
+import copy
 import numpy

 import theano
@@ -20,6 +21,8 @@ from theano import tensor
 from theano.tensor import opt, get_constant_value
 from theano import gof
 from theano.gof.python25 import maxsize
+from theano.gof.opt import Optimizer
+from theano.gof import toolbox, DestroyHandler, InconsistencyError
 from theano.compile import optdb
 from theano.compile.function_module import deep_copy_op

@@ -117,7 +120,7 @@ def remove_constants_and_unused_inputs_scan(node):

    if len(nw_inner) != len(op_ins):
        op_outs = scan_utils.clone(op_outs, replace=givens)
-        nw_info = op.info.copy()
+        nw_info = copy.deepcopy(op.info)
        nw_info['n_seqs'] = nw_n_seqs
        # DEBUG CHECK
        nwScan = scan_op.Scan(nw_inner, op_outs, nw_info)
@@ -304,36 +307,68 @@ scan_seqopt.register('scanOp_pushout_nonseqs_ops',
                     'scan')


-@gof.local_optimizer([None])
-def scan_make_inplace(node):
-    op = node.op
-    if (isinstance(op, scan_op.Scan) and
-        (not op.info['inplace']) and
-        (not op.info['gpu'])):
-        info = op.info.copy()
-        info['inplace'] = True
-        # inputs corresponding to sequences and n_steps
-        ls_begin = node.inputs[:1 + op.n_seqs]
-        ls = op.outer_mitmot(node.inputs)
-        ls += op.outer_mitsot(node.inputs)
-        ls += op.outer_sitsot(node.inputs)
-        ls_end = op.outer_shared(node.inputs)
-        ls_end += op.outer_nitsot(node.inputs)
-        ls_end += op.outer_non_seqs(node.inputs)
-        n_outs = len(ls)
-        for idx in xrange(n_outs):
-            if ls[idx] in ls[:idx]:
-                ls[idx] = deep_copy_op(ls[idx])
-
-        inputs = ls_begin + ls + ls_end
-        new_op = scan_op.Scan(op.inputs,
-                              op.outputs,
-                              info)
-        return new_op.make_node(*inputs).outputs
-    return False
+
+class ScanInplaceOptimizer(Optimizer):
+    """Graph optimizer for Scan(makes it run inplace)"""
+    def __init__(self, typeConstructor=None, gpu_flag=False):
+        Optimizer.__init__(self)
+        self.typeConstructor = typeConstructor  # forwarded to scan_op.Scan when the op is rebuilt in apply
+        self.gpu_flag = gpu_flag  # apply only touches Scan ops whose info['gpu'] equals this flag
+
+    def add_requirements(self, env):
+        env.extend(toolbox.ReplaceValidate())  # presumably provides env.replace_all_validate used in apply -- confirm
+        env.extend(DestroyHandler())  # NOTE(review): presumably what rejects invalid destroy maps -- confirm
+
+    def apply(self, env):
+
+        nodes = env.toposort()
+        scan_nodes = [x for x in nodes
+                      if (isinstance(x.op, scan_op.Scan) and
+                         x.op.info['gpu']== self.gpu_flag)]  # keep only Scan nodes matching this optimizer's gpu flag
+        for scan_idx in xrange(len(scan_nodes)):
+            node = scan_nodes[scan_idx]
+            op = node.op
+            n_outs = (op.info['n_mit_mot'] +
+                      op.info['n_mit_sot'] +
+                      op.info['n_sit_sot'])  # number of inplace candidates: mit_mot + mit_sot + sit_sot outputs
+            for pos in xrange(n_outs):  # try the candidates one at a time instead of all-or-nothing
+                info = copy.deepcopy(op.info)  # deep copy so the nested destroy_map dict is not shared with op.info
+                if not 'destroy_map' in info:
+                    info['destroy_map'] = {}
+                info['destroy_map'][pos] = [pos + 1 + op.info['n_seqs']]  # output pos -> the pos-th input after n_steps and the sequences
+                # Rebuild the outer input list: n_steps and sequences first (ls_begin), then the candidate inputs (ls), then the rest (ls_end)
+                ls_begin = node.inputs[:1 + op.n_seqs]
+                ls = op.outer_mitmot(node.inputs)
+                ls += op.outer_mitsot(node.inputs)
+                ls += op.outer_sitsot(node.inputs)
+                ls_end = op.outer_shared(node.inputs)
+                ls_end += op.outer_nitsot(node.inputs)
+                ls_end += op.outer_non_seqs(node.inputs)
+                n_outs = len(ls)  # NOTE(review): rebinds the outer n_outs; the running xrange(n_outs) above is unaffected
+                for idx in xrange(n_outs):
+                    if ls[idx] in ls[:idx]:  # same variable already appears at an earlier position
+                        ls[idx] = deep_copy_op(ls[idx])  # use a copy for the repeated occurrence
+
+                inputs = ls_begin + ls + ls_end
+                new_op = scan_op.Scan(op.inputs,
+                                      op.outputs,
+                                      info,
+                                      typeConstructor=self.typeConstructor)
+
+                new_outs = new_op.make_node(*inputs).outputs
+                try:
+                    env.replace_all_validate(
+                        zip(node.outputs, new_outs),
+                        reason=self.__class__.__name__)
+                    op = new_op  # replacement accepted: later positions start from this op's info
+                    node = new_outs[0].owner  # continue from the node that now produces the outputs
+                except InconsistencyError, e:
+                    # Making output `pos` inplace was rejected; keep the previous op/node and try the next position
+                    pass

 optdb.register('scanOp_make_inplace',
-               opt.in2out(scan_make_inplace, ignore_newtrees=True),
+               ScanInplaceOptimizer(typeConstructor=None,
+                                   gpu_flag=False),
               75,
               'fast_run',
               'inplace',

--- a/theano/scan_module/tests/test_scan.py
+++ b/theano/scan_module/tests/test_scan.py
@@ -775,8 +775,11 @@ class T_Scan(unittest.TestCase):
                             updates=updates,
                             mode=mode,
                             allow_input_downcast=True)
-
-       # compute output in numpy
+        scan_node = [x for x in f9.maker.env.toposort()
+                     if isinstance(x.op, theano.scan_module.scan_op.Scan)]
+        assert 0 in scan_node[0].op.destroy_map.keys()
+        assert 1 in scan_node[0].op.destroy_map.keys()
+        # compute output in numpy
        numpy_x0 = numpy.zeros((3,))
        numpy_x1 = numpy.zeros((3,))
        numpy_x0[0] = vu0[0] * vW_in + vx0 * vW + vu1[0] * vu2[0]
@@ -852,6 +855,10 @@ class T_Scan(unittest.TestCase):
                             mode=mode,
                             allow_input_downcast=True)

+        scan_node = [x for x in f9.maker.env.toposort()
+                     if isinstance(x.op, theano.scan_module.scan_op.Scan)]
+        assert 0 in scan_node[0].op.destroy_map.keys()
+        assert 1 in scan_node[0].op.destroy_map.keys()
       # compute output in numpy
        numpy_x0 = numpy.zeros((3,))
        numpy_x1 = numpy.zeros((3,))

--- a/theano/tensor/opt2.py
+++ b/theano/tensor/opt2.py