Merge pull request #620 from pascanur/better_inplace_scan3

Better inplace scan3

Merge pull request #620 from pascanur/better_inplace_scan3
8cbf087d · nouiz · 7fa8e678 · 1fa097dc · 8cbf087d · 8cbf087d
--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
 import logging
 _logger = logging.getLogger('theano.sandbox.cuda.opt')

+import copy
 import sys
 import warnings

 import numpy

 import theano
+from theano.scan_module import scan_utils, scan_op, scan_opt
 from theano import scalar as scal
 from theano import tensor, compile, gof

 from theano.compile import optdb
 from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB,
                        Optimizer, toolbox, DestroyHandler,
-                        EquilibriumOptimizer)
+                        InconsistencyError, EquilibriumOptimizer)
 from theano.gof.python25 import all, any
 from theano.sandbox.cuda.basic_ops import *
 from theano.sandbox.cuda.type import CudaNdarrayType
@@ -1431,7 +1433,7 @@ def gpuScanOptimization(node):
            # merged or implement this optimization as a global
            # optimization
            thescan = host_input.owner.op
-            info = thescan.info.copy()
+            info = copy.deepcopy(thescan.info)
            info['gpu'] = True
            inputs = host_input.owner.inputs
            nw_ins = [inputs[0]]
@@ -1478,7 +1480,7 @@ def gpuScanOptimization(node):
                      for i in node.inputs]):

            thescan = node.op
-            info = thescan.info.copy()
+            info = copy.deepcopy(thescan.info)
            info['gpu'] = True
            inputs = node.inputs
            nw_ins = [inputs[0]]
@@ -1512,11 +1514,10 @@ def gpuScanOptimization(node):
            typeConstructor = lambda broadcastable, dtype: CudaNdarrayType(
                    broadcastable=broadcastable)
            _outputs = scan_op.Scan(
-                    scan_ins,
-                    scan_outs,
-                    info,
-                    typeConstructor=typeConstructor).make_node(
-                        *nw_ins).outputs
+                scan_ins,
+                scan_outs,
+                info,
+                typeConstructor=typeConstructor).make_node(*nw_ins).outputs
            outputs = []
            for x, y in zip(_outputs, node.outputs):
                if isinstance(y.type, CudaNdarrayType):
@@ -1527,41 +1528,9 @@ def gpuScanOptimization(node):
    return False


-@gof.local_optimizer([None])
-def gpu_scan_make_inplace(node):
-    op = node.op
-    if (isinstance(op, scan_op.Scan) and
-        (not op.info['inplace']) and
-        (op.info['gpu'])):
-        info = op.info.copy()
-        info['inplace'] = True
-        # inputs corresponding to sequences and n_steps
-        ls_begin = node.inputs[:1 + op.n_seqs]
-        ls = op.outer_mitmot(node)
-        ls += op.outer_mitsot(node)
-        ls += op.outer_sitsot(node)
-        ls_end = op.outer_shared(node)
-        ls_end += op.outer_nitsot(node)
-        ls_end += op.outer_non_seqs(node)
-        n_outs = len(ls)
-        for idx in xrange(n_outs):
-            if ls[idx] in ls[:idx]:
-                ls[idx] = compile.function_module.deep_copy_op(ls[idx])
-
-        inputs = ls_begin + ls + ls_end
-
-        typeConstructor = lambda broadcastable, dtype: CudaNdarrayType(
-                broadcastable=broadcastable)
-        new_op = scan_op.Scan(op.inputs,
-                              op.outputs,
-                              info,
-                              typeConstructor=typeConstructor)
-        return new_op.make_node(*inputs).outputs
-    return False
-
 optdb.register('gpu_scanOp_make_inplace',
-               theano.tensor.opt.in2out(
-                   gpu_scan_make_inplace, ignore_newtrees=True),
+               scan_opt.ScanInplaceOptimizer(typeConstructor=CudaNdarrayType,
+                                            gpu_flag=True),
               75,
               'gpu',
               'fast_run',

--- a/theano/scan_module/scan.py
+++ b/theano/scan_module/scan.py
@@ -949,7 +949,8 @@ def scan(fn,
    info['truncate_gradient'] = truncate_gradient
    info['name'] = name
    info['mode'] = mode
-    info['inplace'] = False
+    info['inplace'] = -1
+    info['destroy_map'] = {}
    info['gpu'] = False
    info['as_while'] = as_while
    info['profile'] = profile

--- a/theano/scan_module/scan_op.py
+++ b/theano/scan_module/scan_op.py
@@ -94,12 +94,6 @@ class Scan(PureOp):

        if self.as_while:
            self.output_types = self.output_types[:-1]
-        self.destroy_map = {}
-
-        if hasattr(self, 'inplace') and self.inplace:
-            for idx in xrange(self.n_mit_mot + self.n_mit_sot +
-                              self.n_sit_sot):
-                self.destroy_map[idx] = [idx + 1 + self.n_seqs]

        mode_instance = compile.mode.get_mode(self.mode)
        # if the default mode is used, and that mode is ProfileMode
@@ -411,12 +405,24 @@ class Scan(PureOp):
            name = 'do_while'
        else:
            name = 'for'
-
-        if self.inplace:
-            aux_txt = '%s{inplace,%s,%s}' % (name, gpu_str, str(self.name))
+        aux_txt = '%s'
+        if getattr(self, 'destroy_map', None) is None:
+            self.destroy_map = {}
+        if len(self.destroy_map.keys()) > 0:
+            # Check if all outputs are inplace
+            if (sorted(self.destroy_map.keys()) == \
+               sorted(range(self.n_mit_mot +
+                            self.n_mit_sot +
+                            self.n_sit_sot))):
+                aux_txt += 'all_inplace,%s,%s}'
+            else:
+                aux_txt += '{inplace{'
+                for k in self.destroy_map.keys():
+                    aux_txt += str(k) + ','
+                aux_txt += '},%s,%s}'
        else:
-            aux_txt = '%s{%s,%s}' % (name, gpu_str, str(self.name))
-
+            aux_txt +='{%s,%s}'
+        aux_txt = aux_txt % (name, gpu_str, str(self.name))
        return aux_txt

    def __hash__(self):

--- a/theano/scan_module/scan_opt.py
+++ b/theano/scan_module/scan_opt.py
@@ -13,6 +13,7 @@ __copyright__ = "(c) 2010, Universite de Montreal"
 __contact__ = "Razvan Pascanu <r.pascanu@gmail>"

 import logging
+import copy
 import numpy

 import theano
@@ -20,6 +21,8 @@ from theano import tensor
 from theano.tensor import opt, get_constant_value
 from theano import gof
 from theano.gof.python25 import maxsize
+from theano.gof.opt import Optimizer
+from theano.gof import toolbox, DestroyHandler, InconsistencyError
 from theano.compile import optdb
 from theano.compile.function_module import deep_copy_op

@@ -117,7 +120,7 @@ def remove_constants_and_unused_inputs_scan(node):

    if len(nw_inner) != len(op_ins):
        op_outs = scan_utils.clone(op_outs, replace=givens)
-        nw_info = op.info.copy()
+        nw_info = copy.deepcopy(op.info)
        nw_info['n_seqs'] = nw_n_seqs
        # DEBUG CHECK
        nwScan = scan_op.Scan(nw_inner, op_outs, nw_info)
@@ -304,36 +307,67 @@ scan_seqopt.register('scanOp_pushout_nonseqs_ops',
                     'scan')


-@gof.local_optimizer([None])
-def scan_make_inplace(node):
-    op = node.op
-    if (isinstance(op, scan_op.Scan) and
-        (not op.info['inplace']) and
-        (not op.info['gpu'])):
-        info = op.info.copy()
-        info['inplace'] = True
-        # inputs corresponding to sequences and n_steps
-        ls_begin = node.inputs[:1 + op.n_seqs]
-        ls = op.outer_mitmot(node.inputs)
-        ls += op.outer_mitsot(node.inputs)
-        ls += op.outer_sitsot(node.inputs)
-        ls_end = op.outer_shared(node.inputs)
-        ls_end += op.outer_nitsot(node.inputs)
-        ls_end += op.outer_non_seqs(node.inputs)
-        n_outs = len(ls)
-        for idx in xrange(n_outs):
-            if ls[idx] in ls[:idx]:
-                ls[idx] = deep_copy_op(ls[idx])
-
-        inputs = ls_begin + ls + ls_end
-        new_op = scan_op.Scan(op.inputs,
-                              op.outputs,
-                              info)
-        return new_op.make_node(*inputs).outputs
-    return False
+class ScanInplaceOptimizer(Optimizer):
+    """Graph optimizer that makes Scan nodes compute their outputs inplace.
+
+    Every Scan node whose ``info['gpu']`` equals ``gpu_flag`` is visited.
+    Its mit_mot, mit_sot and sit_sot outputs are tried one at a time: a new
+    Scan op whose ``destroy_map`` also covers that output is built and
+    substituted for the current node.  When the substitution raises
+    ``InconsistencyError`` it is abandoned, leaving that output computed
+    out of place, and the next output is tried.
+
+    :param typeConstructor: passed unchanged as ``typeConstructor`` to
+        every ``scan_op.Scan`` built by this optimizer.
+    :param gpu_flag: value that ``op.info['gpu']`` must have for a Scan
+        node to be processed.
+    """
+
+    def __init__(self, typeConstructor=None, gpu_flag=False):
+        Optimizer.__init__(self)
+        self.typeConstructor = typeConstructor
+        self.gpu_flag = gpu_flag
+
+    def add_requirements(self, env):
+        """Attach to `env` the features that `apply` depends on."""
+        # NOTE(review): apply() calls env.replace_all_validate and expects
+        # InconsistencyError for an invalid inplace substitution;
+        # presumably these two features provide that -- confirm against
+        # theano.gof.toolbox / theano.gof.destroyhandler.
+        env.extend(toolbox.ReplaceValidate())
+        env.extend(DestroyHandler())
+
+    def apply(self, env):
+        """Try to make the recurrent outputs of the Scan nodes inplace.
+
+        :param env: the graph container to optimize; it is modified in
+            place through ``env.replace_all_validate``.
+        """
+        scan_nodes = [x for x in env.toposort()
+                      if (isinstance(x.op, scan_op.Scan) and
+                          x.op.info['gpu'] == self.gpu_flag)]
+        for node in scan_nodes:
+            op = node.op
+            n_outs = (op.info['n_mit_mot'] +
+                      op.info['n_mit_sot'] +
+                      op.info['n_sit_sot'])
+            # Outputs are handled one by one so that an output which
+            # cannot be made inplace does not prevent the others from
+            # being tried.
+            for pos in xrange(n_outs):
+                info = copy.deepcopy(op.info)
+                if 'destroy_map' not in info:
+                    info['destroy_map'] = {}
+                # Output `pos` is mapped to outer input
+                # `pos + 1 + n_seqs`: the offset skips n_steps and the
+                # sequences, which come first in node.inputs.
+                info['destroy_map'][pos] = [pos + 1 + op.info['n_seqs']]
+                # inputs corresponding to sequences and n_steps
+                ls_begin = node.inputs[:1 + op.n_seqs]
+                ls = op.outer_mitmot(node.inputs)
+                ls += op.outer_mitsot(node.inputs)
+                ls += op.outer_sitsot(node.inputs)
+                ls_end = op.outer_shared(node.inputs)
+                ls_end += op.outer_nitsot(node.inputs)
+                ls_end += op.outer_non_seqs(node.inputs)
+                # A variable that already appears earlier in `ls` is
+                # replaced by a deep copy, so that no two of these inputs
+                # are the same variable.
+                for idx, var in enumerate(ls):
+                    if var in ls[:idx]:
+                        ls[idx] = deep_copy_op(var)
+
+                inputs = ls_begin + ls + ls_end
+                new_op = scan_op.Scan(op.inputs,
+                                      op.outputs,
+                                      info,
+                                      typeConstructor=self.typeConstructor)
+
+                new_outs = new_op.make_node(*inputs).outputs
+                try:
+                    env.replace_all_validate(
+                        zip(node.outputs, new_outs),
+                        reason=self.__class__.__name__)
+                except InconsistencyError:
+                    # Failed moving this output to be computed inplace;
+                    # keep the current node and try the next output.
+                    continue
+                # The substitution was accepted: the following outputs
+                # are tried on top of the node that is now in the graph.
+                op = new_op
+                node = new_outs[0].owner

 optdb.register('scanOp_make_inplace',
-               opt.in2out(scan_make_inplace, ignore_newtrees=True),
+               ScanInplaceOptimizer(typeConstructor=None,
+                                   gpu_flag=False),
               75,
               'fast_run',
               'inplace',

--- a/theano/scan_module/tests/test_scan.py
+++ b/theano/scan_module/tests/test_scan.py
@@ -775,8 +775,11 @@ class T_Scan(unittest.TestCase):
                             updates=updates,
                             mode=mode,
                             allow_input_downcast=True)
-
-       # compute output in numpy
+        scan_node = [x for x in f9.maker.env.toposort()
+                     if isinstance(x.op, theano.scan_module.scan_op.Scan)]
+        assert 0 in scan_node[0].op.destroy_map.keys()
+        assert 1 in scan_node[0].op.destroy_map.keys()
+        # compute output in numpy
        numpy_x0 = numpy.zeros((3,))
        numpy_x1 = numpy.zeros((3,))
        numpy_x0[0] = vu0[0] * vW_in + vx0 * vW + vu1[0] * vu2[0]
@@ -852,6 +855,10 @@ class T_Scan(unittest.TestCase):
                             mode=mode,
                             allow_input_downcast=True)

+        scan_node = [x for x in f9.maker.env.toposort()
+                     if isinstance(x.op, theano.scan_module.scan_op.Scan)]
+        assert 0 in scan_node[0].op.destroy_map.keys()
+        assert 1 in scan_node[0].op.destroy_map.keys()
       # compute output in numpy
        numpy_x0 = numpy.zeros((3,))
        numpy_x1 = numpy.zeros((3,))
@@ -880,6 +887,34 @@ class T_Scan(unittest.TestCase):
        #assert not numpy.allclose( theano_x0 , vu2[1:4])
        #assert numpy.allclose( theano_x1 , vu1[0:3])

+    def test_inplace3(self):
+        """Check that scan outputs are made inplace independently.
+
+        A two-output scan over shared variables is built, then one
+        variable feeding the first output is swapped for a constant
+        (presumably its initial-state buffer -- confirm against the graph
+        that theano.scan builds).  After compiling with the 'inplace'
+        optimizations, output 0 must be absent from the Scan op's
+        destroy_map while output 1 must be present.
+        """
+        rng = numpy.random.RandomState(utt.fetch_seed())
+
+        vx0 = asarrayX(rng.uniform())
+        vx1 = asarrayX(rng.uniform())
+        shared_x0 = theano.shared(vx0)
+        shared_x1 = theano.shared(vx1)
+
+        def add_one(x, y):
+            # One step of the recurrence: increment both states by one.
+            return (x + asarrayX(1), y + asarrayX(1))
+
+        outputs, updates = theano.scan(add_one,
+                                       [],
+                                       [shared_x0, shared_x1],
+                                       n_steps=3)
+
+        # Constant of length 3 whose first entry is the initial value vx0.
+        init_x0 = asarrayX(numpy.zeros((3,)))
+        init_x0[0] = vx0
+        const_x0 = theano.tensor.constant(init_x0)
+
+        # Variable two owners up from the first output; it is the one
+        # swapped for the constant above.
+        to_replace = outputs[0].owner.inputs[0].owner.inputs[1]
+        outputs = theano.clone(outputs,
+                               replace={to_replace: const_x0})
+
+        mode = theano.compile.mode.get_mode(None).including('inplace')
+        fn = theano.function([],
+                             outputs,
+                             updates=updates,
+                             mode=mode)
+
+        scan_nodes = [x for x in fn.maker.env.toposort()
+                      if isinstance(x.op, theano.scan_module.scan_op.Scan)]
+        destroy_map = scan_nodes[0].op.destroy_map
+        assert 0 not in destroy_map
+        assert 1 in destroy_map
+
    # Shared variable with updates
    def test_shared_arguments_with_updates(self):
        rng = numpy.random.RandomState(utt.fetch_seed())