Commit 62ccf59f authored by Frédéric Bastien

Merge pull request #3656 from carriepl/scan_backend_speedup

Scan - Move costly checks from runtime to compilation
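In short, the change replaces per-iteration hasattr(..., 'gpudata') probes in Scan's inner loop with boolean flags computed once, when the inner function is compiled, from the types of the inner graph's inputs and outputs. The sketch below only summarizes that idea under the names visible in the diff; the helpers analyse_gpu_location and storage_identity are hypothetical and are not part of Theano.

# Illustrative sketch (not Theano's actual Scan implementation).
# Compile time: classify every inner-graph input/output once.
import theano.tensor

def analyse_gpu_location(fgraph):
    # A variable that is not a plain TensorVariable is assumed to live on
    # the GPU (e.g. a CudaNdarray variable).
    inps_on_gpu = [not isinstance(v, theano.tensor.TensorVariable)
                   for v in fgraph.inputs]
    outs_on_gpu = [not isinstance(v, theano.tensor.TensorVariable)
                   for v in fgraph.outputs]
    return inps_on_gpu, outs_on_gpu

# Run time: no hasattr() probing, just index the precomputed flag to pick
# the pointer used for the buffer-reuse comparison.
def storage_identity(var, on_gpu):
    if var is None:
        return None
    elif on_gpu:
        return var.gpudata   # device pointer of a CudaNdarray
    else:
        return var.data      # buffer of a numpy ndarray

At run time, the Python and Cython perform loops then test inps_on_gpu[...] / outs_on_gpu[...] instead of calling hasattr on every value at every step; that is what the hunks below implement.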
@@ -314,12 +314,23 @@ class Scan(PureOp):
         # Generate the mappings between inner and outer inputs and outputs
         # if they haven't already been generated.
         self.var_mappings = self.get_oinp_iinp_iout_oout_mappings()
-        if (hasattr(self, 'fn') and
-            not hasattr(self, 'thunk_mit_mot_out_slices')):
-            # The thunk has been compiled before mit_mot preallocation feature
-            # was implemented. Mark every mit_mot output tap as not having
-            # been preallocated
-            self.mitmots_preallocated = [False] * self.n_mit_mot_outs
+        if hasattr(self, 'fn'):
+            if not hasattr(self, 'thunk_mit_mot_out_slices'):
+                # The thunk has been compiled before mit_mot preallocation
+                # feature was implemented. Mark every mit_mot output tap as
+                # not having been preallocated
+                self.mitmots_preallocated = [False] * self.n_mit_mot_outs
+
+            if not hasattr(self, 'outs_on_gpu'):
+                # The thunk has been compiled before the analysis, at
+                # compilation time, of the location of the inputs and outputs.
+                # Perform this analysis here.
+                self.inps_on_gpu = [not isinstance(out,
+                                                   theano.tensor.TensorVariable)
+                                    for out in self.fn.maker.fgraph.inputs]
+                self.outs_on_gpu = [not isinstance(out,
+                                                   theano.tensor.TensorVariable)
+                                    for out in self.fn.maker.fgraph.outputs]

         # Ensure that the graph associated with the inner function is valid.
         self.validate_inner_graph()
@@ -858,6 +869,13 @@ class Scan(PureOp):
                            profile=profile,
                            on_unused_input='ignore')

+        # Analyse the compiled inner function to determine which inputs and
+        # outputs are on the gpu and speed up some checks during the execution
+        self.inps_on_gpu = [not isinstance(out, theano.tensor.TensorVariable)
+                            for out in self.fn.maker.fgraph.inputs]
+        self.outs_on_gpu = [not isinstance(out, theano.tensor.TensorVariable)
+                            for out in self.fn.maker.fgraph.outputs]
+
         try:
             cython_mintaps = numpy.asarray(self.mintaps, dtype='int32')
             cython_tap_array_len = \
@@ -894,6 +912,9 @@ class Scan(PureOp):
             cython_mitmots_preallocated = numpy.asarray(self.mitmots_preallocated,
                                                         dtype='int32')

+            cython_inps_on_gpu = numpy.asarray(self.inps_on_gpu, dtype='int32')
+            cython_outs_on_gpu = numpy.asarray(self.outs_on_gpu, dtype='int32')
+
             if hasattr(self, 'destroy_map'):
                 cython_destroy_map = [x in self.destroy_map
                                       for x in xrange(len(node.outputs))]
@@ -921,6 +942,8 @@ class Scan(PureOp):
                     cython_mit_mot_out_slices,
                     cython_mit_mot_out_nslices,
                     cython_mitmots_preallocated,
+                    cython_inps_on_gpu,
+                    cython_outs_on_gpu,
                     self.fn.fn,
                     self.fn,
                     cython_destroy_map,
@@ -1280,12 +1303,12 @@ class Scan(PureOp):
                 var = output_storage[idx].storage[0]
                 old_output_storage[idx] = var

-                if hasattr(var, 'gpudata'):
-                    old_output_data[idx] = var.gpudata
-                elif hasattr(var, 'data'):
-                    old_output_data[idx] = var.data
-                else:
-                    old_output_data[idx] = None
+                if var is None:
+                    old_output_data[idx] = None
+                elif self.outs_on_gpu[idx]:
+                    old_output_data[idx] = var.gpudata
+                else:
+                    old_output_data[idx] = var.data

             # 4.6. Keep a reference to the variables (ndarrays, CudaNdarrays,
             # etc) associated with mitmot inputs currently in the
@@ -1298,12 +1321,12 @@ class Scan(PureOp):
                 var = input_storage[idx + self.n_seqs].storage[0]
                 old_mitmot_input_storage[idx] = var

-                if hasattr(var, 'gpudata'):
-                    old_mitmot_input_data[idx] = var.gpudata
-                elif hasattr(var, 'data'):
-                    old_mitmot_input_data[idx] = var.data
-                else:
-                    old_mitmot_input_data[idx] = None
+                if var is None:
+                    old_mitmot_input_data[idx] = None
+                elif self.inps_on_gpu[idx]:
+                    old_mitmot_input_data[idx] = var.gpudata
+                else:
+                    old_mitmot_input_data[idx] = var.data

             # 5.1 compute outputs
             t0_fn = time.time()
@@ -1365,9 +1388,9 @@ class Scan(PureOp):
                     new_var = input_storage[self.n_seqs + inp_idx].storage[0]
                     if old_var is new_var:
                         old_data = old_mitmot_input_data[inp_idx]
-                        if hasattr(new_var, 'gpudata'):
+                        if self.inps_on_gpu[self.n_seqs + inp_idx]:
                             same_data = (new_var.gpudata == old_data)
-                        elif hasattr(new_var, 'data'):
+                        else:
                             same_data = (new_var.data == old_data)
                     else:
                         same_data = False
@@ -1411,9 +1434,9 @@ class Scan(PureOp):
                         old_data = old_output_data[offset_out + j]
                         if old_data is None:
                             output_reused = False
-                        elif hasattr(new_var, 'gpudata'):
+                        elif self.outs_on_gpu[offset_out + j]:
                             output_reused = (new_var.gpudata == old_data)
-                        elif hasattr(new_var, 'data'):
+                        else:
                             output_reused = (new_var.data == old_data)
                     else:
                         output_reused = False
@@ -1454,9 +1477,9 @@ class Scan(PureOp):
                 if old_var is new_var:
                     if old_data is None:
                         output_reused = False
-                    elif hasattr(new_var, 'gpudata'):
+                    elif self.outs_on_gpu[offset_out + j]:
                         output_reused = (new_var.gpudata == old_data)
-                    elif hasattr(new_var, 'data'):
+                    else:
                         output_reused = (new_var.data == old_data)
                 else:
                     output_reused = False
This source diff could not be displayed because it is too large.
@@ -62,7 +62,7 @@ import copy

 def get_version():
-    return 0.290
+    return 0.291


 @cython.boundscheck(False)
 def perform(
@@ -83,6 +83,8 @@ def perform(
             numpy.ndarray[numpy.int32_t,ndim=2] mit_mot_out_slices,
             numpy.ndarray[numpy.int32_t,ndim=1] mit_mot_out_nslices,
             numpy.ndarray[numpy.int32_t,ndim=1] mitmots_preallocated,
+            numpy.ndarray[numpy.int32_t,ndim=1] inps_on_gpu,
+            numpy.ndarray[numpy.int32_t,ndim=1] outs_on_gpu,
             fn,
             fnct,
             numpy.ndarray[numpy.int32_t,ndim=1] destroy_map,
@@ -136,6 +138,12 @@ def perform(
     mit_mot_out_nslices: int32 ndarray (Can be replaced by a list)
         Same as tap_array_len, but is the number of output taps of the
         mit_mot sequences (i.e. it corresponds to mit_mot_out_slices)
+    inps_on_gpu : int32 ndarray (Can be replaced by a list)
+        Array of booleans indicating, for every input, whether it is on the
+        GPU or not
+    outs_on_gpu : int32 ndarray (Can be replaced by a list)
+        Array of booleans indicating, for every output, whether it is on the
+        GPU or not
     fn: callable
         This is the linker, i.e. the function that will loop over the
         computational graph and call the perform of each operation. For this
@@ -358,12 +366,12 @@ def perform(
             var = output_storage[idx].storage[0]
             old_output_storage[idx] = var

-            if hasattr(var, 'gpudata'):
-                old_output_data[idx] = var.gpudata
-            elif hasattr(var, 'data'):
-                old_output_data[idx] = var.data
-            else:
-                old_output_data[idx] = None
+            if var is None:
+                old_output_data[idx] = None
+            elif outs_on_gpu[idx]:
+                old_output_data[idx] = var.gpudata
+            else:
+                old_output_data[idx] = var.data

         # 4.6. Keep a reference to the variables (ndarrays, CudaNdarrays,
         # etc) associated with mitmot inputs currently in the input_storage to
@@ -375,12 +383,12 @@ def perform(
             var = input_storage[idx + n_seqs].storage[0]
             old_mitmot_input_storage[idx] = var

-            if hasattr(var, 'gpudata'):
-                old_mitmot_input_data[idx] = var.gpudata
-            elif hasattr(var, 'data'):
-                old_mitmot_input_data[idx] = var.data
-            else:
-                old_mitmot_input_data[idx] = None
+            if var is None:
+                old_mitmot_input_data[idx] = None
+            elif inps_on_gpu[idx]:
+                old_mitmot_input_data[idx] = var.gpudata
+            else:
+                old_mitmot_input_data[idx] = var.data

         # 5.1 compute outputs
         t0_fn = time.time()
@@ -442,9 +450,9 @@ def perform(
                 new_var = input_storage[n_seqs + inp_idx].storage[0]
                 if old_var is new_var:
                     old_data = old_mitmot_input_data[inp_idx]
-                    if hasattr(new_var, 'gpudata'):
+                    if inps_on_gpu[n_seqs + inp_idx]:
                         same_data = (new_var.gpudata == old_data)
-                    elif hasattr(new_var, 'data'):
+                    else:
                         same_data = (new_var.data == old_data)
                 else:
                     same_data = False
@@ -486,9 +494,9 @@ def perform(
                 if old_var is new_var:
                     if old_data is None:
                         output_reused = False
-                    elif hasattr(new_var, 'gpudata'):
+                    elif outs_on_gpu[offset_out + j]:
                         output_reused = (new_var.gpudata == old_data)
-                    elif hasattr(new_var, 'data'):
+                    else:
                         output_reused = (new_var.data == old_data)
                 else:
                     output_reused = False
@@ -528,9 +536,9 @@ def perform(
                 if old_var is new_var:
                     if old_data is None:
                         output_reused = False
-                    elif hasattr(new_var, 'gpudata'):
+                    elif outs_on_gpu[offset_out + j]:
                         output_reused = (new_var.gpudata == old_data)
-                    elif hasattr(new_var, 'data'):
+                    else:
                         output_reused = (new_var.data == old_data)
                 else:
                     output_reused = False
@@ -17,7 +17,7 @@ from theano.gof import cmodule

 _logger = logging.getLogger('theano.scan_module.scan_perform')

-version = 0.290 # must match constant returned in function get_version()
+version = 0.291 # must match constant returned in function get_version()

 need_reload = False
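The two bumps (0.290 to 0.291 in get_version() above and in the version constant here) have to move together: the constant is presumably compared against the cached compiled module's get_version() so that the Cython extension is rebuilt after a signature change such as the two new array arguments. A hypothetical sketch of that kind of gate, not Theano's actual loading code:

# Hypothetical sketch of a version gate for a cached compiled module.
version = 0.291  # must match the value returned by the module's get_version()

def module_is_stale(compiled_module):
    # Rebuild if the cached extension was built from an older .pyx.
    return getattr(compiled_module, 'get_version', lambda: None)() != version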