Speedup Scan's cython backend

752661f6 · carriepl · b66e7c56 · 752661f6 · 752661f6 · 752661f6
--- a/theano/scan_module/scan_perform.c
+++ b/theano/scan_module/scan_perform.c
--- a/theano/scan_module/scan_perform.pyx
+++ b/theano/scan_module/scan_perform.pyx
@@ -62,7 +62,7 @@ import copy


 def get_version():
-    return 0.287
+    return 0.288

 @cython.boundscheck(False)
 def perform(
@@ -195,8 +195,6 @@ def perform(
    cdef unsigned int len_output_storage = (n_mit_mot_outs + n_mit_sot +
                                            n_sit_sot + n_nit_sot +
                                            n_shared_outs)
-    cdef int input_reused[500] # max 500 inputs
-    cdef int output_reused[500] # max 500 outputs


    if n_steps < 0:
@@ -256,9 +254,11 @@ def perform(
    offset = nit_sot_arg_offset + n_nit_sot
    other_args = args[offset:]
    input_storage = fnct.input_storage
-    len_input_storage = len(input_storage)
-    old_input_storage = [None] * len_input_storage
-    old_input_data = [None] * len_input_storage
+    nb_mitmot_in = 0
+    for idx in range(n_mit_mot):
+        nb_mitmot_in += tap_array_len[idx]
+    old_mitmot_input_storage = [None] * nb_mitmot_in
+    old_mitmot_input_data = [None] * nb_mitmot_in
    output_storage = fnct.output_storage
    old_output_storage = [None] * len_output_storage
    old_output_data = [None] * len_output_storage
@@ -366,21 +366,21 @@ def perform(
                old_output_data[idx] = None

        # 4.6. Keep a reference to the variables (ndarrays, CudaNdarrays,
-        # etc) currently in the input_storage to be able to compare them
-        # with the content of the input_storage after the execution of the
-        # function. Also keep pointers to their data to be able to detect
-        # cases where outputs reused the allocated object but alter the
-        # memory region they refer to.
-        for idx in xrange(len(input_storage)):
-            var = input_storage[idx].storage[0]
-            old_input_storage[idx] = var
+        # etc) associated with mitmot inputs currently in the input_storage to
+        # be able to compare them with the content of the input_storage after
+        # the execution of the function. Also keep pointers to their data to
+        # be able to detect cases where outputs reused the allocated object
+        # but alter the memory region they refer to.
+        for idx in xrange(nb_mitmot_in):
+            var = input_storage[idx + n_seqs].storage[0]
+            old_mitmot_input_storage[idx] = var

            if hasattr(var, 'gpudata'):
-                old_input_data[idx] = var.gpudata
+                old_mitmot_input_data[idx] = var.gpudata
            elif hasattr(var, 'data'):
-                old_input_data[idx] = var.data
+                old_mitmot_input_data[idx] = var.data
            else:
-                old_input_data[idx] = None
+                old_mitmot_input_data[idx] = None

        # 5.1 compute outputs
        t0_fn = time.time()
@@ -415,66 +415,35 @@ def perform(
                    storage.data = output_storage[offset_out].data
                    offset_out -= 1

-        # 5.3. Check which of the pre-allocated outputs (if applicable)
-        # have been reused by the inner function
-        for idx in range(len_output_storage):
-            # If the storage map does not contain the same object, then
-            # the pre-allocated output has not been reused
-            new_var = output_storage[idx].storage[0]
-            if old_output_storage[idx] is new_var:
-
-                # The pre-allocated output is only considered as having
-                # been reused if it still points to the same data as it
-                # did before the execution of the inner function
-                if old_output_data[idx] is None:
-                    output_reused[idx] = False
-                else:
-                    if hasattr(new_var, 'gpudata'):
-                        output_reused[idx] = (new_var.gpudata ==
-                                              old_output_data[idx])
-                    elif hasattr(new_var, 'data'):
-                        output_reused[idx] = (new_var.data ==
-                                              old_output_data[idx])
-            else:
-                output_reused[idx] = False
-
-        # 5.4. Check which of the input storage have been modified by the
-        # inner function
-        for idx in xrange(len(input_storage)):
-            # If the storage map does not contain the same object, then
-            # the pre-allocated output has not been reused
-            new_var = input_storage[idx].storage[0]
-            if old_input_storage[idx] is new_var:
-
-                # The pre-allocated output is only considered as having
-                # been reused if it still points to the same data as it
-                # did before the execution of the inner function
-                if old_input_data[idx] is None:
-                    input_reused[idx] = False
-                else:
-                    if hasattr(new_var, 'gpudata'):
-                        input_reused[idx] = (new_var.gpudata ==
-                                             old_input_data[idx])
-                    elif hasattr(new_var, 'data'):
-                        input_reused[idx] = (new_var.data ==
-                                             old_input_data[idx])
-            else:
-                input_reused[idx] = False
-
        offset_out = 0
-        # 5.5 Copy over the values for mit_mot outputs
-        mitmot_inp_offset = self.n_seqs
+
+        # 5.3 Copy over the values for mit_mot outputs
+        mitmot_inp_offset = 0
        mitmot_out_idx = 0
        for j in xrange(self.n_mit_mot):
            for k in self.mit_mot_out_slices[j]:
                if mitmots_preallocated[<unsigned int>mitmot_out_idx]:
-                    # This output tap has been preallocated. If the
-                    # corresponding input storage has been replaced,
-                    # recover the value as usual. Otherwise, the input was
-                    # modified inplace and nothing needs to be done.
+                    # This output tap has been preallocated.
                    inp_idx = (mitmot_inp_offset +
                               self.tap_array[j].index(k))
-                    if not input_reused[inp_idx]:
+
+                    # Verify whether the input points to the same data as
+                    # it did before the execution of the inner function.
+                    old_var = old_mitmot_input_storage[inp_idx]
+                    new_var = input_storage[n_seqs + inp_idx].storage[0]
+                    if old_var is new_var:
+                        old_data = old_mitmot_input_data[inp_idx]
+                        if hasattr(new_var, 'gpudata'):
+                            same_data = (new_var.gpudata == old_data)
+                        elif hasattr(new_var, 'data'):
+                            same_data = (new_var.data == old_data)
+                    else:
+                        same_data = False
+
+                    # If the corresponding input storage has been replaced,
+                    # recover the value as usual. Otherwise, the input was
+                    # modified inplace and nothing needs to be done.
+                    if not same_data:
                        outs[j][0][<unsigned int>(k + pos[j])] = \
                            input_storage[<unsigned int>inp_idx].storage[0]

@@ -489,21 +458,52 @@ def perform(

            mitmot_inp_offset += len(self.tap_array[j])

-        # 5.6 Copy over the values for mit_sot/sit_sot outputs
+        # 5.4 Copy over the values for mit_sot/sit_sot outputs
        begin = n_mit_mot
        end   = n_outs
        offset_out -= n_mit_mot

        for j in range(begin, end):
-            if (store_steps[j] == 1 or vector_outs[j] == 1 or
-                not output_reused[<unsigned int>(offset_out+j)]):

+            # Check whether the initialization of the output storage map
+            # for this output has been reused.
+            old_var = old_output_storage[offset_out + j]
+            old_data = old_output_data[offset_out + j]
+            new_var = output_storage[offset_out + j].storage[0]
+            if old_var is new_var:
+                if old_data is None:
+                    output_reused = False
+                elif hasattr(new_var, 'gpudata'):
+                    output_reused = (new_var.gpudata == old_data)
+                elif hasattr(new_var, 'data'):
+                    output_reused = (new_var.data == old_data)
+            else:
+                output_reused = False
+
+            # Copy the output value to `outs`, if necessary
+            if store_steps[j] == 1 or vector_outs[j] == 1 or not output_reused:
                outs[j][0][pos[j]] = output_storage[<unsigned int>(offset_out+j)].storage[0]

-        # 5.7 Copy over the values for nit_sot outputs
+        # 5.5 Copy over the values for nit_sot outputs
        begin  = end
        end   += n_nit_sot
        for j in range(begin,end):
+
+            # Check whether the initialization of the output storage map
+            # for this output has been reused.
+            old_var = old_output_storage[offset_out + j]
+            old_data = old_output_data[offset_out + j]
+            new_var = output_storage[offset_out + j].storage[0]
+            if old_var is new_var:
+                if old_data is None:
+                    output_reused = False
+                elif hasattr(new_var, 'gpudata'):
+                    output_reused = (new_var.gpudata == old_data)
+                elif hasattr(new_var, 'data'):
+                    output_reused = (new_var.data == old_data)
+            else:
+                output_reused = False
+
            if i == 0:
                jout = j+offset_out
                shape = (store_steps[j],) + output_storage[jout].storage[0].shape
@@ -519,10 +519,10 @@ def perform(
                    outs[j][0] = outs[j][0][:store_steps[j]]
                outs[j][0][pos[j]] = output_storage[jout].storage[0]
            elif (store_steps[j] == 1 or vector_outs[j] == 1 or
-                  not output_reused[<unsigned int>(offset_out+j)]):
+                  not output_reused):
                outs[j][0][pos[j]] = output_storage[j+offset_out].storage[0]

-        # 5.8 Copy over the values for outputs corresponding to shared
+        # 5.6 Copy over the values for outputs corresponding to shared
        # variables
        begin  = end
        end   += n_shared_outs

--- a/theano/scan_module/scan_perform_ext.py
+++ b/theano/scan_module/scan_perform_ext.py
@@ -17,7 +17,7 @@ from theano.gof import cmodule
 _logger = logging.getLogger('theano.scan_module.scan_perform')


-version = 0.287  # must match constant returned in function get_version()
+version = 0.288  # must match constant returned in function get_version()

 need_reload = False