提交 539ca7eb authored 作者: abergeron's avatar abergeron 提交者: GitHub

Merge pull request #4831 from nouiz/compiledir_format

Add device in the compiledir_format
......@@ -1769,12 +1769,13 @@ class _Linker(gof.link.LocalLinker):
if schedule:
self.schedule = schedule
def accept(self, fgraph, no_recycling=None):
def accept(self, fgraph, no_recycling=None, profile=None):
if no_recycling is None:
no_recycling = []
if self.fgraph is not None and self.fgraph is not fgraph:
assert type(self) is _Linker
return type(self)(maker=self.maker).accept(fgraph, no_recycling)
return type(self)(maker=self.maker).accept(
fgraph, no_recycling, profile)
self.fgraph = fgraph
self.no_recycling = no_recycling
return self
......
......@@ -1500,9 +1500,10 @@ class FunctionMaker(object):
if not spec.borrow]
if no_borrow:
self.linker = linker.accept(
fgraph, no_recycling=infer_reuse_pattern(fgraph, no_borrow))
fgraph, no_recycling=infer_reuse_pattern(fgraph, no_borrow),
profile=profile)
else:
self.linker = linker.accept(fgraph)
self.linker = linker.accept(fgraph, profile=profile)
if hasattr(linker, 'accept_var_updates'):
# hacky thing so VMLinker knows about updates
......
......@@ -72,7 +72,8 @@ def _atexit_print_fn():
for ps in to_sum[1:]:
for attr in ["compile_time", "fct_call_time", "fct_callcount",
"vm_call_time", "optimizer_time", "linker_time",
"validate_time", "import_time"]:
"validate_time", "import_time",
"linker_node_make_thunks"]:
setattr(cum, attr, getattr(cum, attr) + getattr(ps, attr))
# merge dictionary
......@@ -190,6 +191,8 @@ class ProfileStats(object):
import_time = 0.0
# time spent in importing compiled python module.
linker_node_make_thunks = 0.0
line_width = config.profiling.output_line_width
nb_nodes = -1
......@@ -665,6 +668,8 @@ class ProfileStats(object):
print(' Theano Linker time (includes C, CUDA code '
'generation/compiling): %es' % self.linker_time, file=file)
print(' Import time %es' % self.import_time, file=file)
print(' Node make_thunk time %es' % self.linker_node_make_thunks,
file=file)
print('', file=file)
# The validation time is a subset of optimizer_time
......
......@@ -1630,6 +1630,8 @@ def short_platform(r=None, p=None):
return p
compiledir_format_dict['short_platform'] = short_platform()
# Allow to have easily one compiledir per device.
compiledir_format_dict['device'] = config.device
compiledir_format_keys = ", ".join(sorted(compiledir_format_dict.keys()))
default_compiledir_format = ("compiledir_%(short_platform)s-%(processor)s-"
"%(python_version)s-%(python_bitwidth)s")
......
......@@ -548,7 +548,7 @@ class CLinker(link.Linker):
if schedule:
self.schedule = schedule
def accept(self, fgraph, no_recycling=None):
def accept(self, fgraph, no_recycling=None, profile=None):
"""
Associate linker with fgraph
......@@ -557,7 +557,8 @@ class CLinker(link.Linker):
no_recycling = []
if self.fgraph is not None and self.fgraph is not fgraph:
# A linker can be tied to only one FunctionGraph.
return type(self)(self.schedule).accept(fgraph, no_recycling)
return type(self)(self.schedule).accept(
fgraph, no_recycling, profile)
self.fgraph = fgraph
self.fetch_variables()
self.no_recycling = no_recycling
......@@ -1737,7 +1738,7 @@ class OpWiseCLinker(link.LocalLinker):
if schedule:
self.schedule = schedule
def accept(self, fgraph, no_recycling=None):
def accept(self, fgraph, no_recycling=None, profile=None):
"""
Associate linker with fgraph
"""
......@@ -1750,7 +1751,7 @@ class OpWiseCLinker(link.LocalLinker):
allow_gc=self.allow_gc,
nice_errors=self.nice_errors,
schedule=self.schedule,
).accept(fgraph, no_recycling)
).accept(fgraph, no_recycling, profile)
self.fgraph = fgraph
self.no_recycling = no_recycling
return self
......@@ -1897,7 +1898,7 @@ class DualLinker(link.Linker):
if schedule:
self.schedule = schedule
def accept(self, fgraph, no_recycling=None):
def accept(self, fgraph, no_recycling=None, profile=None):
"""
Update/tie self with fgraph
"""
......@@ -1905,7 +1906,7 @@ class DualLinker(link.Linker):
no_recycling = []
if self.fgraph is not None and self.fgraph is not fgraph:
return type(self)(self.checker, self.schedule).accept(
fgraph, no_recycling)
fgraph, no_recycling, profile)
self.fgraph = fgraph
self.no_recycling = no_recycling
return self
......
......@@ -762,7 +762,7 @@ class PerformLinker(LocalLinker):
if schedule:
self.schedule = schedule
def accept(self, fgraph, no_recycling=None):
def accept(self, fgraph, no_recycling=None, profile=None):
"""
Parameters
......@@ -781,7 +781,8 @@ class PerformLinker(LocalLinker):
if no_recycling is None:
no_recycling = []
if self.fgraph is not None and self.fgraph is not fgraph:
return type(self)(allow_gc=self.allow_gc).accept(fgraph, no_recycling)
return type(self)(allow_gc=self.allow_gc).accept(
fgraph, no_recycling, profile)
# raise Exception("Cannot accept from a Linker that is already tied to another FunctionGraph.")
self.fgraph = fgraph
self.no_recycling = no_recycling
......@@ -944,7 +945,7 @@ class WrapLinker(Linker):
linkers=[l.clone(allow_gc=allow_gc) for l in self.linkers],
wrapper=self.wrapper)
def accept(self, fgraph, no_recycling=None):
def accept(self, fgraph, no_recycling=None, profile=None):
"""
Parameters
......
......@@ -731,7 +731,7 @@ class VM_Linker(link.LocalLinker):
if schedule:
self.schedule = schedule
def accept(self, fgraph, no_recycling=None):
def accept(self, fgraph, no_recycling=None, profile=None):
"""
Check if fgraph is the first FunctionGraph that has ever been
associated to self, else, create a new VM_Linker
......@@ -779,9 +779,11 @@ class VM_Linker(link.LocalLinker):
schedule=self.schedule,
c_thunks=self.c_thunks,
allow_partial_eval=self.allow_partial_eval
).accept(fgraph, no_recycling)
).accept(fgraph, no_recycling, profile)
self.fgraph = fgraph
self.no_recycling = no_recycling
self.profile = profile
return self
def accept_var_updates(self, updated_vars):
......@@ -1038,7 +1040,7 @@ class VM_Linker(link.LocalLinker):
reallocated_info = calculate_reallocate_info(
order, fgraph, storage_map, compute_map_re, dependencies)
t0 = time.time()
for node in order:
try:
if self.c_thunks is False:
......@@ -1056,6 +1058,11 @@ class VM_Linker(link.LocalLinker):
e.args = ("The following error happened while"
" compiling the node", node, "\n") + e.args
raise
t1 = time.time()
if self.profile:
self.profile.linker_node_make_thunks += t1 - t0
for node, thunk in zip(order, thunks):
thunk.inputs = [storage_map[v] for v in node.inputs]
thunk.outputs = [storage_map[v] for v in node.outputs]
......
......@@ -6,6 +6,7 @@ import pdb
import time
from six import iteritems
from six.moves import xrange
import sys
import theano
from theano import tensor, scalar, gof, config
......@@ -13,7 +14,6 @@ from theano.compile import optdb
from theano.compile.ops import shape_i
from theano.gof import (local_optimizer, EquilibriumDB, TopoOptimizer,
SequenceDB, Optimizer, DB, toolbox, graph)
from theano.gof.opt import NavigatorOptimizer
from theano.ifelse import IfElse
from theano.misc.ordered_set import OrderedSet
......@@ -262,7 +262,7 @@ gpu_seqopt.register('InputToGpuArrayOptimizer', InputToGpuOptimizer(),
0, 'fast_run', 'fast_compile', 'merge')
class GraphToGPU(NavigatorOptimizer):
class GraphToGPU(Optimizer):
"""
Transfer the graph as a whole to GPU instead of transferring node by node.
......@@ -485,6 +485,16 @@ class GraphToGPU(NavigatorOptimizer):
node_created,
process_count)
def print_summary(self, stream=sys.stdout, level=0, depth=-1):
print("%s%s (%i)" % (
(' ' * level), self.__class__.__name__, id(self)), file=stream)
if depth != 0:
map_values = []
for opts in self.local_optimizers_map.values():
map_values += opts
for opt in self.local_optimizers_all + map_values:
opt.print_summary(stream, level=(level + 2), depth=(depth - 1))
@local_optimizer([GpuFromHost, GpuToGpu, HostFromGpu])
def local_cut_gpu_transfers(node):
......
......@@ -700,6 +700,8 @@ def local_gpu_solve(node):
CpuSolve(host_from_gpu) -> host_from_gpu(GpuSolve)
"""
if node.outputs[0].dtype != 'float32':
return
if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0]
if (host_input.owner and
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论