Merge pull request #2473 from nouiz/profile_import

Profile import and crash fix

Merge pull request #2473 from nouiz/profile_import
cd3979e5 · abergeron · 973573df · e77cc747 · cd3979e5 · cd3979e5
--- a/theano/compile/function_module.py
+++ b/theano/compile/function_module.py
@@ -1410,6 +1410,7 @@ class FunctionMaker(object):
        # Get a function instance
        start_linker = time.time()
+        start_import_time = theano.gof.cmodule.import_time
        add_stack_trace_on_call_orig = gof.Op.add_stack_trace_on_call
        limit_orig = theano.config.traceback.limit
        try:
@@ -1428,6 +1429,8 @@ class FunctionMaker(object):
        if self.profile:
            self.profile.linker_time += linker_time
            _fn.time_thunks = self.profile.flag_time_thunks
+            import_time = theano.gof.cmodule.import_time - start_import_time
+            self.profile.import_time += import_time
        fn = self.function_builder(_fn, _i, _o, self.indices, self.outputs,
                defaults, self.unpack_single, self.return_none, self)

--- a/theano/compile/profiling.py
+++ b/theano/compile/profiling.py
@@ -104,7 +104,7 @@ def _atexit_print_fn():
        for ps in to_sum[1:]:
            for attr in ["compile_time", "fct_call_time", "fct_callcount",
                         "vm_call_time", "optimizer_time", "linker_time",
-                         "validate_time"]:
+                         "validate_time", "import_time"]:
                setattr(cum, attr, getattr(cum, attr) + getattr(ps, attr))
            # merge dictonary
@@ -194,6 +194,9 @@ class ProfileStats(object):
    linker_time = 0.0
    # time spent linking graph (FunctionMaker.create)
+    import_time = 0.0
+    # time spent importing compiled Python modules.
    line_width = config.profiling.output_line_width
    optimizer_profile = None
@@ -640,6 +643,7 @@ class ProfileStats(object):
        print >> file, ('    Theano Linker time (includes C,'
                        ' CUDA code generation/compiling): %es' %
                        self.linker_time)
+        print >> file, '       Import time %es' % self.import_time
        print >> file, ''
        # The validation time is a subset of optimizer_time

--- a/theano/gof/cmodule.py
+++ b/theano/gof/cmodule.py
@@ -71,6 +71,8 @@ _logger = logging.getLogger("theano.gof.cmodule")
 METH_VARARGS = "METH_VARARGS"
 METH_NOARGS = "METH_NOARGS"
+# Global variable that represents the total time spent importing modules.
+import_time = 0
 class MissingGXX(Exception):
@@ -282,11 +284,15 @@ def dlimport(fullpath, suffix=None):
    _logger.debug("module_name %s", module_name)
    sys.path[0:0] = [workdir]  # insert workdir at beginning (temporarily)
+    global import_time
    try:
        if importlib is not None:
            if hasattr(importlib, "invalidate_caches"):
                importlib.invalidate_caches()
+        t0 = time.time()
        rval = __import__(module_name, {}, {}, [module_name])
+        t1 = time.time()
+        import_time += t1 - t0
        if not rval:
            raise Exception('__import__ failed', fullpath)
    finally:

--- a/theano/gof/link.py
+++ b/theano/gof/link.py
@@ -318,7 +318,8 @@ class Container(object):
        else:
            self.type = r.type
        if name is None:
-            self.name = r.name
+            # Some Types do not have a name field.
+            self.name = getattr(r, 'name', None)
        else:
            self.name = name
@@ -730,9 +731,9 @@ class WrapLinker(Linker):
            wrapper=self.wrapper)
        return other
-    def clone(allow_gc=undef):
+    def clone(self, allow_gc=undef):
        return self.__class__(
-            linkers=[l.clone(allow_gc=allow_gc)],
+            linkers=[l.clone(allow_gc=allow_gc) for l in self.linkers],
            wrapper=self.wrapper)
    def accept(self, fgraph, no_recycling=None):

--- a/theano/gof/utils.py
+++ b/theano/gof/utils.py
@@ -43,7 +43,7 @@ def simple_extract_stack(f=None, limit=None):
    list.reverse()
    return list
-if sys.version_info[:2] <= (3, 2):
+if sys.version_info[:2] > (3, 4):
    # I enable my implementation only for some python version just to
    # be sure the Python internal do not change. If this work with
    # other python version, you can enable it.

--- a/theano/misc/check_blas.py
+++ b/theano/misc/check_blas.py
@@ -205,7 +205,7 @@ if __name__ == "__main__":
        gpu
        K6000/NOECC       0.06s         0.06s
        K40                             0.07s
-        K20m/ECC                0.08s          0.07s
+        K20m/ECC          0.08s 0.08s          0.07s
        K20/NOECC                              0.07s
        M2090                           0.19s
        C2075                                         0.25s
@@ -233,6 +233,7 @@ if __name__ == "__main__":
        GTX 460                                0.37s                0.45s
        GTX 285                         0.42s         0.452s        0.452s        0.40s # cuda 3.0 seems faster? driver version?
        750M                                   0.49s
+        GT 610            2.38s
        GTX 550 Ti                                                  0.57s
        GT 520                                        2.68s                3.06s
        520M                                   2.44s                       3.19s        # with bumblebee on Ubuntu 12.04

--- a/theano/sandbox/cuda/dnn.py
+++ b/theano/sandbox/cuda/dnn.py
@@ -417,8 +417,8 @@ class GpuDnnConv(DnnBase, COp):
        return [(
            b, nb,
-            (h + 2*padh - kh)/sh + 1,
+            (h + 2*padh - kh)//sh + 1,
-            (w + 2*padw - kw)/sw + 1
+            (w + 2*padw - kw)//sw + 1
        )]
@@ -731,8 +731,8 @@ class GpuDnnPool(DnnBase):
        return [(
            shape[0][0],
            shape[0][1],
-            (shape[0][2] - kh)/sh + 1,
+            (shape[0][2] - kh)//sh + 1,
-            (shape[0][3] - kw)/sw + 1
+            (shape[0][3] - kw)//sw + 1
        )]
    def c_support_code_struct(self, node, name):