Merge pull request #1756 from nouiz/mixed

Fix gpu crash and faster optimization

Merge pull request #1756 from nouiz/mixed
70e25931 · abergeron · 85209fbb · 745b5559 · 70e25931 · 70e25931
--- a/doc/tutorial/multi_cores.txt
+++ b/doc/tutorial/multi_cores.txt
@@ -18,7 +18,7 @@ those operations will run in parallel in Theano.
 The most frequent way to control the number of threads used is via the
 ``OMP_NUM_THREADS`` environment variable. Set it to the number of
 threads you want to use before starting the python process. Some BLAS
-implementation support other enviroment variable.
+implementations support other environment variables.
 Parallel element wise ops with OpenMP
@@ -27,8 +27,8 @@ Parallel element wise ops with OpenMP
 Because element wise ops work on every tensor entry independently they
 can be easily parallelized using OpenMP.
-To use OpenMP you must set the ``openmp`` flag to ``True`` in Theano
+To use OpenMP you must set the ``openmp`` :ref:`flag <libdoc_config>`
-configuration.
+to ``True``.
 You can use the flag ``openmp_elemwise_minsize`` to set the minimum
 tensor size for which the operation is parallelized because for short

--- a/theano/gof/__init__.py
+++ b/theano/gof/__init__.py
@@ -62,7 +62,6 @@ from theano.gof.opt import (Optimizer, optimizer, SeqOptimizer,
    LocalOptimizer, local_optimizer, LocalOptGroup,
    OpSub, OpRemove, PatternSub,
    NavigatorOptimizer, TopoOptimizer, EquilibriumOptimizer,
-    InplaceOptimizer, PureThenInplaceOptimizer,
    OpKeyOptimizer)
 from theano.gof.optdb import \

--- a/theano/gof/opt.py
+++ b/theano/gof/opt.py
@@ -131,6 +131,9 @@ class FromFunctionOptimizer(Optimizer):
    def __call__(self, *args, **kwargs):
        return self.fn(*args, **kwargs)
+    def __str__(self):
+        return self.__name__
 def optimizer(f):
    """decorator for FromFunctionOptimizer"""
@@ -626,7 +629,10 @@ class MergeOptimizer(Optimizer):
        print >> stream, blanc, "  replace_time", replace_time
        print >> stream, blanc, "  validate_time", validate_time
        print >> stream, blanc, "  callback_time", callback_time
-        print >> stream, blanc, "  callback_times", callbacks_time
+        print >> stream, blanc, "  callbacks_time"
+        for i in sorted(callbacks_time.iteritems(), key=lambda a: a[1]):
+            if i[1] > 0:
+                print i
        print >> stream, blanc, "  nb_merged", nb_merged
        print >> stream, blanc, "  nb_constant", nb_constant
@@ -1490,7 +1496,6 @@ class EquilibriumOptimizer(NavigatorOptimizer):
    def __init__(self,
                 optimizers,
                 failure_callback=None,
-                 max_depth=None,
                 max_use_ratio=None):
        """
        :param optimizers:  list or set of local or global optimizations to
@@ -1499,8 +1504,6 @@ class EquilibriumOptimizer(NavigatorOptimizer):
        :param max_use_ratio: each optimizer can be applied at most
            (size of graph * this number) times
-        :param max_depth: TODO what does this do? (EquilibriumDB sets it to 5)
        """
        super(EquilibriumOptimizer, self).__init__(
@@ -1520,7 +1523,6 @@ class EquilibriumOptimizer(NavigatorOptimizer):
                        self.local_optimizers_map.setdefault(c, []).append(opt)
            else:
                self.global_optimizers.append(opt)
-        self.max_depth = max_depth
        self.max_use_ratio = max_use_ratio
        assert self.max_use_ratio is not None, (
                'max_use_ratio has to be a number')
@@ -1723,10 +1725,12 @@ class EquilibriumOptimizer(NavigatorOptimizer):
            for (t, count, opt) in count_opt[::-1]:
                print >> stream, blanc, '  %.3fs - %d - %s' % (
                    t, count, opt)
-            print >> stream, blanc, '  %.3fs - in %d optimization that where not used' % (
+            print >> stream, blanc, '  %.3fs - in %d optimization that where not used (display only those with a runtime > 0)' % (
                not_used_time, len(not_used))
            not_used.sort()
            for (t, opt) in not_used[::-1]:
+                if t > 0:
+                    # Skip opts with a time of 0; they probably weren't even tried.
                    print >> stream, blanc + "  ", '  %.3fs - %s' % (t, opt)
            print >> stream
@@ -1899,31 +1903,3 @@ def pre_greedy_local_optimizer(list_optimizations, out):
    final_outs, optimized_nodes = local_recursive_function(
        list_optimizations, out, {}, 0)
    return final_outs[out_index]
-############
-### Misc ###
-############
-class InplaceOptimizer(Optimizer):
-    def __init__(self, inplace):
-        self.inplace = inplace
-    def apply(self, fgraph):
-        self.inplace(fgraph)
-    def add_requirements(self, fgraph):
-        fgraph.attach_feature(dh.DestroyHandler())
-class PureThenInplaceOptimizer(Optimizer):
-    def __init__(self, pure, inplace):
-        self.pure = pure
-        self.inplace = inplace
-    def apply(self, fgraph):
-        self.pure(fgraph)
-        fgraph.attach_feature(dh.DestroyHandler())
-        self.inplace(fgraph)
--- a/theano/gof/optdb.py
+++ b/theano/gof/optdb.py
@@ -194,7 +194,6 @@ class EquilibriumDB(DB):
    def query(self, *tags, **kwtags):
        opts = super(EquilibriumDB, self).query(*tags, **kwtags)
        return opt.EquilibriumOptimizer(opts,
-                max_depth=5,
                max_use_ratio=config.optdb.max_use_ratio,
                failure_callback=opt.NavigatorOptimizer.warn_inplace)

--- a/theano/sandbox/cuda/blas.py
+++ b/theano/sandbox/cuda/blas.py
@@ -671,7 +671,7 @@ class GpuConv(GpuOp):
    def c_code_cache_version(self):
        # raise this whenever modifying any of the support_code_files
-        return (0, 20)
+        return (0, 21)
    def c_support_code_apply(self, node, nodename):
        # REMEMBER TO RAISE c_code_cache_version when changing any of

--- a/theano/sandbox/cuda/conv.cu
+++ b/theano/sandbox/cuda/conv.cu
@@ -1018,6 +1018,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
        (version==3||version==4||version==5||version==-1) &&
        out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
        (kern_len+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte<shared_avail && //their is only 16k of shared memory
+        (kern_len > 1 || (img_size_padded_byte+kern_size_byte)<=shared_avail) &&
        !work_complete) //conv_full_patch_stack_padded
    {
      //version 3 without split

--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -14,7 +14,7 @@ import theano.ifelse
 from theano.compile import optdb
 from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB,
-                        Optimizer, toolbox, DestroyHandler)
+                        Optimizer, toolbox)
 from theano.gof.python25 import all, any
 from theano.sandbox.cuda.basic_ops import (
    device_properties, gpu_eye,
@@ -62,7 +62,7 @@ optdb.register('gpu_opt',
 # inside the elemwise. When there is no float64 op, this is working.
 optdb.register('gpu_after_fusion',
               ProxyDB(gpu_seqopt),
-               optdb.__position__.get('elemwise_fusion', 71) + .1,
+               optdb.__position__.get('elemwise_fusion', 49) + .1,
               'gpu')
@@ -88,7 +88,6 @@ class InputToGpuOptimizer(Optimizer):
    def add_requirements(self, fgraph):
        fgraph.attach_feature(toolbox.ReplaceValidate())
-        fgraph.attach_feature(DestroyHandler())
    def apply(self, fgraph):
        for input in fgraph.inputs:
@@ -1339,9 +1338,10 @@ gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op(
        max_inputs_to_GpuElemwise)
 if config.gpu.local_elemwise_fusion:
    _logger.debug("enabling optimization fusion of gpu elemwise in fast_run")
+    # Must be after cpu fusion at 40 and gpu at 48.5, and before AddDestroyHandler at 49.5.
    optdb.register('gpu_elemwise_fusion',
                   tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion),
-                   71.00, 'fast_run', 'fusion',
+                   49, 'fast_run', 'fusion',
                   'local_elemwise_fusion', 'gpu')
 else:
    _logger.debug(("not enabling optimization fusion of gpu elemwise in "

--- a/theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
+++ b/theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
@@ -679,6 +679,7 @@ def test_full():
            #Test more than maxThreadsDim0
            , ((2,4,13,1050), (3,4,10, 11), (1, 1), (1, 1), (1, 1))
            , ((2,4,1050,13), (3,4,10, 11), (1, 1), (1, 1), (1, 1))
+            , ((1,1,44800,1), (6,1,1,1), (1, 1), (1, 1), (1, 1))#This caused crash
            ]
 #    shapes=shapes[:277]

--- a/theano/sandbox/gpuarray/opt.py
+++ b/theano/sandbox/gpuarray/opt.py
@@ -5,7 +5,7 @@ from theano import tensor, scalar
 from theano.compile import optdb
 from theano.gof import (local_optimizer, EquilibriumDB,
                        SequenceDB, ProxyDB,
-                        Optimizer, toolbox, DestroyHandler,
+                        Optimizer, toolbox,
                        InconsistencyError, EquilibriumOptimizer)
 from theano.gof.python25 import all, any
@@ -90,7 +90,6 @@ class InputToGpuOptimizer(Optimizer):
    def add_requirements(self, fgraph):
        fgraph.attach_feature(toolbox.ReplaceValidate())
-        fgraph.attach_feature(DestroyHandler())
    def apply(self, fgraph):
        for input in fgraph.inputs:

--- a/theano/scan_module/scan_opt.py
+++ b/theano/scan_module/scan_opt.py
@@ -1509,7 +1509,6 @@ class PushOutDot1(gof.Optimizer):
    def add_requirements(self, fgraph):
        fgraph.attach_feature(toolbox.ReplaceValidate())
-        fgraph.attach_feature(DestroyHandler())
    def apply(self, fgraph):

--- a/theano/tensor/blas.py
+++ b/theano/tensor/blas.py
@@ -139,7 +139,7 @@ except ImportError:
    pass
 from theano.configparser import config, AddConfigVar, StrParam
-from theano.gof import (utils, Op, view_roots, DestroyHandler,
+from theano.gof import (utils, Op, view_roots,
                        local_optimizer, Optimizer,
                        InconsistencyError, toolbox, SequenceDB,
                        EquilibriumOptimizer, Apply,
@@ -1488,7 +1488,6 @@ class GemmOptimizer(Optimizer):
    def add_requirements(self, fgraph):
        fgraph.attach_feature(toolbox.ReplaceValidate())
-        fgraph.attach_feature(DestroyHandler())
    def apply(self, fgraph):
        did_something = True
@@ -1501,9 +1500,21 @@ class GemmOptimizer(Optimizer):
        time_factor_can = 0
        time_factor_list = 0
        time_toposort = 0
+        if fgraph.profile:
+            validate_before = fgraph.profile.validate_time
+            callbacks_before = fgraph.execute_callbacks_times.copy()
+            callback_before = fgraph.execute_callbacks_time
+        class Updater:
+            def on_import(self, fgraph, new_node, reason):
+                if new_node is not node:
+                    nodelist.append(new_node)
+        u = Updater()
+        fgraph.attach_feature(u)
        while did_something:
+            nb_iter += 1
            t0 = time.time()
-            nodelist = list(fgraph.toposort())
+            nodelist = theano.gof.graph.io_toposort(fgraph.inputs, fgraph.outputs)
            time_toposort += time.time() - t0
            did_something = False
            nodelist.reverse()
@@ -1546,16 +1557,30 @@ class GemmOptimizer(Optimizer):
                    except ReplacementDidntRemovedError, e:
                        nb_replacement_didn_t_remove += 1
                        self.warned = True
-            nb_iter += 1
+        fgraph.remove_feature(u)
+        if fgraph.profile:
+            validate_time = fgraph.profile.validate_time - validate_before
+            callback_time = fgraph.execute_callbacks_time - callback_before
+            callbacks_time = {}
+            for k, v in fgraph.execute_callbacks_times.iteritems():
+                if k in callbacks_before:
+                    callbacks_time[k] = v - callbacks_before[k]
+                else:
+                    callbacks_time[k] = v
+        else:
+            validate_time = None
+            callback_time = None
+            callbacks_time = {}
        return (self, nb_iter, nb_replacement, nb_replacement_didn_t_remove,
                nb_inconsistency_make, nb_inconsistency_replace,
                time_canonicalize, time_factor_can,
-                time_factor_list, time_toposort)
+                time_factor_list, time_toposort,
+                validate_time, callback_time, callbacks_time,)
    @staticmethod
    def print_profile(stream, prof, level=0):
        blanc = ('    ' * level)
-        #1946.912556s - ('gemm_optimizer', 'GemmOptimizer', 1)
        print >> stream, blanc, "GemmOptimizer"
        print >> stream, blanc, " nb_iter", prof[1]
        print >> stream, blanc, " nb_replacement", prof[2]
@@ -1566,6 +1591,12 @@ class GemmOptimizer(Optimizer):
        print >> stream, blanc, " time_factor_can", prof[7]
        print >> stream, blanc, " time_factor_list", prof[8]
        print >> stream, blanc, " time_toposort", prof[9]
+        print >> stream, blanc, " validate_time", prof[10]
+        print >> stream, blanc, " callback_time", prof[11]
+        print >> stream, blanc, " callbacks_time"
+        for i in sorted(prof[12].iteritems(), key=lambda a: a[1]):
+            if i[1] > 0:
+                print i
 class Dot22(GemmRelated):
@@ -1816,17 +1847,15 @@ blas_optdb.register('local_gemm_to_gemv',
        15, 'fast_run')
-# After destroyhandler is in but before we try to make elemwise things inplace
+# After destroyhandler(49.5) but before we try to make elemwise things
-# Try to make gemm inplace
+# inplace (75)
-# Also, need to make the gemm optimisation(step 70) happen before the
-# fusion of elemwise(step 71)
 blas_opt_inplace = in2out(local_inplace_gemm,
                          local_inplace_gemv,
                          local_inplace_ger,
                          name="blas_opt_inplace")
 optdb.register('InplaceBlasOpt',
               blas_opt_inplace,
-        70.0, 'fast_run', 'inplace')
+               70.0, 'fast_run', 'inplace', 'blas_opt_inplace')
 class Dot22Scalar(GemmRelated):

--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py