Merge pull request #2533 from nouiz/cleanup

Code simplification/small speed up for downsample

Merge pull request #2533 from nouiz/cleanup
6be35ca3 · Pascal Lamblin · 6f346499 · 6b4c592f · 6be35ca3 · 6be35ca3
--- a/theano/gof/opt.py
+++ b/theano/gof/opt.py
@@ -1676,8 +1676,10 @@ class OpKeyOptimizer(NavigatorOptimizer):
 class ChangeTracker:
    def __init__(self):
        self.changed = False
+        self.nb_imported = 0

    def on_import(self, fgraph, node, reason):
+        self.nb_imported += 1
        self.changed = True

    def on_change_input(self, fgraph, node, i, r, new_r, reason):
@@ -1742,13 +1744,14 @@ class EquilibriumOptimizer(NavigatorOptimizer):

    def add_requirements(self, fgraph):
        super(EquilibriumOptimizer, self).add_requirements(fgraph)
-        fgraph.attach_feature(ChangeTracker())
        for opt in self.get_local_optimizers():
            opt.add_requirements(fgraph)
        for opt in self.global_optimizers:
            opt.add_requirements(fgraph)

    def apply(self, fgraph, start_from=None):
+        change_tracker = ChangeTracker()
+        fgraph.attach_feature(change_tracker)
        if start_from is None:
            start_from = fgraph.outputs
        else:
@@ -1769,9 +1772,11 @@ class EquilibriumOptimizer(NavigatorOptimizer):
        time_opts = {}
        io_toposort_timing = []
        nb_nodes = []
+        node_created = {}
        for opt in self.global_optimizers + list(self.get_local_optimizers()):
            global_process_count.setdefault(opt, 0)
            time_opts.setdefault(opt, 0)
+            node_created.setdefault(opt, 0)

        while changed and not max_use_abort:
            process_count = {}
@@ -1780,15 +1785,17 @@ class EquilibriumOptimizer(NavigatorOptimizer):

            #apply global optimizers
            for gopt in self.global_optimizers:
-                fgraph.change_tracker.reset()
+                change_tracker.reset()
+                nb = change_tracker.nb_imported
                t_opt = time.time()
                gopt.apply(fgraph)
                time_opts[gopt] += time.time() - t_opt
-                if fgraph.change_tracker.changed:
+                if change_tracker.changed:
                    process_count.setdefault(gopt, 0)
                    process_count[gopt] += 1
                    global_process_count[gopt] += 1
                    changed = True
+                    node_created[gopt] += change_tracker.nb_imported - nb
                    if global_process_count[gopt] > max_use:
                        max_use_abort = True
                        opt_name = (getattr(gopt, "name", None)
@@ -1825,6 +1832,7 @@ class EquilibriumOptimizer(NavigatorOptimizer):
                    for lopt in (self.local_optimizers_all +
                                 self.local_optimizers_map.get(type(node.op), []) +
                                 self.local_optimizers_map.get(node.op, [])):
+                        nb = change_tracker.nb_imported
                        t_opt = time.time()
                        lopt_change = self.process_node(fgraph, node, lopt)
                        time_opts[lopt] += time.time() - t_opt
@@ -1833,6 +1841,7 @@ class EquilibriumOptimizer(NavigatorOptimizer):
                            process_count[lopt] += 1
                            global_process_count[lopt] += 1
                            changed = True
+                            node_created[lopt] += change_tracker.nb_imported - nb
                            if global_process_count[lopt] > max_use:
                                max_use_abort = True
                                opt_name = (getattr(lopt, "name", None)
@@ -1853,10 +1862,11 @@ class EquilibriumOptimizer(NavigatorOptimizer):
                          + ". You can safely raise the current threshold of "
                          + "%f with the theano flag 'optdb.max_use_ratio'." %
                          config.optdb.max_use_ratio)
-
+        fgraph.remove_feature(change_tracker)
        return (self, loop_timing, loop_process_count,
                (start_nb_nodes, end_nb_nodes, max_nb_nodes),
-                global_opt_timing, nb_nodes, time_opts, io_toposort_timing)
+                global_opt_timing, nb_nodes, time_opts, io_toposort_timing,
+                node_created)

    def print_summary(self, stream=sys.stdout, level=0, depth=-1):
        name = getattr(self, 'name', None)
@@ -1871,7 +1881,8 @@ class EquilibriumOptimizer(NavigatorOptimizer):
    def print_profile(stream, prof, level=0):
        (opt, loop_timing, loop_process_count,
         (start_nb_nodes, end_nb_nodes, max_nb_nodes),
-         global_opt_timing, nb_nodes, time_opts, io_toposort_timing) = prof
+         global_opt_timing, nb_nodes, time_opts, io_toposort_timing,
+         node_created) = prof

        blanc = ('    ' * level)
        print >> stream, blanc, "EquilibriumOptimizer",
@@ -1915,18 +1926,19 @@ class EquilibriumOptimizer(NavigatorOptimizer):
                process_count[o] += v
        for opt, count in process_count.iteritems():
            if count > 0:
-                count_opt.append((time_opts[opt], count, opt))
+                count_opt.append((time_opts[opt], count,
+                                  node_created[opt], opt))
            else:
                not_used.append((time_opts[opt], opt))
                not_used_time += time_opts[opt]

        if count_opt:
            print >> stream, blanc, \
-                    '  times - times applied - name:'
+                    '  times - times applied - nb node created - name:'
            count_opt.sort()
-            for (t, count, opt) in count_opt[::-1]:
-                print >> stream, blanc, '  %.3fs - %d - %s' % (
-                    t, count, opt)
+            for (t, count, n_created, opt) in count_opt[::-1]:
+                print >> stream, blanc, '  %.3fs - %d - %d - %s' % (
+                    t, count, n_created, opt)
            print >> stream, blanc, '  %.3fs - in %d optimization that where not used (display only those with a runtime > 0)' % (
                not_used_time, len(not_used))
            not_used.sort()

--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -304,7 +304,11 @@ def local_gpu_elemwise_1(node):
 def local_gpu_split(node):
    if isinstance(node.op, tensor.Split):
        input = node.inputs[0]
-        if input.owner and isinstance(input.owner.op, HostFromGpu):
+        outs_clients = reduce(list.__add__,
+                              [out.clients for out in node.outputs])
+        if (input.owner and isinstance(input.owner.op, HostFromGpu) or
+            any([c != 'output' and isinstance(c.op, GpuFromHost) for c, idx
+                 in outs_clients])):
            new_op = GpuSplit(node.op.len_splits)
            split_res = new_op(gpu_from_host(input), *node.inputs[1:])
            return [host_from_gpu(o) for o in split_res]

--- a/theano/sandbox/cuda/tests/test_opt.py
+++ b/theano/sandbox/cuda/tests/test_opt.py
@@ -289,7 +289,7 @@ def test_local_gpu_subtensor():
    assert any([isinstance(node.op, cuda.GpuElemwise) for node in topo])


-def test_local_split():
+def test_local_gpu_split():
    """ Test that the GpuSplit op is being applied and works """
    # Construct symbolic split
    x = tensor.fvector()
@@ -310,6 +310,17 @@ def test_local_split():
    # Check equality
    assert all([(cpu == gpu).all() for cpu, gpu in zip(cpu_res, gpu_res)])

+    # Test the other path of the optimizer, when it is the output that
+    # is moved to the GPU.
+    ra = cuda.gpu_from_host(ra)
+    f = theano.function([x, splits], [ra, rb, rc],
+                        mode=mode_with_gpu.excluding("InputToGpuOptimizer"))
+    gpu_res = f([0, 1, 2, 3, 4, 5], [3, 2, 1])
+    l = f.maker.fgraph.toposort()
+    assert any([isinstance(o.op, theano.sandbox.cuda.GpuSplit) for o in l])
+    # Check equality
+    assert all([(cpu == gpu).all() for cpu, gpu in zip(cpu_res, gpu_res)])
+

 def test_print_op():
    """ Test that print ops don't block gpu optimization"""

--- a/theano/tensor/signal/downsample.py
+++ b/theano/tensor/signal/downsample.py
@@ -197,13 +197,11 @@ class DownsampleFactorMax(Op):
                'DownsampleFactorMax requires 4D input for now')
        z_shape = self.out_shape(x.shape, self.ds, self.ignore_border, self.st)
        if (z[0] is None) or (z[0].shape != z_shape):
-            z[0] = numpy.zeros(self.out_shape(x.shape, self.ds,
-                                              self.ignore_border, self.st))
-            z[0] = theano._asarray(z[0], dtype=x.dtype)
+            z[0] = numpy.empty(self.out_shape(x.shape, self.ds,
+                                              self.ignore_border, self.st),
+                               dtype=x.dtype)
        zz = z[0]

-        ## zz needs to be initialized with -inf for the following to work
-        zz -= numpy.inf
        #number of pooling output rows
        pr = zz.shape[-2]
        #number of pooling output cols
@@ -221,11 +219,8 @@ class DownsampleFactorMax(Op):
                    for c in xrange(pc):
                        col_st = c * st1
                        col_end = __builtin__.min(col_st + ds1, img_cols)
-                        for row_ind in xrange(row_st, row_end):
-                            for col_ind in xrange(col_st, col_end):
-                                zz[n, k, r, c] = \
-                                    __builtin__.max(zz[n, k, r, c],
-                                                    x[n, k, row_ind, col_ind])
+                        zz[n, k, r, c] = x[
+                            n, k, row_st:row_end, col_st:col_end].max()

    def infer_shape(self, node, in_shapes):
        shp = self.out_shape(in_shapes[0], self.ds,
@@ -594,8 +589,8 @@ class DownsampleFactorMaxGradGrad(Op):
        z_shape = self.out_shape(x.shape, self.ds, self.ignore_border, self.st)
        if (z[0] is None) or (z[0].shape != z_shape):
            z[0] = numpy.zeros(self.out_shape(x.shape, self.ds,
-                                              self.ignore_border, self.st))
-            z[0] = theano._asarray(z[0], dtype=x.dtype)
+                                              self.ignore_border, self.st),
+                               dtype=x.dtype)
        ggz = z[0]

        #number of pooling output rows