testgroup / pytensor · Commits

Commit d0dfb0be
Authored June 13, 2016 by sentient07

Cleaned up and fixed pep8

Parent: c3e8f153

Showing 5 changed files with 55 additions and 97 deletions (+55 -97)
[Hunks rendered below with no +/- lines contain only whitespace changes, which this capture does not preserve.]
theano/gpuarray/dnn.py        +8   -8
theano/gpuarray/extra_ops.py  +3   -2
theano/gpuarray/nerv.py       +1   -1
theano/gpuarray/opt.py        +42  -85
theano/gpuarray/opt_util.py   +1   -1
theano/gpuarray/dnn.py

@@ -25,7 +25,7 @@ from theano.tensor.signal.pool import (
 from . import pygpu
 from .type import get_context, gpu_context_type, list_contexts, GpuArrayType
 from .basic_ops import (as_gpuarray_variable, infer_context_name,
-                        gpu_contiguous, GpuAllocEmpty, gpu_alloc_empty,
+                        gpu_contiguous, gpu_alloc_empty,
                         empty_like)
 from .elemwise import GpuElemwise

@@ -942,8 +942,8 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
     shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1
     shape3 = shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1
     out = gpu_alloc_empty(img.dtype, ctx_name)(shape_i(img, 0, fgraph),
                                                shape_i(kerns, 1, fgraph),
                                                shape2, shape3)
     desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
                           conv_mode=conv_mode, precision=precision)(kerns.shape)
     return gpu_dnn_conv_gradI()(kerns, img, out, desc)

@@ -1412,11 +1412,11 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
 @local_optimizer([AbstractConv2d, AbstractConv2d_gradWeights,
                   AbstractConv2d_gradInputs])
 @register_opt2([AbstractConv2d, AbstractConv2d_gradWeights,
                 AbstractConv2d_gradInputs], 'fast_compile')
 def local_abstractconv_cudnn_graph(op, context_name, inputs):
     if (not isinstance(op, (AbstractConv2d,
                             AbstractConv2d_gradWeights,
                             AbstractConv2d_gradInputs))):
         return None
     inp1 = inputs[0]

@@ -1462,8 +1462,8 @@ def local_abstractconv_cudnn_graph(op, context_name, inputs):
 @local_optimizer([AbstractConv2d, AbstractConv2d_gradWeights,
                   AbstractConv2d_gradInputs])
 def local_abstractconv_cudnn(node):
-    ctx = infer_context(*node.inputs)
-    return local_abstractconv_dnn_graph(node.op, ctx, node.inputs)
+    ctx = infer_context_name(*node.inputs)
+    return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs)
 conv_groupopt.register('local_abstractconv_cudnn_graph',
                        local_abstractconv_cudnn_graph, 20,
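The last hunk above is the substantive fix in this file: local_abstractconv_cudnn referred to two names that do not exist, infer_context and local_abstractconv_dnn_graph. Because Python resolves global names at call time, such a function imports cleanly and only raises NameError the first time the optimizer fires. A minimal, self-contained sketch of the fixed call chain (stand-in functions, not Theano's real ones):

    def infer_context_name(*variables):
        # stand-in for theano.gpuarray.basic_ops.infer_context_name
        return "dev0"

    def local_abstractconv_cudnn_graph(op, ctx, inputs):
        # stand-in for the optimizer defined earlier in this diff
        return [op, ctx, inputs]

    def local_abstractconv_cudnn(node_op, node_inputs):
        # the fixed version resolves to names that actually exist
        ctx = infer_context_name(*node_inputs)
        return local_abstractconv_cudnn_graph(node_op, ctx, node_inputs)

    print(local_abstractconv_cudnn("conv_op", ["img", "kerns"]))
    # -> ['conv_op', 'dev0', ['img', 'kerns']]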
theano/gpuarray/extra_ops.py

@@ -9,7 +9,7 @@ except ImportError:
     pass
 from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel,
-                        infer_context_name, GpuFromHost)
+                        infer_context_name)
 from .opt import register_opt, op_lifter, register_opt2

@@ -450,10 +450,11 @@ class GpuCumsum(GpuKernelBase, Op):
         """ % locals()
         return super(GpuCumsum, self).c_support_code_struct(node, nodename) + code

 @register_opt('fast_compile')
 @op_lifter([CumsumOp])
 @register_opt2([CumsumOp], 'fast_compile')
-def use_gpu_cumsumop(op, ctx_name, inputs,):
+def use_gpu_cumsumop(op, ctx_name, inputs):
     if inputs[0].dtype == 'float32':
         axis = op.axis
         x = inputs[0]
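Both changes here are straight pep8/pyflakes cleanups: GpuFromHost was imported but never used, and the def carried a stray trailing comma in its parameter list. For readers unfamiliar with the decorator stack on use_gpu_cumsumop, here is a simplified, hypothetical re-implementation of the pattern (the real op_lifter and register_opt2 live in .opt and do more): op_lifter adapts a function written against (op, context_name, inputs) so a graph walker can invoke it with a whole node.

    def op_lifter(op_classes):
        def decorator(maker):
            def local_opt(node):
                # only fire on the op types the maker was registered for
                if not isinstance(node.op, tuple(op_classes)):
                    return False
                return maker(node.op, "dev0", node.inputs)
            local_opt.__name__ = maker.__name__
            return local_opt
        return decorator

    class CumsumOp(object):          # stand-in op
        axis = 0

    class Node(object):              # stand-in graph node
        def __init__(self, op, inputs):
            self.op = op
            self.inputs = inputs

    @op_lifter([CumsumOp])
    def use_gpu_cumsumop(op, ctx_name, inputs):
        return "GpuCumsum(axis=%s) of %s on %s" % (op.axis, inputs[0], ctx_name)

    print(use_gpu_cumsumop(Node(CumsumOp(), ["x"])))
    # -> GpuCumsum(axis=0) of x on dev0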
theano/gpuarray/nerv.py

@@ -10,7 +10,7 @@ from theano.scalar import as_scalar, constant
 from . import opt
 from .basic_ops import (as_gpuarray_variable, GpuAllocEmpty,
-                        infer_context_name)
+                        infer_context_name, gpu_alloc_empty)
 from .type import gpu_context_type
 from .opt_util import alpha_merge, output_merge
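The added gpu_alloc_empty import matches the call-site convention visible in dnn.py above, where gpu_alloc_empty(dtype, ctx_name)(*shape) first builds a configured allocation op and then applies it to the shape arguments. A toy sketch of that factory pattern, with stand-in classes rather than the real ops:

    class GpuAllocEmpty(object):
        def __init__(self, dtype, context_name):
            self.dtype = dtype
            self.context_name = context_name

        def __call__(self, *shape):
            # the real op builds an uninitialized GPU array; we just describe it
            return "empty %s tensor %r on %s" % (self.dtype, shape,
                                                 self.context_name)

    def gpu_alloc_empty(dtype, context_name):
        # hypothetical stand-in for the helper imported from .basic_ops
        return GpuAllocEmpty(dtype, context_name)

    print(gpu_alloc_empty("float32", "dev0")(16, 3, 32, 32))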
theano/gpuarray/opt.py

@@ -8,15 +8,15 @@ from six import itervalues, iteritems
 from six.moves import xrange

 import theano
-from theano.compat import OrderedDict
 from theano import tensor, scalar, gof, config
 from theano.compile import optdb
 from theano.compile.ops import shape_i
 from theano.gof import (local_optimizer, EquilibriumDB, TopoOptimizer,
                         SequenceDB, Optimizer, DB, toolbox, graph)
-from theano.gof.opt import ChangeTracker, NavigatorOptimizer
+from theano.gof.opt import NavigatorOptimizer
 from theano.gof.optdb import LocalGroupDB
 from theano.ifelse import IfElse
+from theano.misc.ordered_set import OrderedSet
 from theano.scalar.basic import Scalar, Pow, Cast
 from theano.scan_module import scan_utils, scan_op, scan_opt

@@ -30,7 +30,7 @@ from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
 from theano.tests.breakpoint import PdbBreakpoint
 from .type import (GpuArrayType, GpuArrayConstant, get_context,
-                   ContextNotDefined, GpuArrayVariable, GpuArraySharedVariable)
+                   ContextNotDefined, GpuArraySharedVariable, GpuArrayVariable)
 from .basic_ops import (as_gpuarray_variable, infer_context_name,
                         host_from_gpu, GpuToGpu,
                         HostFromGpu, GpuFromHost,

@@ -55,8 +55,6 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor,
                         GpuAdvancedIncSubtensor1_dev20)
 from .opt_util import alpha_merge, output_merge

 _logger = logging.getLogger("theano.gpuarray.opt")

@@ -195,7 +193,7 @@ def op_lifter(OP, cuda_only=False):
             try:
                 new_op = maker(node.op, context_name, node.inputs)
             except TypeError:
                 # Pass the outputs so that the Local Optimizers don't need to
                 # build the nodes again.
                 new_op = maker(node.op, context_name, node.inputs,
                                node.outputs)
             # This is needed as sometimes new_op inherits from OP.

@@ -263,7 +261,6 @@ class GraphToGPU(NavigatorOptimizer):
         self.local_optimizers_all = local_optimizers_all
         self.local_optimizers_map = local_optimizers_map
         self.failure_callback = None
-        self.new_opts = []

     def add_requirements(self, fgraph):
         fgraph.attach_feature(toolbox.ReplaceValidate())

@@ -281,13 +278,10 @@ class GraphToGPU(NavigatorOptimizer):
     def apply(self, fgraph):
         mapping = {}
-        start_nb_nodes = len(fgraph.apply_nodes)
-        max_nb_nodes = len(fgraph.apply_nodes)
-        io_toposort_timing = []
-        nb_nodes = []
         time_opts = {}
         node_created = {}
         process_count = {}
+        io_toposort_timing = []

         # Building a new graph
         # Iterating through inputs of graph
         for i in fgraph.inputs:

@@ -299,7 +293,7 @@ class GraphToGPU(NavigatorOptimizer):
             if isinstance(i, theano.Constant):
                 mapping[i] = i
         for node in fgraph.toposort():
             for lopt in (self.local_optimizers_all +
                          self.local_optimizers_map.get(type(node.op), []) +
                          self.local_optimizers_map.get(node.op, [])):
                 process_count.setdefault(lopt, 0)

@@ -307,29 +301,25 @@ class GraphToGPU(NavigatorOptimizer):
                 node_created.setdefault(lopt, 0)

         t_topo = time.time()
-        topo = fgraph.toposort()
+        fgraph.toposort()
         time_topo = time.time() - t_topo
+        io_toposort_timing.append(time_topo - t_topo)

         for node in fgraph.toposort():
-            t0 = time.time()
             if isinstance(node.op, HostFromGpu):
                 mapping[node.outputs[0]] = node.inputs[0]
                 continue
             # Move only if any of the inputs are on the GPU.
-            move_to_GPU = True
-            '''
-            if any([isinstance(i, GpuArrayVariable) or
-                    isinstance(i, GpuArraySharedVariable)
-                    for i in [mapping[v] for v in node.inputs] +
-                    node.outputs]):
-                move_to_GPU = True
-            '''
-            out_clients = [o.clients for o in node.outputs]
+            move_to_GPU = False
+            if any([isinstance(i, GpuArrayVariable) or
+                    isinstance(i, GpuArraySharedVariable)
+                    for i in [mapping[v] for v in node.inputs] +
+                    node.outputs]):
+                move_to_GPU = True

             context_name = None
             for i in [mapping[i] for i in node.inputs]:

@@ -340,29 +330,28 @@ class GraphToGPU(NavigatorOptimizer):
             new_ops = None
             outputs = []
-            ex_opt_time = None
             # Apply the lifter
             for lopt in (self.local_optimizers_all +
                          self.local_optimizers_map.get(type(node.op), []) +
                          self.local_optimizers_map.get(node.op, [])):
-                process_count[lopt] += 1
                 if move_to_GPU:
                     t_opt = time.time()
                     try:
                         new_ops = lopt.transform(
                             node.op, context_name,
                             [mapping[i] for i in node.inputs])
                     except TypeError:
                         # Updating again because else we'd be counting
                         # time for two except clauses
                         t_opt = time.time()
                         new_ops = lopt.transform(node.op, context_name,
                                                  [mapping[i] for i in node.inputs],
                                                  node.outputs)
                     finally:
                         t_opt2 = time.time()
+                        time_opts[lopt] += t_opt2 - t_opt
                     if new_ops:
+                        process_count[lopt] += 1
                         break
             if not new_ops:
                 newnode = node.clone_with_new_inputs([mapping.get(i)

@@ -385,9 +374,7 @@ class GraphToGPU(NavigatorOptimizer):
                                                      return_list=True)
             if new_ops:
-                node_created[lopt] += len(theano.gof.graph.ops(
-                    [mapping[i] for i in node.inputs], outputs))
-                self.new_opts.append(lopt)
-                time_opts[lopt] = t_opt2 - t_opt
+                node_created[lopt] += len(graph.ops(
+                    [mapping[i] for i in node.inputs], outputs))
             for new_o, old_o in zip(outputs, node.outputs):
                 mapping[old_o] = new_o

@@ -402,47 +389,26 @@ class GraphToGPU(NavigatorOptimizer):
                 new_nodes.append(new_o)
         fgraph.replace_all_validate(zip(fgraph.outputs, new_nodes))
-        end_nb_nodes = len(fgraph.apply_nodes)
-        return (self, start_nb_nodes, end_nb_nodes, max_nb_nodes,
-                io_toposort_timing, nb_nodes, time_opts, node_created)
+        return (self, io_toposort_timing, time_opts, node_created,
+                process_count)

     @staticmethod
     def print_profile(stream, prof, level=0):
-        (opt, start_nb_nodes, end_nb_nodes, max_nb_nodes, io_toposort_timing,
-         nb_nodes, time_opts, node_created) = prof
+        (opt, io_toposort_timing,
+         time_opts, node_created, process_count) = prof
         blanc = ('    ' * level)
         print(blanc, "GraphToGPUOptimizer", end=' ', file=stream)
         print(blanc, getattr(opt, "name",
                              getattr(opt, "__name__", "")), file=stream)
-        print(blanc, "  nb nodes (start, end, max) %d %d %d" %
-              (start_nb_nodes, end_nb_nodes, max_nb_nodes), file=stream)
         print(blanc, "  time io_toposort %.3fs" % sum(
               io_toposort_timing), file=stream)
-        s = sum([time_opts[o] for o in opt.new_opts])
-        print(blanc, "Total time taken by local optimizers %.3fs " % s,
-              file=stream)
-        # Build a dictionary of opt and time taken
-        opt_time_dict = dict()
-        for o in opt.new_opts:
-            if o not in opt_time_dict:
-                opt_time_dict[o] = time_opts[o]
-            else:
-                opt_time_dict[o] += time_opts[o]
-        # print time per each optimizer
-        for k, v in opt_time_dict.iteritems():
-            print(blanc, "Local Optimizer :" + str(k) +
-                  " takes time : %.3f" % v, file=stream)
+        s = sum([v for k, v in time_opts.iteritems()])
+        print(blanc, "  time in local optimizers %.3fs" % s, file=stream)
         count_opt = []
         not_used = []
         not_used_time = 0
-        process_count = {}
-        for o in (opt.new_opts):
-            process_count.setdefault(o, 0)
-            process_count[o] + 1
         for o, count in iteritems(process_count):
             if count > 0:

@@ -454,13 +420,13 @@ class GraphToGPU(NavigatorOptimizer):
         if count_opt:
             print(blanc,
-                  '  times - times applied - nb node created - name:',
+                  '  times - times applied - Node created - name:',
                   file=stream)
             count_opt.sort()
             for (t, count, n_created, o) in count_opt[::-1]:
                 print(blanc, '  %.3fs - %d - %d - %s' % (
                       t, count, n_created, o), file=stream)
-        print(blanc, '  %.3fs - in %d optimization that where not used (display only those with a runtime > 0)' % (
+        print(blanc, '  %.3fs - in %d optimization that were not used (display only those with a runtime > 0)' % (
               not_used_time, len(not_used)), file=stream)
         not_used.sort(key=lambda nu: (nu[0], str(nu[1])))
         for (t, o) in not_used[::-1]:

@@ -469,7 +435,6 @@ class GraphToGPU(NavigatorOptimizer):
                 print(blanc + "  ", '%.3fs - %s' % (t, o), file=stream)
             print(file=stream)

     @staticmethod
     def merge_profile(prof1, prof2):
         # (opt, loop_timing, loop_process_count, max_nb_nodes,

@@ -491,8 +456,7 @@ class GraphToGPU(NavigatorOptimizer):
         local_optimizers_map = merge_dict(prof1[0].local_optimizers_map,
                                           prof2[0].local_optimizers_map)
         new_opt = GraphToGPU(local_optimizers, local_optimizers_map)

         def merge_list(l1, l2):
             l = copy.copy(l1)

@@ -501,23 +465,17 @@ class GraphToGPU(NavigatorOptimizer):
                     l[idx] += nb
                 else:
                     l.append(nb)
             return l

-        max_nb_nodes = max(prof1[3], prof2[3])
-        io_toposort_timing = merge_list(prof1[4], prof2[4])
-        nb_nodes = merge_list(prof1[5], prof2[5])
-        time_opts = merge_dict(prof1[6], prof2[6])
-        node_created = merge_dict(prof1[7], prof2[7])
+        io_toposort_timing = merge_list(prof1[1], prof2[1])
+        time_opts = merge_dict(prof1[2], prof2[2])
+        node_created = merge_dict(prof1[3], prof2[3])
+        process_count = merge_dict(prof1[4], prof2[4])
         return (new_opt,
-                max_nb_nodes,
                 io_toposort_timing,
-                nb_nodes,
                 time_opts,
-                node_created)
+                node_created,
+                process_count)

 @local_optimizer([GpuFromHost, GpuToGpu, HostFromGpu])

@@ -917,7 +875,7 @@ def local_gpuajoin_1(node):
 @op_lifter([tensor.Split])
 @register_opt2([tensor.Split], 'fast_compile')
 def local_gpua_split(op, context_name, inputs):
     # TODO use props
     return GpuSplit(op.len_splits)

@@ -1009,7 +967,7 @@ def local_advincsub1_gpua_inplace(node):
 @register_opt2([tensor.CAReduce, tensor.Sum, tensor.elemwise.Prod],
                'fast_compile')
 def local_gpua_careduce(op, context_name, inputs, outputs):
     if isinstance(op.scalar_op, (scalar.Add, scalar.Mul,
                                  scalar.Maximum, scalar.Minimum)):
         ctx = get_context(context_name)
         if ctx.kind == b'opencl':

@@ -1233,7 +1191,6 @@ def local_assert(op, context_name, inputs):
                              *inputs[1:]))]

 @register_opt('fast_compile')
 @op_lifter([ConvOp])
 def local_error_convop(op, context_name, inputs):
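Most of the churn in this file is the GraphToGPU profiling contract: apply() returns one flat tuple, and print_profile() and merge_profile() must unpack or index it with exactly the same positions, which is why dropping start_nb_nodes, max_nb_nodes and nb_nodes touches all three methods at once. A reduced, runnable sketch of that contract (hypothetical names, not the real optimizer):

    import time

    def apply_sketch(nodes):
        io_toposort_timing = []
        time_opts = {}
        node_created = {}
        process_count = {}
        t0 = time.time()
        sorted(nodes)                        # stands in for fgraph.toposort()
        io_toposort_timing.append(time.time() - t0)
        for _ in nodes:                      # stands in for the lifting loop
            process_count["lift"] = process_count.get("lift", 0) + 1
            node_created["lift"] = node_created.get("lift", 0) + 1
            time_opts.setdefault("lift", 0.0)
        # the tuple layout every consumer must agree on
        return ("opt", io_toposort_timing, time_opts, node_created,
                process_count)

    def print_profile_sketch(prof):
        # unpacks in exactly the order apply_sketch() returned
        (opt, io_toposort_timing, time_opts, node_created,
         process_count) = prof
        print("  time io_toposort %.3fs" % sum(io_toposort_timing))
        print("  time in local optimizers %.3fs" % sum(time_opts.values()))

    print_profile_sketch(apply_sketch([3, 1, 2]))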
theano/gpuarray/opt_util.py

@@ -325,7 +325,7 @@ def inplace_allocempty(op, idx):
             isinstance(alloc.owner.op, GpuAllocEmpty) and
             len(alloc.clients) > 1):
         alloc_op = gpu_alloc_empty(alloc.owner.op.dtype,
                                    alloc.owner.op.context_name)
         inputs[idx] = alloc_op(*alloc.owner.inputs)
         return maker(node, inputs)
     return opt
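This hunk appears unchanged apart from whitespace in this capture, but the guard it shows is worth reading: a GpuAllocEmpty result that feeds more than one consumer must not be reused by an inplace rewrite, so a fresh allocation op is built for the inplace consumer. A toy illustration of why that guard exists, with stand-in classes:

    class AllocResult(object):
        def __init__(self, clients):
            self.clients = clients

    def private_alloc(alloc):
        # mirrors the diff's branch: re-create the allocation only when shared
        if len(alloc.clients) > 1:
            return AllocResult(clients=[])   # fresh buffer for the inplace op
        return alloc

    shared = AllocResult(clients=["consumer_a", "consumer_b"])
    assert private_alloc(shared) is not shared   # don't clobber the other user

    single = AllocResult(clients=["consumer_a"])
    assert private_alloc(single) is single       # safe to reuse in place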