testgroup / pytensor

Commit 55c0b9e6
Authored Oct 21, 2013 by Pascal Lamblin

Merge pull request #1570 from nouiz/flops

Flops

Parents: edcf97f3 913cd137
Showing 11 changed files with 105 additions and 106 deletions
doc/extending/op.txt                                   +10   -0
doc/install_ubuntu.txt                                 +10   -0
doc/tutorial/extending_theano.txt                       +9   -1
theano/compile/profiling.py                            +18  -42
theano/gof/vm.py                                       +11   -5
theano/sandbox/cuda/blas.py                            +19   -0
theano/sandbox/cuda/opt.py                              +5   -8
theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py     +1   -1
theano/tensor/blas.py                                   +5   -5
theano/tensor/nnet/conv.py                             +15  -42
theano/tensor/opt.py                                    +2   -2
doc/extending/op.txt

@@ -330,6 +330,16 @@ following methods:

     shape without computing the output itself, potentially sparing you
     a costly recomputation.

+.. function:: flops(inputs, outputs)
+
+    Optional. Only used to print extra information in the memory
+    profiler: the mega flops and giga flops per second for each apply
+    node. It takes two lists as arguments, one for the inputs and one
+    for the outputs; each list contains one tuple per corresponding
+    input/output, giving its shape.
+
 .. function:: make_thunk(node, storage_map, compute_map, no_recycling)

     TODO
...
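The `flops(inputs, outputs)` contract above can be illustrated with a small standalone sketch. `MatMulOp` is a hypothetical op used only for illustration (it is not part of Theano); only the flop-counting arithmetic is shown:

```python
# Hypothetical op illustrating the flops(inputs, outputs) contract:
# each argument is a list containing one shape tuple per variable.
class MatMulOp(object):
    def flops(self, inputs, outputs):
        (m, k), (k2, n) = inputs   # shapes of the two input matrices
        assert k == k2
        # One multiply plus one add per inner-product term.
        return m * n * (2 * k - 1)

# The profiler would call it with the shapes it recorded:
print(MatMulOp().flops([(64, 128), (128, 32)], [(64, 32)]))
```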
doc/install_ubuntu.txt

@@ -35,6 +35,16 @@ probably do something similar on older computer.

     this is not completely safe. ``easy_install`` with NumPy 1.5.1 does not
     raise this error.

+.. note::
+
+    This page describes how to install Theano for Python 2. If Python 3
+    is also installed on your system, you may need to replace the
+    ``pip`` command with ``pip-2.7`` to install for Python 2, as the
+    ``pip`` command sometimes refers to the Python 3 version.
+
+    The development version of Theano supports Python 3.3, and probably
+    supports Python 3.2 as well, but we do not test on it.
+
 Installation steps
 ~~~~~~~~~~~~~~~~~~
...
doc/tutorial/extending_theano.txt

@@ -86,7 +86,10 @@ Op Contract

         def R_op(self, inputs, eval_points):
             pass

-        def infer_shape(node, (i0_shapes, ...))
+        def infer_shape(node, (i0_shapes, ...)):
+            pass
+
+        def flops(self, inputs, outputs):
             pass

 .. ../extending/op.txt

@@ -116,6 +119,11 @@ The :func:`infer_shape` method allows to infer the shape of some variable, somew

 middle of the computational graph without actually computing the outputs (when possible).
 This could be helpful if one only needs the shape of the output instead of the actual outputs.

+The :func:`flops` method lets the memory profiler print the number of
+mega flops and giga flops per second for each apply node. It takes two
+lists as arguments, one for the inputs and one for the outputs; each
+list contains one tuple per corresponding input/output, giving its
+shape.
+
 The :func:`grad` method is required if you want to differentiate some cost whose expression
 includes your op.
...
theano/compile/profiling.py

@@ -292,32 +292,6 @@ class ProfileStats(object):

                 rval[node.op] = 'Py'
         return rval

-    def op_flops(self):
-        """dict op -> total number of flops"""
-        # timing is stored by node, we compute timing by Op on demand
-        rval = {}
-        return rval
-        # TODO: continue here
-        for node, count in self.apply_callcount.items():
-            rval.setdefault(node.op, 0)
-            rval[node.op] += 1
-        return rval
-
-        for a, t in self.op_time.items():
-            if hasattr(a, 'flops'):
-                op_flops[a] = a.flops * op_call[a] / t / 1e6
-        flops_msg = ''
-        if op_flops:
-            flops_msg = ' <MFlops/s>'
-            print('\nHACK WARNING: we print the flops for some OP, but the'
-                  ' logic does not always work. You need to know the internal'
-                  ' of Theano to make it work correctly.'
-                  ' Otherwise don\'t use!')
-        print('\nOp-wise summary:'
-              ' <%% of local_time spent on this kind of Op>'
-              ' <cumulative %%> <self seconds> <cumulative seconds>'
-              ' <time per call> %s <nb_call> <nb apply> <Op name>'
-              % (flops_msg))
-
     def summary_class(self, file=sys.stderr, N=None):
         if self.apply_time:
             local_time = sum(self.apply_time.values())

@@ -330,7 +304,6 @@ class ProfileStats(object):

         class_time = self.class_time()
         class_call = self.class_callcount()
         class_apply = self.class_nodes()
-        # class_flops = self.class_flops()
         class_impl = self.class_impl()
         if N is None:
             N = len(self.class_time)

@@ -395,12 +368,6 @@ class ProfileStats(object):

         # While this carries over less information, it is arranged such
         # that it way more readeable that the previous output of the
         # profiler
-        #if op_flops:
-        #    print >>file, ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %s %7.1f %5d %2d %s' % (
-        #        f, ftot, t, tot, t/nb_call, impl, op_flops.get(a,-1), nb_call, nb_apply, a)
-        #else:
-        #    print >>file, ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %s %5d %2d %s' % (
-        #        f, ftot, t, tot, t/nb_call, impl, nb_call, nb_apply, a)
         print >> file, ' ... (remaining %i Classes account for %6.2f%%(%.2fs) of the runtime)' \
             % (max(0, len(otimes) - N),
                sum(f for f, t, a, ci, nb_call, nb_op in otimes[N:]),

@@ -419,10 +386,7 @@ class ProfileStats(object):

         op_time = self.op_time()
         op_call = self.op_callcount()
         op_apply = self.op_nodes()
-        op_flops = self.op_flops()
         op_impl = self.op_impl()
-        if N is None:
-            N = len(self.op_flops)
         otimes = [(t * 100 / local_time,
                    t,
                    op,

@@ -484,12 +448,6 @@ class ProfileStats(object):

         # While this carries over less information, it is arranged such
         # that it way more readeable that the previous output of the
         # profiler
-        #if op_flops:
-        #    print >>file, ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %s %7.1f %5d %2d %s' % (
-        #        f, ftot, t, tot, t/nb_call, impl, op_flops.get(a,-1), nb_call, nb_apply, a)
-        #else:
-        #    print >>file, ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %s %5d %2d %s' % (
-        #        f, ftot, t, tot, t/nb_call, impl, nb_call, nb_apply, a)
         print >> file, ' ... (remaining %i Ops account for %6.2f%%(%.2fs) of the runtime)' \
             % (max(0, len(otimes) - N),
                sum(f for f, t, a, ci, nb_call, nb_op in otimes[N:]),

@@ -532,6 +490,10 @@ class ProfileStats(object):

         hs += ['<id>']
         es += ['%3d']
+        es += ['%s', '%s']
+
+        if self.variable_shape:
+            hs += ['<Mflops>', '<Gflops/s>']

         upto_length = numpy.sum([len(x) for x in hs]) + len(hs)
         maxlen = self.line_width - upto_length
         hs += ['<Apply name>']

@@ -557,8 +519,22 @@ class ProfileStats(object):

             ftot = tot * 100 / local_time
             if nb_call == 0:
                 continue
+            if not self.variable_shape:
+                flops = ""
+                flops_s = ""
+            elif hasattr(a.op, 'flops'):
+                fl = a.op.flops([self.variable_shape[var]
+                                 for var in a.inputs],
+                                [self.variable_shape[var]
+                                 for var in a.outputs])
+                flops = '%8.1f' % (fl / 1024. / 1024)
+                flops_s = '%10.1f' % (fl / 1024. / 1024 / 1024 / t)
+            else:
+                flops = "        "
+                flops_s = "          "
             print >> file, format_str % (f, ftot, t, t / nb_call,
-                                         nb_call, nd_id,
+                                         nb_call, nd_id, flops, flops_s,
                                          str(a)[:maxlen])
             if not config.profile_memory:
                 continue
...
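The two new profiler columns come from the formatting arithmetic in the last hunk. A standalone sketch of that arithmetic (binary mega/giga divisors, as in the patch; `flops_columns` is an illustrative name, not Theano API):

```python
# Turn a raw flop count and elapsed time into the <Mflops> and
# <Gflops/s> column strings, using the same format codes and the
# same 1024-based divisors as the patch.
def flops_columns(fl, t):
    mflops = '%8.1f' % (fl / 1024. / 1024)
    gflops_s = '%10.1f' % (fl / 1024. / 1024 / 1024 / t)
    return mflops, gflops_s

print(flops_columns(2 * 1024 ** 3, 0.5))
```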
theano/gof/vm.py

@@ -838,11 +838,17 @@ class VM_Linker(link.LocalLinker):

         for k in storage_map:
             compute_map[k] = [k.owner is None]

-        thunks = [node.op.make_thunk(node,
-                                     storage_map,
-                                     compute_map,
-                                     no_recycling)
-                  for node in order]
+        thunks = []
+        for node in order:
+            try:
+                thunks.append(node.op.make_thunk(node,
+                                                 storage_map,
+                                                 compute_map,
+                                                 no_recycling))
+            except Exception, e:
+                e.args = ("The following error happened while"
+                          " compiling the node", node, "\n") + e.args
+                raise

         for node, thunk in zip(order, thunks):
             thunk.inputs = [storage_map[v] for v in node.inputs]
             thunk.outputs = [storage_map[v] for v in node.outputs]
...
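The `try`/`except` added above prepends the failing node to the exception's `args`, so the user sees which node broke compilation before the original message. The same pattern in modern syntax (the commit itself uses Python 2's `except Exception, e`; `compile_all` and `bad_thunk` are illustrative stand-ins):

```python
# Wrap a per-item failure with context about which item failed,
# without losing the original exception or its traceback.
def compile_all(nodes, make_thunk):
    thunks = []
    for node in nodes:
        try:
            thunks.append(make_thunk(node))
        except Exception as e:
            e.args = ("The following error happened while"
                      " compiling the node", node, "\n") + e.args
            raise
    return thunks

def bad_thunk(node):
    raise ValueError("unsupported dtype")

try:
    compile_all(["node_a"], bad_thunk)
except ValueError as e:
    print(e.args[1], e.args[3])
```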
theano/sandbox/cuda/blas.py

@@ -621,6 +621,25 @@ class GpuConv(GpuOp):

             False, False]
         return Apply(self, [img, kern], [CudaNdarrayType(broadcastable)()])

+    def flops(self, inputs, outputs):
+        """ Useful with the hack in profilemode to print the MFlops"""
+        images, kerns = inputs
+        out, = outputs
+        assert images[1] == kerns[1]
+        flops = 0
+        if self.out_mode == "valid":
+            # nb mul and add by output pixel
+            flops = kerns[2] * kerns[3] * 2
+            # nb flops by output image
+            flops *= out[2] * out[3]
+            # nb patch multiplied
+            flops *= images[1] * kerns[0] * images[0]
+        else:
+            flops = (images[0] * kerns[0] * images[1] *
+                     kerns[2] * kerns[3] *
+                     images[2] * images[3] * 2)
+        return flops
+
     def make_thunk(self, node, storage_map, compute_map, no_recycling):
         node_ = copy.copy(node)
         assert node.op is node_.op
...
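The valid-mode arithmetic in `GpuConv.flops` above can be checked in isolation. A standalone sketch, assuming the usual shape conventions (`(batch, channels, rows, cols)` for images, `(nkern, channels, rows, cols)` for kernels):

```python
# Valid-mode convolution flop count, mirroring the branch added above.
def conv_valid_flops(images, kerns, out):
    assert images[1] == kerns[1]               # channel counts must match
    flops = kerns[2] * kerns[3] * 2            # mul + add per output pixel
    flops *= out[2] * out[3]                   # per output image
    flops *= images[1] * kerns[0] * images[0]  # channels * nkern * batch
    return flops

# A 1x1x3x3 image with a 1x1x2x2 kernel gives a 1x1x2x2 output.
print(conv_valid_flops((1, 1, 3, 3), (1, 1, 2, 2), (1, 1, 2, 2)))
```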
theano/sandbox/cuda/opt.py

@@ -14,8 +14,7 @@ import theano.ifelse

 from theano.compile import optdb
 from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB,
-                        Optimizer, toolbox, DestroyHandler,
-                        EquilibriumOptimizer)
+                        Optimizer, toolbox, DestroyHandler)
 from theano.gof.python25 import all, any
 from theano.sandbox.cuda.basic_ops import (
     device_properties, gpu_eye,

@@ -1199,12 +1198,10 @@ def local_inplace_ger(node):

 # Also, need to make the gemm optimisation(step 70) happen before the fusion of
 # elemwise(step 71)
 optdb.register('InplaceGpuBlasOpt',
-               EquilibriumOptimizer([local_inplace_gemm,
-                                     local_inplace_gemv,
-                                     local_inplace_ger,
-                                     ],
-                                    failure_callback=EquilibriumOptimizer.warn_inplace,
-                                    max_use_ratio=5),
+               tensor.opt.in2out(local_inplace_gemm,
+                                 local_inplace_gemv,
+                                 local_inplace_ger,
+                                 name="InplaceGpuBlasOpt"),
                70.0, 'fast_run', 'inplace', 'gpu')
...
theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py

@@ -869,5 +869,5 @@ def test_stack_rows_segfault_070312():

     out = theano.shared(numpy.random.rand(1, 2, 2, 3).astype('float32'))
     op = theano.tensor.nnet.conv.ConvOp(imshp=(80, 96, 96), kshp=(9, 9),
                                         nkern=1, bsize=1)
-    f = theano.function([], [], updates=[(out, op(img, kern))])
+    f = theano.function([], [], updates=[(out, op(img, kern))], mode=theano_mode)
     f()
theano/tensor/blas.py

@@ -147,7 +147,7 @@ import theano.scalar

 from theano.tensor import basic as T
 from theano.tensor.blas_headers import blas_header_text
 from theano.tensor.blas_headers import blas_header_version
-from theano.tensor.opt import local_dimshuffle_lift
+from theano.tensor.opt import local_dimshuffle_lift, in2out

 _logger = logging.getLogger('theano.tensor.blas')

@@ -1777,10 +1777,10 @@ blas_optdb.register('local_gemm_to_gemv',

 # Try to make gemm inplace
 # Also, need to make the gemm optimisation(step 70) happen before the
 # fusion of elemwise(step 71)
-blas_opt_inplace = EquilibriumOptimizer(
-    [local_inplace_gemm, local_inplace_gemv, local_inplace_ger],
-    failure_callback=EquilibriumOptimizer.warn_inplace,
-    max_use_ratio=5)
+blas_opt_inplace = in2out(local_inplace_gemm,
+                          local_inplace_gemv,
+                          local_inplace_ger,
+                          name="blas_opt_inplace")
 optdb.register('InplaceBlasOpt',
                blas_opt_inplace,
                70.0, 'fast_run', 'inplace')
...
theano/tensor/nnet/conv.py

@@ -537,8 +537,6 @@ class ConvOp(OpenMPOp):

                      time_unroll_batch_kern)
         self._rehash()
-        if config.op.set_flops:
-            self.set_flops()

     def __eq__(self, other):
         if type(self) != type(other):

@@ -567,43 +565,24 @@ class ConvOp(OpenMPOp):

         return "ConvOp{" + ",".join(str((a, getattr(self, a)))
                                     for a in self.__attrnames) + "}"

-    def set_flops(self):
+    def flops(self, inputs, outputs):
         """ Useful with the hack in profilemode to print the MFlops"""
+        images, kerns = inputs
+        out, = outputs
+        assert images[1] == kerns[1]
+        flops = 0
         if self.out_mode == "valid":
-            # nb mul and add by output pixed
-            self.flops = self.kshp[0] * self.kshp[1] * 2
+            # nb mul and add by output pixel
+            flops = kerns[2] * kerns[3] * 2
             #nb flops by output image
-            self.flops *= self.outshp[0] * self.outshp[1]
-            # for all outputs images#n_stack==self.imshp[0]
-            self.flops *= self.imshp[0] * self.nkern * self.bsize
-        else:  # full mode not implemented
-            self.flops = 0
-            for out_row in xrange(self.outshp[0]):  # loop over output row
-                for out_col in xrange(self.outshp[0]):  # loop over output col
-                    for row in xrange(self.kshp[0]):  # loop over kern row
-                        if (row + out_row - self.kshp[0] + 1 < 0 or
-                                row + out_row - self.kshp[0] + 1 >= self.imshp[1]):
-                            continue
-                        col = 0
-                        max_col = self.kshp[1]
-                        img_col = out_col - self.kshp[1] + 1
-                        max_col = min(max_col, self.imshp[2] - img_col)
-                        if img_col < 0:
-                            col = -img_col
-                            img_col += col
-                        while col < max_col:  # loop over kern col
-                            self.flops += 2
-                            col += 1
-            # for all outputs images#n_stack==self.imshp[0]
-            self.flops *= self.imshp[0] * self.nkern * self.bsize
-            assert self.flops == self.bsize * self.nkern * self.imshp[0] *\
-                self.kshp[0] * self.kshp[1] *\
-                self.imshp[1] * self.imshp[2] * 2
+            flops *= out[2] * out[3]
+            # nb patch multiplied
+            flops *= images[1] * kerns[0] * images[0]
+        else:
+            flops = (images[0] * kerns[0] * images[1] *
+                     kerns[2] * kerns[3] *
+                     images[2] * images[3] * 2)
+        return flops

     def make_node(self, inputs, kerns):
         # TODO: find a way to make ConvOp work for N-D (after NIPS09)

@@ -917,9 +896,6 @@ class ConvOp(OpenMPOp):

                      version=self.version,
                      verbose=self.verbose)
-        if hasattr(self, 'flops'):
-            dw.set_flops()
         dw = dw(img, filters)
         if all_shape:

@@ -966,9 +942,6 @@ class ConvOp(OpenMPOp):

                      version=-1,  # we we change the mode, we don't forward the version.
                      verbose=self.verbose)
-        if hasattr(self, 'flops'):
-            din.set_flops()
         din = din(gz, filters)
         assert (din.owner.op.outshp is None and self.imshp is None) or \
...
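The new non-valid branch of `ConvOp.flops` replaces the old per-pixel counting loop with a closed-form product. A standalone sketch of that product, under the same shape conventions (`(batch, channels, rows, cols)` images, `(nkern, channels, rows, cols)` kernels):

```python
# Closed-form flop count for the non-valid branch added above:
# batch * nkern * channels * kernel area * image area * 2 (mul + add).
def conv_full_flops(images, kerns):
    assert images[1] == kerns[1]   # channel counts must match
    return (images[0] * kerns[0] * images[1] *
            kerns[2] * kerns[3] *
            images[2] * images[3] * 2)

print(conv_full_flops((2, 3, 4, 4), (5, 3, 2, 2)))
```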
theano/tensor/opt.py

@@ -52,7 +52,7 @@ def out2in(*local_opts, **kwargs):

     name = (kwargs and kwargs.pop('name', None))
     if len(local_opts) > 1:
         # Don't wrap it uselessly if their is only 1 optimization.
-        local_opts = opt.LocalOptGroup(*local_opts),
+        local_opts = opt.LocalOptGroup(*local_opts)
     else:
         local_opts, = local_opts
     if not name:

@@ -71,7 +71,7 @@ def in2out(*local_opts, **kwargs):

     name = (kwargs and kwargs.pop('name', None))
     if len(local_opts) > 1:
         # Don't wrap it uselessly if their is only 1 optimization.
-        local_opts = opt.LocalOptGroup(*local_opts),
+        local_opts = opt.LocalOptGroup(*local_opts)
     else:
         local_opts, = local_opts
     if not name:
...
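The one-character fix in `out2in`/`in2out` removes a trailing comma that silently wrapped the optimizer in a 1-tuple. A minimal reproduction of the pitfall, unrelated to Theano:

```python
# In Python, a trailing comma after an expression builds a 1-tuple,
# even without parentheses. The values below differ only by that comma.
value = max(1, 2),   # tuple: (2,)
fixed = max(1, 2)    # int: 2

print(type(value).__name__, type(fixed).__name__)
```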