提交 1cf82db6 authored 作者: goodfeli's avatar goodfeli

Merge pull request #740 from nouiz/small

A few small fixes
...@@ -613,3 +613,17 @@ import theano and print the config variable, as in: ...@@ -613,3 +613,17 @@ import theano and print the config variable, as in:
If set to True, breaks certain MacOS installations with the infamous If set to True, breaks certain MacOS installations with the infamous
Bus Error. Bus Error.
.. attribute:: config.cmodule.remove_gxx_opt
Bool value, default: False
If True, will remove -O* parameter passed to g++.
This is useful for debugging modules compiled by Theano in gdb.
The parameter -g is passed by default to g++.
.. attribute:: cmodule.compilation_warning
Bool value, default: False
If True, will print compilation warning.
...@@ -21,3 +21,15 @@ can't do this as we are working with symbolic variables. You can use ...@@ -21,3 +21,15 @@ can't do this as we are working with symbolic variables. You can use
Also we can't change the above error message into a more explicit one Also we can't change the above error message into a more explicit one
because of some other Python internal behavior that can't be modified. because of some other Python internal behavior that can't be modified.
Faster gcc optimization
-----------------------
You can enable faster gcc optimization with the cxxflags. This list of flags was suggested on the mailing list::
cxxflags=-march=native -O3 -ffast-math -ftree-loop-distribution -funroll-loops -ftracer
Use it at your own risk. Some people have reported that the -ftree-loop-distribution optimization gave them wrong results in the past.
Also the -march=native must be used with care if you have NFS. In that case, you MUST set the compiledir to a local path of the computer.
...@@ -169,6 +169,51 @@ class ProfileStats(object): ...@@ -169,6 +169,51 @@ class ProfileStats(object):
global _atexit_print_list global _atexit_print_list
_atexit_print_list.append(self) _atexit_print_list.append(self)
def class_time(self):
    """Return a dict mapping each Op class to the total thunk time spent on it.

    Timing is recorded per Apply node in ``self.apply_time``; this
    aggregates those timings by the class of each node's Op on demand.
    """
    totals = {}
    for node, elapsed in self.apply_time.items():
        op_class = type(node.op)
        totals[op_class] = totals.get(op_class, 0) + elapsed
    return totals
def class_callcount(self):
    """Return a dict mapping each Op class to its total number of thunk calls.

    Call counts are recorded per Apply node in ``self.apply_callcount``;
    this aggregates them by the class of each node's Op on demand.
    """
    counts = {}
    for node, ncalls in self.apply_callcount.items():
        op_class = type(node.op)
        counts[op_class] = counts.get(op_class, 0) + ncalls
    return counts
def class_nodes(self):
    """Return a dict mapping each Op class to its number of Apply nodes.

    Counts are derived on demand from the nodes recorded in
    ``self.apply_callcount``.
    """
    # The original iterated .items() but never used the call counts;
    # only the nodes (the dict keys) are needed to count applies.
    rval = {}
    for node in self.apply_callcount:
        typ = type(node.op)
        rval[typ] = rval.get(typ, 0) + 1
    return rval
def class_impl(self):
    """Return a dict mapping each Op class to its implementation kind.

    Values are ``'C '`` when every seen node of that class used the C
    implementation, ``'Py'`` when every node used the Python one, and a
    4-character combination (e.g. ``'C Py'``) when different nodes of
    the same class used different implementations.

    Note: the original docstring ("total number of nodes") was a
    copy-paste error from class_nodes; this method reports
    implementation types, per ``self.apply_cimpl``.
    """
    rval = {}
    for node in self.apply_callcount:
        typ = type(node.op)
        if self.apply_cimpl[node]:
            impl = 'C '
        else:
            impl = 'Py'
        rval.setdefault(typ, impl)
        # If this class already recorded only the *other* kind (entry is
        # still 2 chars), append to mark the class as mixed.
        if rval[typ] != impl and len(rval[typ]) == 2:
            rval[typ] += impl
    return rval
def op_time(self): def op_time(self):
"""dict op -> total time on thunks""" """dict op -> total time on thunks"""
# timing is stored by node, we compute timing by Op on demand # timing is stored by node, we compute timing by Op on demand
...@@ -233,6 +278,95 @@ class ProfileStats(object): ...@@ -233,6 +278,95 @@ class ProfileStats(object):
' <time per call> %s <nb_call> <nb apply> <Op name>' % ( ' <time per call> %s <nb_call> <nb apply> <Op name>' % (
flops_msg)) flops_msg))
def summary_class(self, file=sys.stderr, N=None):
    """Print a profiling summary aggregated by Op class.

    :param file: file object the report is written to
        (default: sys.stderr).
    :param N: maximum number of classes to print, most expensive
        first; None prints all of them.
    """
    if self.apply_time:
        local_time = sum(self.apply_time.values())
    else:
        local_time = 0
    if local_time == 0:
        # Without any recorded time, percentages below would divide by 0.
        print >> file, ('ProfileMode.summary_class: total time 0'
                        ' (did you forget to enable counters?)')
        return
    class_time = self.class_time()
    class_call = self.class_callcount()
    class_apply = self.class_nodes()
    class_impl = self.class_impl()
    if N is None:
        # Bug fix: was len(self.class_time), i.e. len() of the bound
        # method object itself, which raises TypeError.  Use the dict
        # computed just above.
        N = len(class_time)
    # One tuple per class; the leading field (percent of total runtime)
    # is the sort key.
    otimes = [(t * 100 / local_time,
               t,
               clas,
               class_impl.get(clas, ' '),
               class_call.get(clas, 0),
               class_apply.get(clas, 0))
              for clas, t in class_time.items()]
    otimes.sort()
    otimes.reverse()
    tot = 0
    print >> file, 'Class'
    print >> file, '---'
    hs = []  # column headers
    es = []  # matching printf-style format specifiers
    hs += ['<% time>']
    es += [' %4.1f%% ']
    hs += ['<sum %>']
    es += [' %5.1f%% ']
    hs += ['<apply time>']
    es += [' %7.3fs ']
    hs += ['<time per call>']
    es += [' %8.2es ']
    hs += ['<type>']
    es += [' %2s ']
    hs += ['<#call>']
    es += [' %4d ']
    hs += ['<#apply>']
    es += [' %4d ']
    # Width consumed by the fixed columns; the class name gets the rest
    # of the line and is truncated to fit.
    upto_length = numpy.sum([len(x) for x in hs]) + len(hs)
    maxlen = self.line_width - upto_length
    hs += ['<Class name>']
    es += ['%s']
    header_str = ' '.join(hs)
    format_str = ' '.join(es)
    print >> file, header_str
    for f, t, a, impl, nb_call, nb_apply in otimes[:N]:
        if nb_call == 0:
            # A class that was never called must not have accrued time.
            assert t == 0
            continue
        tot += t
        ftot = tot * 100 / local_time
        print >> file, format_str % (f, ftot, t, t / nb_call,
                                     impl, nb_call,
                                     nb_apply, str(a)[:maxlen])
    print >> file, ' ... (remaining %i Classes account for %6.2f%%(%.2fs) of the runtime)'\
        % (max(0, len(otimes) - N),
           sum(f for f, t, a, ci, nb_call, nb_op in otimes[N:]),
           sum(t for f, t, a, ci, nb_call, nb_op in otimes[N:]))
    print >> file, ''
def summary_ops(self, file=sys.stderr, N=None): def summary_ops(self, file=sys.stderr, N=None):
if self.apply_time: if self.apply_time:
local_time = sum(self.apply_time.values()) local_time = sum(self.apply_time.values())
...@@ -426,6 +560,7 @@ class ProfileStats(object): ...@@ -426,6 +560,7 @@ class ProfileStats(object):
self.summary_function(file) self.summary_function(file)
local_time = sum(self.apply_time.values()) local_time = sum(self.apply_time.values())
if local_time > 0: if local_time > 0:
self.summary_class(file, n_ops_to_print)
self.summary_ops(file, n_ops_to_print) self.summary_ops(file, n_ops_to_print)
self.summary_nodes(file, n_applies_to_print) self.summary_nodes(file, n_applies_to_print)
elif self.fct_callcount > 0: elif self.fct_callcount > 0:
......
...@@ -39,6 +39,17 @@ AddConfigVar('cmodule.warn_no_version', ...@@ -39,6 +39,17 @@ AddConfigVar('cmodule.warn_no_version',
"with C code that can't be cached because there is no " "with C code that can't be cached because there is no "
"c_code_cache_version() function associated to at least one of " "c_code_cache_version() function associated to at least one of "
"those Ops.", "those Ops.",
BoolParam(False),
in_c_key=False)
AddConfigVar('cmodule.remove_gxx_opt',
"If True, will remove -O* parameter passed to g++."
"This is useful to debug in gdb module compiled by Theano."
"The parameter -g is passed by default to g++",
BoolParam(False))
AddConfigVar('cmodule.compilation_warning',
"If True, will print compilation warning.",
BoolParam(False)) BoolParam(False))
...@@ -1481,8 +1492,6 @@ class GCC_compiler(object): ...@@ -1481,8 +1492,6 @@ class GCC_compiler(object):
# We also add "-m64", in case the installed gcc is 32-bit # We also add "-m64", in case the installed gcc is 32-bit
preargs.append('-m64') preargs.append('-m64')
no_opt = False
include_dirs = include_dirs + std_include_dirs() include_dirs = include_dirs + std_include_dirs()
libs = std_libs() + libs libs = std_libs() + libs
lib_dirs = std_lib_dirs() + lib_dirs lib_dirs = std_lib_dirs() + lib_dirs
...@@ -1529,7 +1538,8 @@ class GCC_compiler(object): ...@@ -1529,7 +1538,8 @@ class GCC_compiler(object):
_logger.debug('Generating shared lib %s', lib_filename) _logger.debug('Generating shared lib %s', lib_filename)
cmd = ['g++', get_gcc_shared_library_arg(), '-g'] cmd = ['g++', get_gcc_shared_library_arg(), '-g']
if no_opt:
if config.cmodule.remove_gxx_opt:
cmd.extend(p for p in preargs if not p.startswith('-O')) cmd.extend(p for p in preargs if not p.startswith('-O'))
else: else:
cmd.extend(preargs) cmd.extend(preargs)
...@@ -1572,6 +1582,9 @@ class GCC_compiler(object): ...@@ -1572,6 +1582,9 @@ class GCC_compiler(object):
# difficult to read. # difficult to read.
raise Exception('Compilation failed (return status=%s): %s' % raise Exception('Compilation failed (return status=%s): %s' %
(status, compile_stderr.replace('\n', '. '))) (status, compile_stderr.replace('\n', '. ')))
elif config.cmodule.compilation_warning and compile_stderr:
# Print errors just below the command line.
print compile_stderr
#touch the __init__ file #touch the __init__ file
file(os.path.join(location, "__init__.py"), 'w').close() file(os.path.join(location, "__init__.py"), 'w').close()
......
...@@ -179,17 +179,6 @@ def grad_sources_inputs(sources, graph_inputs, warn_type=True): ...@@ -179,17 +179,6 @@ def grad_sources_inputs(sources, graph_inputs, warn_type=True):
_logger.warning('%s.grad returned a different type (%s) ' _logger.warning('%s.grad returned a different type (%s) '
'for input %i of type (%s)', 'for input %i of type (%s)',
node.op, g_r_type, ii, r_type) node.op, g_r_type, ii, r_type)
#The following name assignment code is broken
#for example, when you call
#f = T.dot(x,T.dot(A,x))
#f.name = 'f'
#T.grad( f, x)
#the result has no name, and is composed of
# A x + A^T x
# with both terms named "(df/dx)"
#if g_r is not None and len(sources) == 1 and sources[0][0].name \
# and r.name:
# g_r.name = "(d%s/d%s)" % (sources[0][0].name, r.name)
if g_r is not None: if g_r is not None:
assert r is not None assert r is not None
if r in gmap: if r in gmap:
......
...@@ -1924,7 +1924,7 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp): ...@@ -1924,7 +1924,7 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
assert self.perform_using_take == True, ( assert self.perform_using_take == True, (
"GpuAdvancedSubtensor1 used the fast version") "GpuAdvancedSubtensor1 used the fast version")
if idx.dtype != numpy.int64: if idx.dtype != numpy.int64:
if idx.dtype in [numpy.int8, numpyt.int16, numpy.int32, if idx.dtype in [numpy.int8, numpy.int16, numpy.int32,
numpy.int64, numpy.uint8, numpy.uint16, numpy.int64, numpy.uint8, numpy.uint16,
numpy.uint32]: numpy.uint32]:
idx = idx.astype(numpy.int64) idx = idx.astype(numpy.int64)
......
...@@ -325,6 +325,8 @@ class NVCC_compiler(object): ...@@ -325,6 +325,8 @@ class NVCC_compiler(object):
print >> sys.stderr, i + 1, l print >> sys.stderr, i + 1, l
raise Exception('nvcc return status', p.returncode, raise Exception('nvcc return status', p.returncode,
'for cmd', ' '.join(cmd)) 'for cmd', ' '.join(cmd))
elif config.cmodule.compilation_warning and nvcc_stdout:
print nvcc_stdout
#touch the __init__ file #touch the __init__ file
file(os.path.join(location, "__init__.py"), 'w').close() file(os.path.join(location, "__init__.py"), 'w').close()
......
...@@ -866,6 +866,8 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor): ...@@ -866,6 +866,8 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor):
(rand(1025, 67000), [5, 10], True), (rand(1025, 67000), [5, 10], True),
(rand(3, 10, 68000), [1, 2], True), (rand(3, 10, 68000), [1, 2], True),
(rand(3, 69000, 11), [1, 2], True), (rand(3, 69000, 11), [1, 2], True),
# use too much memory to enable by default.
#(rand(2*10e7), [-1, 199999999], True),
(rand(4, 5), [2, 3], True), (rand(4, 5), [2, 3], True),
(rand(4, 2, 3), [0, 3], True), (rand(4, 2, 3), [0, 3], True),
(rand(4, 2, 3), [3, 3, 1, 1, 2, (rand(4, 2, 3), [3, 3, 1, 1, 2,
...@@ -879,7 +881,7 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor): ...@@ -879,7 +881,7 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor):
-1, -2, -3, -4], False), -1, -2, -3, -4], False),
]: ]:
data = numpy.asarray(data, dtype=self.dtype) data = numpy.asarray(data, dtype=self.dtype)
n = self.shared(data) n = self.shared(data, borrow=True)
# Test with c_contiguous input # Test with c_contiguous input
t = self.adv_sub1()(n, idx) t = self.adv_sub1()(n, idx)
......
...@@ -5,7 +5,7 @@ import theano ...@@ -5,7 +5,7 @@ import theano
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
from theano.tensor.extra_ops import * from theano.tensor.extra_ops import *
from theano import tensor as T from theano import tensor as T
from theano import tensor, function, scalar from theano import config, tensor, function, scalar
class TestBinCountOp(utt.InferShapeTester): class TestBinCountOp(utt.InferShapeTester):
...@@ -16,9 +16,9 @@ class TestBinCountOp(utt.InferShapeTester): ...@@ -16,9 +16,9 @@ class TestBinCountOp(utt.InferShapeTester):
def test_bincountOp(self): def test_bincountOp(self):
x = T.lvector('x') x = T.lvector('x')
w = T.dvector('w') w = T.vector('w')
a = np.random.random_integers(50, size=(25)) a = np.random.random_integers(50, size=(25))
weights = np.random.random((25,)) weights = np.random.random((25,)).astype(config.floatX)
f1 = theano.function([x], bincount(x)) f1 = theano.function([x], bincount(x))
f2 = theano.function([x, w], bincount(x, weights=w)) f2 = theano.function([x, w], bincount(x, weights=w))
...@@ -38,7 +38,7 @@ class TestBinCountOp(utt.InferShapeTester): ...@@ -38,7 +38,7 @@ class TestBinCountOp(utt.InferShapeTester):
[np.random.random_integers(50, size=(25,))], [np.random.random_integers(50, size=(25,))],
self.op_class) self.op_class)
weights = np.random.random((25,)) weights = np.random.random((25,)).astype(config.floatX)
self._compile_and_check([x], self._compile_and_check([x],
[bincount(x, weights=weights)], [bincount(x, weights=weights)],
[np.random.random_integers(50, size=(25,))], [np.random.random_integers(50, size=(25,))],
...@@ -64,8 +64,8 @@ class TestDiffOp(utt.InferShapeTester): ...@@ -64,8 +64,8 @@ class TestDiffOp(utt.InferShapeTester):
self.op = DiffOp() self.op = DiffOp()
def test_diffOp(self): def test_diffOp(self):
x = T.dmatrix('x') x = T.matrix('x')
a = np.random.random((30, 50)) a = np.random.random((30, 50)).astype(config.floatX)
f = theano.function([x], diff(x)) f = theano.function([x], diff(x))
assert np.allclose(np.diff(a), f(a)) assert np.allclose(np.diff(a), f(a))
...@@ -76,8 +76,8 @@ class TestDiffOp(utt.InferShapeTester): ...@@ -76,8 +76,8 @@ class TestDiffOp(utt.InferShapeTester):
assert np.allclose(np.diff(a, n=k, axis=axis), g(a)) assert np.allclose(np.diff(a, n=k, axis=axis), g(a))
def test_infer_shape(self): def test_infer_shape(self):
x = T.dmatrix('x') x = T.matrix('x')
a = np.random.random((30, 50)) a = np.random.random((30, 50)).astype(config.floatX)
self._compile_and_check([x], self._compile_and_check([x],
[self.op(x)], [self.op(x)],
...@@ -93,14 +93,14 @@ class TestDiffOp(utt.InferShapeTester): ...@@ -93,14 +93,14 @@ class TestDiffOp(utt.InferShapeTester):
def test_grad(self): def test_grad(self):
x = T.vector('x') x = T.vector('x')
a = np.random.random(50) a = np.random.random(50).astype(config.floatX)
gf = theano.function([x], T.grad(T.sum(diff(x)), x)) gf = theano.function([x], T.grad(T.sum(diff(x)), x))
utt.verify_grad(self.op, [a]) utt.verify_grad(self.op, [a])
for k in range(TestDiffOp.nb): for k in range(TestDiffOp.nb):
dg = theano.function([x], T.grad(T.sum(diff(x, n=k)), x)) dg = theano.function([x], T.grad(T.sum(diff(x, n=k)), x))
utt.verify_grad(DiffOp(n=k), [a]) utt.verify_grad(DiffOp(n=k), [a], eps=7e-3)
class TestSqueezeOp(utt.InferShapeTester): class TestSqueezeOp(utt.InferShapeTester):
...@@ -110,27 +110,27 @@ class TestSqueezeOp(utt.InferShapeTester): ...@@ -110,27 +110,27 @@ class TestSqueezeOp(utt.InferShapeTester):
self.op = SqueezeOp(out_nd=1) self.op = SqueezeOp(out_nd=1)
def test_squeezeOp(self): def test_squeezeOp(self):
x = T.dmatrix('x') x = T.matrix('x')
a = np.random.random((1, 50)) a = np.random.random((1, 50)).astype(config.floatX)
f = theano.function([x], squeeze(x, out_nd=1)) f = theano.function([x], squeeze(x, out_nd=1))
assert np.allclose(np.squeeze(a), f(a)) assert np.allclose(np.squeeze(a), f(a))
x = T.dtensor4('x') x = T.tensor4('x')
f = theano.function([x], squeeze(x, out_nd=2)) f = theano.function([x], squeeze(x, out_nd=2))
a = np.random.random((1, 1, 2, 3)) a = np.random.random((1, 1, 2, 3)).astype(config.floatX)
assert np.allclose(np.squeeze(a), f(a)) assert np.allclose(np.squeeze(a), f(a))
a = np.random.random((1, 2, 2, 1)) a = np.random.random((1, 2, 2, 1)).astype(config.floatX)
assert np.allclose(np.squeeze(a), f(a)) assert np.allclose(np.squeeze(a), f(a))
a = np.random.random((4, 1, 2, 1)) a = np.random.random((4, 1, 2, 1)).astype(config.floatX)
assert np.allclose(np.squeeze(a), f(a)) assert np.allclose(np.squeeze(a), f(a))
def test_grad(self): def test_grad(self):
x = T.dtensor4('x') x = T.tensor4('x')
a = np.random.random((1, 1, 3, 4)) a = np.random.random((1, 1, 3, 4)).astype(config.floatX)
gf = theano.function([x], T.grad(T.sum(squeeze(x, out_nd=1)), x)) gf = theano.function([x], T.grad(T.sum(squeeze(x, out_nd=1)), x))
utt.verify_grad(SqueezeOp(out_nd=2), [a]) utt.verify_grad(SqueezeOp(out_nd=2), [a])
...@@ -147,8 +147,8 @@ class TestRepeatOp(utt.InferShapeTester): ...@@ -147,8 +147,8 @@ class TestRepeatOp(utt.InferShapeTester):
def test_repeatOp(self): def test_repeatOp(self):
for ndim in range(3): for ndim in range(3):
x = T.TensorType(theano.config.floatX, [False] * ndim)() x = T.TensorType(config.floatX, [False] * ndim)()
a = np.random.random((10, ) * ndim) a = np.random.random((10, ) * ndim).astype(config.floatX)
for axis in self._possible_axis(ndim): for axis in self._possible_axis(ndim):
r_var = T.lscalar() r_var = T.lscalar()
...@@ -167,8 +167,8 @@ class TestRepeatOp(utt.InferShapeTester): ...@@ -167,8 +167,8 @@ class TestRepeatOp(utt.InferShapeTester):
def test_infer_shape(self): def test_infer_shape(self):
for ndim in range(4): for ndim in range(4):
x = T.TensorType(theano.config.floatX, [False] * ndim)() x = T.TensorType(config.floatX, [False] * ndim)()
a = np.random.random((10, ) * ndim) a = np.random.random((10, ) * ndim).astype(config.floatX)
for axis in self._possible_axis(ndim): for axis in self._possible_axis(ndim):
r_var = T.lscalar() r_var = T.lscalar()
...@@ -191,7 +191,7 @@ class TestRepeatOp(utt.InferShapeTester): ...@@ -191,7 +191,7 @@ class TestRepeatOp(utt.InferShapeTester):
def test_grad(self): def test_grad(self):
for ndim in range(3): for ndim in range(3):
a = np.random.random((10, ) * ndim) a = np.random.random((10, ) * ndim).astype(config.floatX)
for axis in self._possible_axis(ndim): for axis in self._possible_axis(ndim):
utt.verify_grad(lambda x: RepeatOp(axis=axis)(x, 3), [a]) utt.verify_grad(lambda x: RepeatOp(axis=axis)(x, 3), [a])
...@@ -240,23 +240,23 @@ class TestFillDiagonal(utt.InferShapeTester): ...@@ -240,23 +240,23 @@ class TestFillDiagonal(utt.InferShapeTester):
self.op = fill_diagonal self.op = fill_diagonal
def test_perform(self): def test_perform(self):
x = tensor.dmatrix() x = tensor.matrix()
y = tensor.dscalar() y = tensor.scalar()
f = function([x, y], fill_diagonal(x, y)) f = function([x, y], fill_diagonal(x, y))
for shp in [(8, 8), (5, 8), (8, 5)]: for shp in [(8, 8), (5, 8), (8, 5)]:
a = numpy.random.rand(*shp) a = numpy.random.rand(*shp).astype(config.floatX)
val = numpy.random.rand() val = numpy.cast[config.floatX](numpy.random.rand())
out = f(a, val) out = f(a, val)
# We can't use numpy.fill_diagonal as it is bugged. # We can't use numpy.fill_diagonal as it is bugged.
assert numpy.allclose(numpy.diag(out), val) assert numpy.allclose(numpy.diag(out), val)
assert (out == val).sum() == min(a.shape) assert (out == val).sum() == min(a.shape)
# test for 3d tensor # test for 3d tensor
a = numpy.random.rand(3, 3, 3) a = numpy.random.rand(3, 3, 3).astype(config.floatX)
x = tensor.dtensor3() x = tensor.tensor3()
y = tensor.dscalar() y = tensor.scalar()
f = function([x, y], fill_diagonal(x, y)) f = function([x, y], fill_diagonal(x, y))
val = numpy.random.rand() + 10 val = numpy.cast[config.floatX](numpy.random.rand() + 10)
out = f(a, val) out = f(a, val)
# We can't use numpy.fill_diagonal as it is bugged. # We can't use numpy.fill_diagonal as it is bugged.
assert out[0, 0, 0] == val assert out[0, 0, 0] == val
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论