提交 1cf82db6 authored 作者: goodfeli's avatar goodfeli

Merge pull request #740 from nouiz/small

A few small fixes
......@@ -613,3 +613,17 @@ import theano and print the config variable, as in:
If set to True, breaks certain MacOS installations with the infamous
Bus Error.
.. attribute:: config.cmodule.remove_gxx_opt
Bool value, default: False
If True, will remove -O* parameter passed to g++.
This is useful for debugging modules compiled by Theano in gdb.
The parameter -g is passed by default to g++.
.. attribute:: config.cmodule.compilation_warning
Bool value, default: False
If True, will print compilation warning.
......@@ -21,3 +21,15 @@ can't do this as we are working with symbolic variables. You can use
Also we can't change the above error message into a more explicit one
because of some other Python internal behavior that can't be modified.
Faster gcc optimization
-----------------------
You can enable faster gcc optimization with the cxxflags. This list of flags was suggested on the mailing list::
cxxflags=-march=native -O3 -ffast-math -ftree-loop-distribution -funroll-loops -ftracer
Use it at your own risk. Some people have warned that the -ftree-loop-distribution optimization gave them wrong results in the past.
Also the -march=native must be used with care if you have NFS. In that case, you MUST set the compiledir to a local path of the computer.
......@@ -169,6 +169,51 @@ class ProfileStats(object):
global _atexit_print_list
_atexit_print_list.append(self)
def class_time(self):
    """Return a dict mapping each Op class to the total time spent in its thunks."""
    # Timings are recorded per Apply node; fold them into per-class totals.
    totals = {}
    for node, elapsed in self.apply_time.items():
        key = type(node.op)
        totals[key] = totals.get(key, 0) + elapsed
    return totals
def class_callcount(self):
    """Return a dict mapping each Op class to its total number of thunk calls."""
    # Call counts are recorded per Apply node; aggregate them by Op class.
    counts = {}
    for node, n_calls in self.apply_callcount.items():
        counts[type(node.op)] = counts.get(type(node.op), 0) + n_calls
    return counts
def class_nodes(self):
    """Return a dict mapping each Op class to its total number of Apply nodes."""
    # Counts are recorded per Apply node; we only need one increment per
    # node, so iterate the keys of apply_callcount directly instead of
    # unpacking .items() and ignoring the value (as the old code did).
    rval = {}
    for node in self.apply_callcount:
        typ = type(node.op)
        rval.setdefault(typ, 0)
        rval[typ] += 1
    return rval
def class_impl(self):
    """Return a dict mapping each Op class to which implementation ran.

    The value is 'C ' if every node of that class used the C
    implementation, 'Py' if every node used the Python implementation,
    or 'C Py' if both were used.  (The old docstring, "total number of
    nodes", was a copy-paste error.)
    """
    # Implementation flags are recorded per Apply node; aggregate by class.
    rval = {}
    for node in self.apply_callcount:
        typ = type(node.op)
        if self.apply_cimpl[node]:
            impl = 'C '
        else:
            impl = 'Py'
        rval.setdefault(typ, impl)
        # A mixed class gets both markers appended once; the len == 2
        # guard prevents appending more than one extra marker.
        if rval[typ] != impl and len(rval[typ]) == 2:
            rval[typ] += impl
    return rval
def op_time(self):
"""dict op -> total time on thunks"""
# timing is stored by node, we compute timing by Op on demand
......@@ -233,6 +278,95 @@ class ProfileStats(object):
' <time per call> %s <nb_call> <nb apply> <Op name>' % (
flops_msg))
def summary_class(self, file=sys.stderr, N=None):
    """Print a profile summary aggregated by Op class to `file`.

    :param file: file-like object the report is written to
                 (default: sys.stderr).
    :param N: print at most N classes, most expensive first;
              by default all classes are printed.
    """
    if self.apply_time:
        local_time = sum(self.apply_time.values())
    else:
        local_time = 0
    if local_time == 0:
        print >> file, ('ProfileMode.summary_class: total time 0'
                        ' (did you forget to enable counters?)')
        return
    class_time = self.class_time()
    class_call = self.class_callcount()
    class_apply = self.class_nodes()
    # class_flops = self.class_flops()
    class_impl = self.class_impl()
    if N is None:
        # Bug fix: this used to be len(self.class_time), which takes len()
        # of the bound method itself and raises TypeError. We want the
        # number of entries in the computed per-class timing dict.
        N = len(class_time)
    # One tuple per class: (% of total time, time, class, impl marker,
    # call count, apply count) -- sorted so the most expensive come first.
    otimes = [(t * 100 / local_time,
               t,
               clas,
               class_impl.get(clas, ' '),
               class_call.get(clas, 0),
               class_apply.get(clas, 0))
              for clas, t in class_time.items()]
    otimes.sort()
    otimes.reverse()
    tot = 0
    print >> file, 'Class'
    print >> file, '---'
    #print >> file, '<% time> <cumulative %%> <apply time>,'
    #print >>file, '<cumulative seconds> <time per call> <nb_call>'
    #print >>file, '<Class name>'
    hs = []
    # formatting string
    es = []
    hs += ['<% time>']
    es += [' %4.1f%% ']
    hs += ['<sum %>']
    es += [' %5.1f%% ']
    hs += ['<apply time>']
    es += [' %7.3fs ']
    hs += ['<time per call>']
    es += [' %8.2es ']
    hs += ['<type>']
    es += [' %2s ']
    hs += ['<#call>']
    es += [' %4d ']
    hs += ['<#apply>']
    es += [' %4d ']
    # Width left over for the class name after all fixed-width columns.
    upto_length = numpy.sum([len(x) for x in hs]) + len(hs)
    maxlen = self.line_width - upto_length
    hs += ['<Class name>']
    es += ['%s']
    header_str = ' '.join(hs)
    format_str = ' '.join(es)
    print >> file, header_str
    for f, t, a, impl, nb_call, nb_apply in otimes[:N]:
        if nb_call == 0:
            # A class with zero calls must have accumulated zero time.
            assert t == 0
            continue
        tot += t
        ftot = tot * 100 / local_time
        print >> file, format_str % (f, ftot, t, t / nb_call,
                                     impl, nb_call,
                                     nb_apply, str(a)[:maxlen])
        # While this carries over less information, it is arranged such
        # that it is way more readable than the previous output of the
        # profiler
        #if op_flops:
        #    print >>file, ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %s %7.1f %5d %2d %s' % (
        #            f, ftot, t, tot, t/nb_call, impl, op_flops.get(a,-1), nb_call, nb_apply, a)
        #else:
        #    print >>file, ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %s %5d %2d %s' % (
        #            f, ftot, t, tot, t/nb_call, impl, nb_call, nb_apply, a)
    print >>file, '   ... (remaining %i Classes account for %6.2f%%(%.2fs) of the runtime)'\
            % (max(0, len(otimes) - N),
               sum(f for f, t, a, ci, nb_call, nb_op in otimes[N:]),
               sum(t for f, t, a, ci, nb_call, nb_op in otimes[N:]))
    print >> file, ''
def summary_ops(self, file=sys.stderr, N=None):
if self.apply_time:
local_time = sum(self.apply_time.values())
......@@ -426,6 +560,7 @@ class ProfileStats(object):
self.summary_function(file)
local_time = sum(self.apply_time.values())
if local_time > 0:
self.summary_class(file, n_ops_to_print)
self.summary_ops(file, n_ops_to_print)
self.summary_nodes(file, n_applies_to_print)
elif self.fct_callcount > 0:
......
......@@ -39,6 +39,17 @@ AddConfigVar('cmodule.warn_no_version',
"with C code that can't be cached because there is no "
"c_code_cache_version() function associated to at least one of "
"those Ops.",
BoolParam(False),
in_c_key=False)
# Register config flags controlling C-module compilation behaviour.
# Bug fix: the adjacent string literals below were missing separating
# spaces, so the rendered help text ran sentences together
# ("...g++.This is useful...Theano.The parameter...").
AddConfigVar('cmodule.remove_gxx_opt',
             "If True, will remove -O* parameter passed to g++. "
             "This is useful to debug in gdb module compiled by Theano. "
             "The parameter -g is passed by default to g++",
             BoolParam(False))

AddConfigVar('cmodule.compilation_warning',
             "If True, will print compilation warning.",
             BoolParam(False))
......@@ -1481,8 +1492,6 @@ class GCC_compiler(object):
# We also add "-m64", in case the installed gcc is 32-bit
preargs.append('-m64')
no_opt = False
include_dirs = include_dirs + std_include_dirs()
libs = std_libs() + libs
lib_dirs = std_lib_dirs() + lib_dirs
......@@ -1529,7 +1538,8 @@ class GCC_compiler(object):
_logger.debug('Generating shared lib %s', lib_filename)
cmd = ['g++', get_gcc_shared_library_arg(), '-g']
if no_opt:
if config.cmodule.remove_gxx_opt:
cmd.extend(p for p in preargs if not p.startswith('-O'))
else:
cmd.extend(preargs)
......@@ -1572,6 +1582,9 @@ class GCC_compiler(object):
# difficult to read.
raise Exception('Compilation failed (return status=%s): %s' %
(status, compile_stderr.replace('\n', '. ')))
elif config.cmodule.compilation_warning and compile_stderr:
# Print errors just below the command line.
print compile_stderr
#touch the __init__ file
file(os.path.join(location, "__init__.py"), 'w').close()
......
......@@ -179,17 +179,6 @@ def grad_sources_inputs(sources, graph_inputs, warn_type=True):
_logger.warning('%s.grad returned a different type (%s) '
'for input %i of type (%s)',
node.op, g_r_type, ii, r_type)
#The following name assignment code is broken
#for example, when you call
#f = T.dot(x,T.dot(A,x))
#f.name = 'f'
#T.grad( f, x)
#the result has no name, and is composed of
# A x + A^T x
# with both terms named "(df/dx)"
#if g_r is not None and len(sources) == 1 and sources[0][0].name \
# and r.name:
# g_r.name = "(d%s/d%s)" % (sources[0][0].name, r.name)
if g_r is not None:
assert r is not None
if r in gmap:
......
......@@ -1924,7 +1924,7 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
assert self.perform_using_take == True, (
"GpuAdvancedSubtensor1 used the fast version")
if idx.dtype != numpy.int64:
if idx.dtype in [numpy.int8, numpyt.int16, numpy.int32,
if idx.dtype in [numpy.int8, numpy.int16, numpy.int32,
numpy.int64, numpy.uint8, numpy.uint16,
numpy.uint32]:
idx = idx.astype(numpy.int64)
......
......@@ -325,6 +325,8 @@ class NVCC_compiler(object):
print >> sys.stderr, i + 1, l
raise Exception('nvcc return status', p.returncode,
'for cmd', ' '.join(cmd))
elif config.cmodule.compilation_warning and nvcc_stdout:
print nvcc_stdout
#touch the __init__ file
file(os.path.join(location, "__init__.py"), 'w').close()
......
......@@ -866,6 +866,8 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor):
(rand(1025, 67000), [5, 10], True),
(rand(3, 10, 68000), [1, 2], True),
(rand(3, 69000, 11), [1, 2], True),
# use too much memory to enable by default.
#(rand(2*10e7), [-1, 199999999], True),
(rand(4, 5), [2, 3], True),
(rand(4, 2, 3), [0, 3], True),
(rand(4, 2, 3), [3, 3, 1, 1, 2,
......@@ -879,7 +881,7 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor):
-1, -2, -3, -4], False),
]:
data = numpy.asarray(data, dtype=self.dtype)
n = self.shared(data)
n = self.shared(data, borrow=True)
# Test with c_contiguous input
t = self.adv_sub1()(n, idx)
......
......@@ -5,7 +5,7 @@ import theano
from theano.tests import unittest_tools as utt
from theano.tensor.extra_ops import *
from theano import tensor as T
from theano import tensor, function, scalar
from theano import config, tensor, function, scalar
class TestBinCountOp(utt.InferShapeTester):
......@@ -16,9 +16,9 @@ class TestBinCountOp(utt.InferShapeTester):
def test_bincountOp(self):
x = T.lvector('x')
w = T.dvector('w')
w = T.vector('w')
a = np.random.random_integers(50, size=(25))
weights = np.random.random((25,))
weights = np.random.random((25,)).astype(config.floatX)
f1 = theano.function([x], bincount(x))
f2 = theano.function([x, w], bincount(x, weights=w))
......@@ -38,7 +38,7 @@ class TestBinCountOp(utt.InferShapeTester):
[np.random.random_integers(50, size=(25,))],
self.op_class)
weights = np.random.random((25,))
weights = np.random.random((25,)).astype(config.floatX)
self._compile_and_check([x],
[bincount(x, weights=weights)],
[np.random.random_integers(50, size=(25,))],
......@@ -64,8 +64,8 @@ class TestDiffOp(utt.InferShapeTester):
self.op = DiffOp()
def test_diffOp(self):
x = T.dmatrix('x')
a = np.random.random((30, 50))
x = T.matrix('x')
a = np.random.random((30, 50)).astype(config.floatX)
f = theano.function([x], diff(x))
assert np.allclose(np.diff(a), f(a))
......@@ -76,8 +76,8 @@ class TestDiffOp(utt.InferShapeTester):
assert np.allclose(np.diff(a, n=k, axis=axis), g(a))
def test_infer_shape(self):
x = T.dmatrix('x')
a = np.random.random((30, 50))
x = T.matrix('x')
a = np.random.random((30, 50)).astype(config.floatX)
self._compile_and_check([x],
[self.op(x)],
......@@ -93,14 +93,14 @@ class TestDiffOp(utt.InferShapeTester):
def test_grad(self):
x = T.vector('x')
a = np.random.random(50)
a = np.random.random(50).astype(config.floatX)
gf = theano.function([x], T.grad(T.sum(diff(x)), x))
utt.verify_grad(self.op, [a])
for k in range(TestDiffOp.nb):
dg = theano.function([x], T.grad(T.sum(diff(x, n=k)), x))
utt.verify_grad(DiffOp(n=k), [a])
utt.verify_grad(DiffOp(n=k), [a], eps=7e-3)
class TestSqueezeOp(utt.InferShapeTester):
......@@ -110,27 +110,27 @@ class TestSqueezeOp(utt.InferShapeTester):
self.op = SqueezeOp(out_nd=1)
def test_squeezeOp(self):
x = T.dmatrix('x')
a = np.random.random((1, 50))
x = T.matrix('x')
a = np.random.random((1, 50)).astype(config.floatX)
f = theano.function([x], squeeze(x, out_nd=1))
assert np.allclose(np.squeeze(a), f(a))
x = T.dtensor4('x')
x = T.tensor4('x')
f = theano.function([x], squeeze(x, out_nd=2))
a = np.random.random((1, 1, 2, 3))
a = np.random.random((1, 1, 2, 3)).astype(config.floatX)
assert np.allclose(np.squeeze(a), f(a))
a = np.random.random((1, 2, 2, 1))
a = np.random.random((1, 2, 2, 1)).astype(config.floatX)
assert np.allclose(np.squeeze(a), f(a))
a = np.random.random((4, 1, 2, 1))
a = np.random.random((4, 1, 2, 1)).astype(config.floatX)
assert np.allclose(np.squeeze(a), f(a))
def test_grad(self):
x = T.dtensor4('x')
a = np.random.random((1, 1, 3, 4))
x = T.tensor4('x')
a = np.random.random((1, 1, 3, 4)).astype(config.floatX)
gf = theano.function([x], T.grad(T.sum(squeeze(x, out_nd=1)), x))
utt.verify_grad(SqueezeOp(out_nd=2), [a])
......@@ -147,8 +147,8 @@ class TestRepeatOp(utt.InferShapeTester):
def test_repeatOp(self):
for ndim in range(3):
x = T.TensorType(theano.config.floatX, [False] * ndim)()
a = np.random.random((10, ) * ndim)
x = T.TensorType(config.floatX, [False] * ndim)()
a = np.random.random((10, ) * ndim).astype(config.floatX)
for axis in self._possible_axis(ndim):
r_var = T.lscalar()
......@@ -167,8 +167,8 @@ class TestRepeatOp(utt.InferShapeTester):
def test_infer_shape(self):
for ndim in range(4):
x = T.TensorType(theano.config.floatX, [False] * ndim)()
a = np.random.random((10, ) * ndim)
x = T.TensorType(config.floatX, [False] * ndim)()
a = np.random.random((10, ) * ndim).astype(config.floatX)
for axis in self._possible_axis(ndim):
r_var = T.lscalar()
......@@ -191,7 +191,7 @@ class TestRepeatOp(utt.InferShapeTester):
def test_grad(self):
for ndim in range(3):
a = np.random.random((10, ) * ndim)
a = np.random.random((10, ) * ndim).astype(config.floatX)
for axis in self._possible_axis(ndim):
utt.verify_grad(lambda x: RepeatOp(axis=axis)(x, 3), [a])
......@@ -240,23 +240,23 @@ class TestFillDiagonal(utt.InferShapeTester):
self.op = fill_diagonal
def test_perform(self):
x = tensor.dmatrix()
y = tensor.dscalar()
x = tensor.matrix()
y = tensor.scalar()
f = function([x, y], fill_diagonal(x, y))
for shp in [(8, 8), (5, 8), (8, 5)]:
a = numpy.random.rand(*shp)
val = numpy.random.rand()
a = numpy.random.rand(*shp).astype(config.floatX)
val = numpy.cast[config.floatX](numpy.random.rand())
out = f(a, val)
# We can't use numpy.fill_diagonal as it is bugged.
assert numpy.allclose(numpy.diag(out), val)
assert (out == val).sum() == min(a.shape)
# test for 3d tensor
a = numpy.random.rand(3, 3, 3)
x = tensor.dtensor3()
y = tensor.dscalar()
a = numpy.random.rand(3, 3, 3).astype(config.floatX)
x = tensor.tensor3()
y = tensor.scalar()
f = function([x, y], fill_diagonal(x, y))
val = numpy.random.rand() + 10
val = numpy.cast[config.floatX](numpy.random.rand() + 10)
out = f(a, val)
# We can't use numpy.fill_diagonal as it is bugged.
assert out[0, 0, 0] == val
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论