提交 1cf82db6 authored 作者: goodfeli's avatar goodfeli

Merge pull request #740 from nouiz/small

A few small fixes
...@@ -613,3 +613,17 @@ import theano and print the config variable, as in: ...@@ -613,3 +613,17 @@ import theano and print the config variable, as in:
If set to True, breaks certain MacOS installations with the infamous If set to True, breaks certain MacOS installations with the infamous
Bus Error. Bus Error.
.. attribute:: config.cmodule.remove_gxx_opt
Bool value, default: False
If True, will remove -O* parameter passed to g++.
This is useful for debugging modules compiled by Theano in gdb.
The parameter -g is passed by default to g++.
.. attribute:: cmodule.compilation_warning
Bool value, default: False
If True, will print compilation warning.
...@@ -21,3 +21,15 @@ can't do this as we are working with symbolic variables. You can use ...@@ -21,3 +21,15 @@ can't do this as we are working with symbolic variables. You can use
Also we can't change the above error message into a more explicit one Also we can't change the above error message into a more explicit one
because of some other Python internal behavior that can't be modified. because of some other Python internal behavior that can't be modified.
Faster gcc optimization
-----------------------
You can enable faster gcc optimization with the cxxflags. This list of flags was suggested on the mailing list::
cxxflags=-march=native -O3 -ffast-math -ftree-loop-distribution -funroll-loops -ftracer
Use it at your own risk. Some people have reported that the -ftree-loop-distribution optimization gave them wrong results in the past.
Also the -march=native must be used with care if you have NFS. In that case, you MUST set the compiledir to a local path of the computer.
...@@ -169,6 +169,51 @@ class ProfileStats(object): ...@@ -169,6 +169,51 @@ class ProfileStats(object):
global _atexit_print_list global _atexit_print_list
_atexit_print_list.append(self) _atexit_print_list.append(self)
def class_time(self):
    """Return a dict mapping each Op class to the total thunk time spent on it.

    Timing is recorded per Apply node in ``self.apply_time``; this
    aggregates those timings by the class of each node's Op on demand.
    """
    totals = {}
    for node, elapsed in self.apply_time.items():
        op_class = type(node.op)
        totals[op_class] = totals.get(op_class, 0) + elapsed
    return totals
def class_callcount(self):
    """Return a dict mapping each Op class to its total number of thunk calls.

    Call counts are recorded per Apply node in ``self.apply_callcount``;
    this aggregates them by the class of each node's Op on demand.
    """
    counts = {}
    for node, ncalls in self.apply_callcount.items():
        op_class = type(node.op)
        counts[op_class] = counts.get(op_class, 0) + ncalls
    return counts
def class_nodes(self):
    """Return a dict mapping each Op class to its number of Apply nodes.

    Counts are derived on demand from the nodes recorded in
    ``self.apply_callcount``.
    """
    # The original iterated .items() but never used the call counts;
    # only the nodes (the dict keys) are needed to count applies.
    rval = {}
    for node in self.apply_callcount:
        typ = type(node.op)
        rval[typ] = rval.get(typ, 0) + 1
    return rval
def class_impl(self):
    """Return a dict mapping each Op class to its implementation kind.

    Values are ``'C '`` when every seen node of that class used the C
    implementation, ``'Py'`` when every node used the Python one, and a
    4-character combination (e.g. ``'C Py'``) when different nodes of
    the same class used different implementations.

    Note: the original docstring ("total number of nodes") was a
    copy-paste error from class_nodes; this method reports
    implementation types, per ``self.apply_cimpl``.
    """
    rval = {}
    for node in self.apply_callcount:
        typ = type(node.op)
        if self.apply_cimpl[node]:
            impl = 'C '
        else:
            impl = 'Py'
        rval.setdefault(typ, impl)
        # If this class already recorded only the *other* kind (entry is
        # still 2 chars), append to mark the class as mixed.
        if rval[typ] != impl and len(rval[typ]) == 2:
            rval[typ] += impl
    return rval
def op_time(self): def op_time(self):
"""dict op -> total time on thunks""" """dict op -> total time on thunks"""
# timing is stored by node, we compute timing by Op on demand # timing is stored by node, we compute timing by Op on demand
...@@ -233,6 +278,95 @@ class ProfileStats(object): ...@@ -233,6 +278,95 @@ class ProfileStats(object):
' <time per call> %s <nb_call> <nb apply> <Op name>' % ( ' <time per call> %s <nb_call> <nb apply> <Op name>' % (
flops_msg)) flops_msg))
def summary_class(self, file=sys.stderr, N=None):
    """Print a profiling summary aggregated by Op class.

    :param file: file object the report is written to
        (default: sys.stderr).
    :param N: maximum number of classes to print, most expensive
        first; None prints all of them.
    """
    if self.apply_time:
        local_time = sum(self.apply_time.values())
    else:
        local_time = 0
    if local_time == 0:
        # Without any recorded time, percentages below would divide by 0.
        print >> file, ('ProfileMode.summary_class: total time 0'
                        ' (did you forget to enable counters?)')
        return
    class_time = self.class_time()
    class_call = self.class_callcount()
    class_apply = self.class_nodes()
    class_impl = self.class_impl()
    if N is None:
        # Bug fix: was len(self.class_time), i.e. len() of the bound
        # method object itself, which raises TypeError.  Use the dict
        # computed just above.
        N = len(class_time)
    # One tuple per class; the leading field (percent of total runtime)
    # is the sort key.
    otimes = [(t * 100 / local_time,
               t,
               clas,
               class_impl.get(clas, ' '),
               class_call.get(clas, 0),
               class_apply.get(clas, 0))
              for clas, t in class_time.items()]
    otimes.sort()
    otimes.reverse()
    tot = 0
    print >> file, 'Class'
    print >> file, '---'
    hs = []  # column headers
    es = []  # matching printf-style format specifiers
    hs += ['<% time>']
    es += [' %4.1f%% ']
    hs += ['<sum %>']
    es += [' %5.1f%% ']
    hs += ['<apply time>']
    es += [' %7.3fs ']
    hs += ['<time per call>']
    es += [' %8.2es ']
    hs += ['<type>']
    es += [' %2s ']
    hs += ['<#call>']
    es += [' %4d ']
    hs += ['<#apply>']
    es += [' %4d ']
    # Width consumed by the fixed columns; the class name gets the rest
    # of the line and is truncated to fit.
    upto_length = numpy.sum([len(x) for x in hs]) + len(hs)
    maxlen = self.line_width - upto_length
    hs += ['<Class name>']
    es += ['%s']
    header_str = ' '.join(hs)
    format_str = ' '.join(es)
    print >> file, header_str
    for f, t, a, impl, nb_call, nb_apply in otimes[:N]:
        if nb_call == 0:
            # A class that was never called must not have accrued time.
            assert t == 0
            continue
        tot += t
        ftot = tot * 100 / local_time
        print >> file, format_str % (f, ftot, t, t / nb_call,
                                     impl, nb_call,
                                     nb_apply, str(a)[:maxlen])
    print >> file, ' ... (remaining %i Classes account for %6.2f%%(%.2fs) of the runtime)'\
        % (max(0, len(otimes) - N),
           sum(f for f, t, a, ci, nb_call, nb_op in otimes[N:]),
           sum(t for f, t, a, ci, nb_call, nb_op in otimes[N:]))
    print >> file, ''
def summary_ops(self, file=sys.stderr, N=None): def summary_ops(self, file=sys.stderr, N=None):
if self.apply_time: if self.apply_time:
local_time = sum(self.apply_time.values()) local_time = sum(self.apply_time.values())
...@@ -426,6 +560,7 @@ class ProfileStats(object): ...@@ -426,6 +560,7 @@ class ProfileStats(object):
self.summary_function(file) self.summary_function(file)
local_time = sum(self.apply_time.values()) local_time = sum(self.apply_time.values())
if local_time > 0: if local_time > 0:
self.summary_class(file, n_ops_to_print)
self.summary_ops(file, n_ops_to_print) self.summary_ops(file, n_ops_to_print)
self.summary_nodes(file, n_applies_to_print) self.summary_nodes(file, n_applies_to_print)
elif self.fct_callcount > 0: elif self.fct_callcount > 0:
......
...@@ -39,6 +39,17 @@ AddConfigVar('cmodule.warn_no_version', ...@@ -39,6 +39,17 @@ AddConfigVar('cmodule.warn_no_version',
"with C code that can't be cached because there is no " "with C code that can't be cached because there is no "
"c_code_cache_version() function associated to at least one of " "c_code_cache_version() function associated to at least one of "
"those Ops.", "those Ops.",
BoolParam(False),
in_c_key=False)
AddConfigVar('cmodule.remove_gxx_opt',
"If True, will remove -O* parameter passed to g++."
"This is useful to debug in gdb module compiled by Theano."
"The parameter -g is passed by default to g++",
BoolParam(False))
AddConfigVar('cmodule.compilation_warning',
"If True, will print compilation warning.",
BoolParam(False)) BoolParam(False))
...@@ -1481,8 +1492,6 @@ class GCC_compiler(object): ...@@ -1481,8 +1492,6 @@ class GCC_compiler(object):
# We also add "-m64", in case the installed gcc is 32-bit # We also add "-m64", in case the installed gcc is 32-bit
preargs.append('-m64') preargs.append('-m64')
no_opt = False
include_dirs = include_dirs + std_include_dirs() include_dirs = include_dirs + std_include_dirs()
libs = std_libs() + libs libs = std_libs() + libs
lib_dirs = std_lib_dirs() + lib_dirs lib_dirs = std_lib_dirs() + lib_dirs
...@@ -1529,7 +1538,8 @@ class GCC_compiler(object): ...@@ -1529,7 +1538,8 @@ class GCC_compiler(object):
_logger.debug('Generating shared lib %s', lib_filename) _logger.debug('Generating shared lib %s', lib_filename)
cmd = ['g++', get_gcc_shared_library_arg(), '-g'] cmd = ['g++', get_gcc_shared_library_arg(), '-g']
if no_opt:
if config.cmodule.remove_gxx_opt:
cmd.extend(p for p in preargs if not p.startswith('-O')) cmd.extend(p for p in preargs if not p.startswith('-O'))
else: else:
cmd.extend(preargs) cmd.extend(preargs)
...@@ -1572,6 +1582,9 @@ class GCC_compiler(object): ...@@ -1572,6 +1582,9 @@ class GCC_compiler(object):
# difficult to read. # difficult to read.
raise Exception('Compilation failed (return status=%s): %s' % raise Exception('Compilation failed (return status=%s): %s' %
(status, compile_stderr.replace('\n', '. '))) (status, compile_stderr.replace('\n', '. ')))
elif config.cmodule.compilation_warning and compile_stderr:
# Print errors just below the command line.
print compile_stderr
#touch the __init__ file #touch the __init__ file
file(os.path.join(location, "__init__.py"), 'w').close() file(os.path.join(location, "__init__.py"), 'w').close()
......
...@@ -179,17 +179,6 @@ def grad_sources_inputs(sources, graph_inputs, warn_type=True): ...@@ -179,17 +179,6 @@ def grad_sources_inputs(sources, graph_inputs, warn_type=True):
_logger.warning('%s.grad returned a different type (%s) ' _logger.warning('%s.grad returned a different type (%s) '
'for input %i of type (%s)', 'for input %i of type (%s)',
node.op, g_r_type, ii, r_type) node.op, g_r_type, ii, r_type)
#The following name assignment code is broken
#for example, when you call
#f = T.dot(x,T.dot(A,x))
#f.name = 'f'
#T.grad( f, x)
#the result has no name, and is composed of
# A x + A^T x
# with both terms named "(df/dx)"
#if g_r is not None and len(sources) == 1 and sources[0][0].name \
# and r.name:
# g_r.name = "(d%s/d%s)" % (sources[0][0].name, r.name)
if g_r is not None: if g_r is not None:
assert r is not None assert r is not None
if r in gmap: if r in gmap:
......
...@@ -1924,7 +1924,7 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp): ...@@ -1924,7 +1924,7 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
assert self.perform_using_take == True, ( assert self.perform_using_take == True, (
"GpuAdvancedSubtensor1 used the fast version") "GpuAdvancedSubtensor1 used the fast version")
if idx.dtype != numpy.int64: if idx.dtype != numpy.int64:
if idx.dtype in [numpy.int8, numpyt.int16, numpy.int32, if idx.dtype in [numpy.int8, numpy.int16, numpy.int32,
numpy.int64, numpy.uint8, numpy.uint16, numpy.int64, numpy.uint8, numpy.uint16,
numpy.uint32]: numpy.uint32]:
idx = idx.astype(numpy.int64) idx = idx.astype(numpy.int64)
......
...@@ -325,6 +325,8 @@ class NVCC_compiler(object): ...@@ -325,6 +325,8 @@ class NVCC_compiler(object):
print >> sys.stderr, i + 1, l print >> sys.stderr, i + 1, l
raise Exception('nvcc return status', p.returncode, raise Exception('nvcc return status', p.returncode,
'for cmd', ' '.join(cmd)) 'for cmd', ' '.join(cmd))
elif config.cmodule.compilation_warning and nvcc_stdout:
print nvcc_stdout
#touch the __init__ file #touch the __init__ file
file(os.path.join(location, "__init__.py"), 'w').close() file(os.path.join(location, "__init__.py"), 'w').close()
......
...@@ -866,6 +866,8 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor): ...@@ -866,6 +866,8 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor):
(rand(1025, 67000), [5, 10], True), (rand(1025, 67000), [5, 10], True),
(rand(3, 10, 68000), [1, 2], True), (rand(3, 10, 68000), [1, 2], True),
(rand(3, 69000, 11), [1, 2], True), (rand(3, 69000, 11), [1, 2], True),
# use too much memory to enable by default.
#(rand(2*10e7), [-1, 199999999], True),
(rand(4, 5), [2, 3], True), (rand(4, 5), [2, 3], True),
(rand(4, 2, 3), [0, 3], True), (rand(4, 2, 3), [0, 3], True),
(rand(4, 2, 3), [3, 3, 1, 1, 2, (rand(4, 2, 3), [3, 3, 1, 1, 2,
...@@ -879,7 +881,7 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor): ...@@ -879,7 +881,7 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor):
-1, -2, -3, -4], False), -1, -2, -3, -4], False),
]: ]:
data = numpy.asarray(data, dtype=self.dtype) data = numpy.asarray(data, dtype=self.dtype)
n = self.shared(data) n = self.shared(data, borrow=True)
# Test with c_contiguous input # Test with c_contiguous input
t = self.adv_sub1()(n, idx) t = self.adv_sub1()(n, idx)
......
...@@ -5,7 +5,7 @@ import theano ...@@ -5,7 +5,7 @@ import theano
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
from theano.tensor.extra_ops import * from theano.tensor.extra_ops import *
from theano import tensor as T from theano import tensor as T
from theano import tensor, function, scalar from theano import config, tensor, function, scalar
class TestBinCountOp(utt.InferShapeTester): class TestBinCountOp(utt.InferShapeTester):
...@@ -16,9 +16,9 @@ class TestBinCountOp(utt.InferShapeTester): ...@@ -16,9 +16,9 @@ class TestBinCountOp(utt.InferShapeTester):
def test_bincountOp(self): def test_bincountOp(self):
x = T.lvector('x') x = T.lvector('x')
w = T.dvector('w') w = T.vector('w')
a = np.random.random_integers(50, size=(25)) a = np.random.random_integers(50, size=(25))
weights = np.random.random((25,)) weights = np.random.random((25,)).astype(config.floatX)
f1 = theano.function([x], bincount(x)) f1 = theano.function([x], bincount(x))
f2 = theano.function([x, w], bincount(x, weights=w)) f2 = theano.function([x, w], bincount(x, weights=w))
...@@ -38,7 +38,7 @@ class TestBinCountOp(utt.InferShapeTester): ...@@ -38,7 +38,7 @@ class TestBinCountOp(utt.InferShapeTester):
[np.random.random_integers(50, size=(25,))], [np.random.random_integers(50, size=(25,))],
self.op_class) self.op_class)
weights = np.random.random((25,)) weights = np.random.random((25,)).astype(config.floatX)
self._compile_and_check([x], self._compile_and_check([x],
[bincount(x, weights=weights)], [bincount(x, weights=weights)],
[np.random.random_integers(50, size=(25,))], [np.random.random_integers(50, size=(25,))],
...@@ -64,8 +64,8 @@ class TestDiffOp(utt.InferShapeTester): ...@@ -64,8 +64,8 @@ class TestDiffOp(utt.InferShapeTester):
self.op = DiffOp() self.op = DiffOp()
def test_diffOp(self): def test_diffOp(self):
x = T.dmatrix('x') x = T.matrix('x')
a = np.random.random((30, 50)) a = np.random.random((30, 50)).astype(config.floatX)
f = theano.function([x], diff(x)) f = theano.function([x], diff(x))
assert np.allclose(np.diff(a), f(a)) assert np.allclose(np.diff(a), f(a))
...@@ -76,8 +76,8 @@ class TestDiffOp(utt.InferShapeTester): ...@@ -76,8 +76,8 @@ class TestDiffOp(utt.InferShapeTester):
assert np.allclose(np.diff(a, n=k, axis=axis), g(a)) assert np.allclose(np.diff(a, n=k, axis=axis), g(a))
def test_infer_shape(self): def test_infer_shape(self):
x = T.dmatrix('x') x = T.matrix('x')
a = np.random.random((30, 50)) a = np.random.random((30, 50)).astype(config.floatX)
self._compile_and_check([x], self._compile_and_check([x],
[self.op(x)], [self.op(x)],
...@@ -93,14 +93,14 @@ class TestDiffOp(utt.InferShapeTester): ...@@ -93,14 +93,14 @@ class TestDiffOp(utt.InferShapeTester):
def test_grad(self): def test_grad(self):
x = T.vector('x') x = T.vector('x')
a = np.random.random(50) a = np.random.random(50).astype(config.floatX)
gf = theano.function([x], T.grad(T.sum(diff(x)), x)) gf = theano.function([x], T.grad(T.sum(diff(x)), x))
utt.verify_grad(self.op, [a]) utt.verify_grad(self.op, [a])
for k in range(TestDiffOp.nb): for k in range(TestDiffOp.nb):
dg = theano.function([x], T.grad(T.sum(diff(x, n=k)), x)) dg = theano.function([x], T.grad(T.sum(diff(x, n=k)), x))
utt.verify_grad(DiffOp(n=k), [a]) utt.verify_grad(DiffOp(n=k), [a], eps=7e-3)
class TestSqueezeOp(utt.InferShapeTester): class TestSqueezeOp(utt.InferShapeTester):
...@@ -110,27 +110,27 @@ class TestSqueezeOp(utt.InferShapeTester): ...@@ -110,27 +110,27 @@ class TestSqueezeOp(utt.InferShapeTester):
self.op = SqueezeOp(out_nd=1) self.op = SqueezeOp(out_nd=1)
def test_squeezeOp(self): def test_squeezeOp(self):
x = T.dmatrix('x') x = T.matrix('x')
a = np.random.random((1, 50)) a = np.random.random((1, 50)).astype(config.floatX)
f = theano.function([x], squeeze(x, out_nd=1)) f = theano.function([x], squeeze(x, out_nd=1))
assert np.allclose(np.squeeze(a), f(a)) assert np.allclose(np.squeeze(a), f(a))
x = T.dtensor4('x') x = T.tensor4('x')
f = theano.function([x], squeeze(x, out_nd=2)) f = theano.function([x], squeeze(x, out_nd=2))
a = np.random.random((1, 1, 2, 3)) a = np.random.random((1, 1, 2, 3)).astype(config.floatX)
assert np.allclose(np.squeeze(a), f(a)) assert np.allclose(np.squeeze(a), f(a))
a = np.random.random((1, 2, 2, 1)) a = np.random.random((1, 2, 2, 1)).astype(config.floatX)
assert np.allclose(np.squeeze(a), f(a)) assert np.allclose(np.squeeze(a), f(a))
a = np.random.random((4, 1, 2, 1)) a = np.random.random((4, 1, 2, 1)).astype(config.floatX)
assert np.allclose(np.squeeze(a), f(a)) assert np.allclose(np.squeeze(a), f(a))
def test_grad(self): def test_grad(self):
x = T.dtensor4('x') x = T.tensor4('x')
a = np.random.random((1, 1, 3, 4)) a = np.random.random((1, 1, 3, 4)).astype(config.floatX)
gf = theano.function([x], T.grad(T.sum(squeeze(x, out_nd=1)), x)) gf = theano.function([x], T.grad(T.sum(squeeze(x, out_nd=1)), x))
utt.verify_grad(SqueezeOp(out_nd=2), [a]) utt.verify_grad(SqueezeOp(out_nd=2), [a])
...@@ -147,8 +147,8 @@ class TestRepeatOp(utt.InferShapeTester): ...@@ -147,8 +147,8 @@ class TestRepeatOp(utt.InferShapeTester):
def test_repeatOp(self): def test_repeatOp(self):
for ndim in range(3): for ndim in range(3):
x = T.TensorType(theano.config.floatX, [False] * ndim)() x = T.TensorType(config.floatX, [False] * ndim)()
a = np.random.random((10, ) * ndim) a = np.random.random((10, ) * ndim).astype(config.floatX)
for axis in self._possible_axis(ndim): for axis in self._possible_axis(ndim):
r_var = T.lscalar() r_var = T.lscalar()
...@@ -167,8 +167,8 @@ class TestRepeatOp(utt.InferShapeTester): ...@@ -167,8 +167,8 @@ class TestRepeatOp(utt.InferShapeTester):
def test_infer_shape(self): def test_infer_shape(self):
for ndim in range(4): for ndim in range(4):
x = T.TensorType(theano.config.floatX, [False] * ndim)() x = T.TensorType(config.floatX, [False] * ndim)()
a = np.random.random((10, ) * ndim) a = np.random.random((10, ) * ndim).astype(config.floatX)
for axis in self._possible_axis(ndim): for axis in self._possible_axis(ndim):
r_var = T.lscalar() r_var = T.lscalar()
...@@ -191,7 +191,7 @@ class TestRepeatOp(utt.InferShapeTester): ...@@ -191,7 +191,7 @@ class TestRepeatOp(utt.InferShapeTester):
def test_grad(self): def test_grad(self):
for ndim in range(3): for ndim in range(3):
a = np.random.random((10, ) * ndim) a = np.random.random((10, ) * ndim).astype(config.floatX)
for axis in self._possible_axis(ndim): for axis in self._possible_axis(ndim):
utt.verify_grad(lambda x: RepeatOp(axis=axis)(x, 3), [a]) utt.verify_grad(lambda x: RepeatOp(axis=axis)(x, 3), [a])
...@@ -240,23 +240,23 @@ class TestFillDiagonal(utt.InferShapeTester): ...@@ -240,23 +240,23 @@ class TestFillDiagonal(utt.InferShapeTester):
self.op = fill_diagonal self.op = fill_diagonal
def test_perform(self): def test_perform(self):
x = tensor.dmatrix() x = tensor.matrix()
y = tensor.dscalar() y = tensor.scalar()
f = function([x, y], fill_diagonal(x, y)) f = function([x, y], fill_diagonal(x, y))
for shp in [(8, 8), (5, 8), (8, 5)]: for shp in [(8, 8), (5, 8), (8, 5)]:
a = numpy.random.rand(*shp) a = numpy.random.rand(*shp).astype(config.floatX)
val = numpy.random.rand() val = numpy.cast[config.floatX](numpy.random.rand())
out = f(a, val) out = f(a, val)
# We can't use numpy.fill_diagonal as it is bugged. # We can't use numpy.fill_diagonal as it is bugged.
assert numpy.allclose(numpy.diag(out), val) assert numpy.allclose(numpy.diag(out), val)
assert (out == val).sum() == min(a.shape) assert (out == val).sum() == min(a.shape)
# test for 3d tensor # test for 3d tensor
a = numpy.random.rand(3, 3, 3) a = numpy.random.rand(3, 3, 3).astype(config.floatX)
x = tensor.dtensor3() x = tensor.tensor3()
y = tensor.dscalar() y = tensor.scalar()
f = function([x, y], fill_diagonal(x, y)) f = function([x, y], fill_diagonal(x, y))
val = numpy.random.rand() + 10 val = numpy.cast[config.floatX](numpy.random.rand() + 10)
out = f(a, val) out = f(a, val)
# We can't use numpy.fill_diagonal as it is bugged. # We can't use numpy.fill_diagonal as it is bugged.
assert out[0, 0, 0] == val assert out[0, 0, 0] == val
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论