提交 55c0b9e6 authored 作者: Pascal Lamblin's avatar Pascal Lamblin

Merge pull request #1570 from nouiz/flops

Flops
......@@ -330,6 +330,16 @@ following methods:
shape without computing the output itself, potentially sparing you
a costly recomputation.
.. function:: flops(inputs, outputs)
Optional.
It is only used to have more information printed by the memory
profiler. It makes the profiler print the mega flops and giga flops
per second for each apply node. It takes as inputs two lists: one for
the inputs and one for the outputs. Each list contains one tuple with
the shape of the corresponding input/output.
.. function:: make_thunk(node, storage_map, compute_map, no_recycling)
TODO
......
......@@ -35,6 +35,16 @@ probably do something similar on older computer.
this is not completely safe. ``easy_install`` with NumPy 1.5.1 does not
raise this error.
.. note::
This page describes how to install Theano for Python 2. If you have
installed Python 3 on your system, you may need to change the
command ``pip`` to ``pip-2.7`` to specify installation for Python 2, as
sometimes the ``pip`` command refers to the Python 3 version.
The development version of Theano supports Python 3.3 and we
probably support Python 3.2, but we do not test on it.
Installation steps
~~~~~~~~~~~~~~~~~~
......
......@@ -86,7 +86,10 @@ Op Contract
def R_op(self, inputs, eval_points):
pass
def infer_shape(node, (i0_shapes, ...))
def infer_shape(node, (i0_shapes, ...)):
pass
def flops(self, inputs, outputs):
pass
.. ../extending/op.txt
......@@ -116,6 +119,11 @@ The :func:`infer_shape` method allows to infer the shape of some variable, somew
middle of the computational graph without actually computing the outputs (when possible).
This could be helpful if one only needs the shape of the output instead of the actual outputs.
The :func:`flops` method allows the memory profiler to print the number
of mega flops and giga flops per second. It takes as inputs two lists:
one for the inputs and one for the outputs. Each list contains one tuple
with the shape of the corresponding input/output.
The :func:`grad` method is required if you want to differentiate some cost whose expression
includes your op.
......
......@@ -292,32 +292,6 @@ class ProfileStats(object):
rval[node.op] = 'Py'
return rval
def op_flops(self):
"""dict op -> total number of flops"""
# timing is stored by node, we compute timing by Op on demand
rval = {}
return rval # TODO: continue here
for node, count in self.apply_callcount.items():
rval.setdefault(node.op, 0)
rval[node.op] += 1
return rval
for a, t in self.op_time.items():
if hasattr(a, 'flops'):
op_flops[a] = a.flops * op_call[a] / t / 1e6
flops_msg = ''
if op_flops:
flops_msg = ' <MFlops/s>'
print ('\nHACK WARNING: we print the flops for some OP, but the'
' logic does not always work. You need to know the internal'
' of Theano to make it work correctly.'
' Otherwise don\'t use!')
print ('\nOp-wise summary:'
' <%% of local_time spent on this kind of Op>'
' <cumulative %%> <self seconds> <cumulative seconds>'
' <time per call> %s <nb_call> <nb apply> <Op name>' % (
flops_msg))
def summary_class(self, file=sys.stderr, N=None):
if self.apply_time:
local_time = sum(self.apply_time.values())
......@@ -330,7 +304,6 @@ class ProfileStats(object):
class_time = self.class_time()
class_call = self.class_callcount()
class_apply = self.class_nodes()
# class_flops = self.class_flops()
class_impl = self.class_impl()
if N is None:
N = len(self.class_time)
......@@ -395,12 +368,6 @@ class ProfileStats(object):
# While this carries over less information, it is arranged such
# that it is way more readable than the previous output of the
# profiler
#if op_flops:
# print >>file, ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %s %7.1f %5d %2d %s' % (
# f, ftot, t, tot, t/nb_call, impl, op_flops.get(a,-1), nb_call, nb_apply, a)
#else:
# print >>file, ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %s %5d %2d %s' % (
# f, ftot, t, tot, t/nb_call, impl, nb_call, nb_apply, a)
print >>file, ' ... (remaining %i Classes account for %6.2f%%(%.2fs) of the runtime)'\
% (max(0, len(otimes) - N),
sum(f for f, t, a, ci, nb_call, nb_op in otimes[N:]),
......@@ -419,10 +386,7 @@ class ProfileStats(object):
op_time = self.op_time()
op_call = self.op_callcount()
op_apply = self.op_nodes()
op_flops = self.op_flops()
op_impl = self.op_impl()
if N is None:
N = len(self.op_flops)
otimes = [(t * 100 / local_time,
t,
op,
......@@ -484,12 +448,6 @@ class ProfileStats(object):
# While this carries over less information, it is arranged such
# that it is way more readable than the previous output of the
# profiler
#if op_flops:
# print >>file, ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %s %7.1f %5d %2d %s' % (
# f, ftot, t, tot, t/nb_call, impl, op_flops.get(a,-1), nb_call, nb_apply, a)
#else:
# print >>file, ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %s %5d %2d %s' % (
# f, ftot, t, tot, t/nb_call, impl, nb_call, nb_apply, a)
print >>file, ' ... (remaining %i Ops account for %6.2f%%(%.2fs) of the runtime)'\
% (max(0, len(otimes) - N),
sum(f for f, t, a, ci, nb_call, nb_op in otimes[N:]),
......@@ -532,6 +490,10 @@ class ProfileStats(object):
hs += ['<id>']
es += ['%3d']
es += ['%s', '%s']
if self.variable_shape:
hs += ['<Mflops>', '<Gflops/s>']
upto_length = numpy.sum([len(x) for x in hs]) + len(hs)
maxlen = self.line_width - upto_length
hs += ['<Apply name>']
......@@ -557,8 +519,22 @@ class ProfileStats(object):
ftot = tot * 100 / local_time
if nb_call == 0:
continue
if not self.variable_shape:
flops = ""
flops_s = ""
elif hasattr(a.op, 'flops'):
fl = a.op.flops([self.variable_shape[var]
for var in a.inputs],
[self.variable_shape[var]
for var in a.outputs])
flops = '%8.1f' % (fl/1024./1024)
flops_s = '%10.1f' % (fl/1024./1024/1024/t)
else:
flops = " "
flops_s = " "
print >> file, format_str %(f, ftot, t, t / nb_call, nb_call,
nd_id,
flops, flops_s,
str(a)[:maxlen])
if not config.profile_memory:
continue
......
......@@ -838,11 +838,17 @@ class VM_Linker(link.LocalLinker):
for k in storage_map:
compute_map[k] = [k.owner is None]
thunks = [node.op.make_thunk(node,
storage_map,
compute_map,
no_recycling)
for node in order]
thunks = []
for node in order:
try:
thunks.append(node.op.make_thunk(node,
storage_map,
compute_map,
no_recycling))
except Exception, e:
e.args = ("The following error happened while"
" compiling the node", node, "\n") + e.args
raise
for node, thunk in zip(order, thunks):
thunk.inputs = [storage_map[v] for v in node.inputs]
thunk.outputs = [storage_map[v] for v in node.outputs]
......
......@@ -621,6 +621,25 @@ class GpuConv(GpuOp):
False, False]
return Apply(self, [img, kern], [CudaNdarrayType(broadcastable)()])
def flops(self, inputs, outputs):
    """Estimate the number of floating point operations of one apply.

    Used by the memory profiler to report MFlops and GFlops/s per
    apply node.

    :param inputs: list of shape tuples, one per input variable; here
        ``[images_shape, kerns_shape]``, each a 4-tuple
        ``(batch/nkern, channels, rows, cols)``.
    :param outputs: list of shape tuples, one per output variable.
    :return: total flop count (multiplies and adds) for the convolution.
    """
    images, kerns = inputs
    out, = outputs
    # Both operands must agree on the number of input channels.
    assert images[1] == kerns[1]
    if self.out_mode == "valid":
        # multiplies and adds per output pixel
        flops = kerns[2] * kerns[3] * 2
        # flops per output image
        flops *= out[2] * out[3]
        # number of (image, kernel) patch products computed
        flops *= images[1] * kerns[0] * images[0]
    else:
        # "full" mode: every image pixel meets every kernel element,
        # for every (batch, nkern, channel) combination.
        flops = (images[0] * kerns[0] * images[1] *
                 kerns[2] * kerns[3] *
                 images[2] * images[3] * 2)
    return flops
def make_thunk(self, node, storage_map, compute_map, no_recycling):
node_ = copy.copy(node)
assert node.op is node_.op
......
......@@ -14,8 +14,7 @@ import theano.ifelse
from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB,
Optimizer, toolbox, DestroyHandler,
EquilibriumOptimizer)
Optimizer, toolbox, DestroyHandler)
from theano.gof.python25 import all, any
from theano.sandbox.cuda.basic_ops import (
device_properties, gpu_eye,
......@@ -1199,12 +1198,10 @@ def local_inplace_ger(node):
# Also, need to make the gemm optimisation(step 70) happen before the fusion of
# elemwise(step 71)
optdb.register('InplaceGpuBlasOpt',
EquilibriumOptimizer([local_inplace_gemm,
local_inplace_gemv,
local_inplace_ger,
],
failure_callback=EquilibriumOptimizer.warn_inplace,
max_use_ratio=5),
tensor.opt.in2out(local_inplace_gemm,
local_inplace_gemv,
local_inplace_ger,
name="InplaceGpuBlasOpt"),
70.0, 'fast_run', 'inplace', 'gpu')
......
......@@ -869,5 +869,5 @@ def test_stack_rows_segfault_070312():
out = theano.shared(numpy.random.rand(1, 2, 2, 3).astype('float32'))
op = theano.tensor.nnet.conv.ConvOp(imshp=(80, 96, 96), kshp=(9, 9),
nkern=1, bsize=1)
f = theano.function([], [], updates=[(out, op(img, kern))])
f = theano.function([], [], updates=[(out, op(img, kern))], mode=theano_mode)
f()
......@@ -147,7 +147,7 @@ import theano.scalar
from theano.tensor import basic as T
from theano.tensor.blas_headers import blas_header_text
from theano.tensor.blas_headers import blas_header_version
from theano.tensor.opt import local_dimshuffle_lift
from theano.tensor.opt import local_dimshuffle_lift, in2out
_logger = logging.getLogger('theano.tensor.blas')
......@@ -1777,10 +1777,10 @@ blas_optdb.register('local_gemm_to_gemv',
# Try to make gemm inplace
# Also, need to make the gemm optimisation(step 70) happen before the
# fusion of elemwise(step 71)
blas_opt_inplace = EquilibriumOptimizer(
[local_inplace_gemm, local_inplace_gemv, local_inplace_ger],
failure_callback=EquilibriumOptimizer.warn_inplace,
max_use_ratio=5)
blas_opt_inplace = in2out(local_inplace_gemm,
local_inplace_gemv,
local_inplace_ger,
name="blas_opt_inplace")
optdb.register('InplaceBlasOpt',
blas_opt_inplace,
70.0, 'fast_run', 'inplace')
......
......@@ -537,8 +537,6 @@ class ConvOp(OpenMPOp):
time_unroll_batch_kern)
self._rehash()
if config.op.set_flops:
self.set_flops()
def __eq__(self, other):
if type(self) != type(other):
......@@ -567,43 +565,24 @@ class ConvOp(OpenMPOp):
return "ConvOp{" + ",".join(str((a, getattr(self, a)))
for a in self.__attrnames) + "}"
def set_flops(self):
def flops(self, inputs, outputs):
""" Useful with the hack in profilemode to print the MFlops"""
images, kerns = inputs
out, = outputs
assert images[1] == kerns[1]
flops = 0
if self.out_mode == "valid":
# nb mul and add by output pixed
self.flops = self.kshp[0] * self.kshp[1] * 2
# nb mul and add by output pixel
flops = kerns[2] * kerns[3] * 2
#nb flops by output image
self.flops *= self.outshp[0] * self.outshp[1]
# for all outputs images#n_stack==self.imshp[0]
self.flops *= self.imshp[0] * self.nkern * self.bsize
else: # full mode not implemented
self.flops = 0
for out_row in xrange(self.outshp[0]): # loop over output row
for out_col in xrange(self.outshp[0]): # loop over output col
for row in xrange(self.kshp[0]): # loop over kern row
if (row + out_row - self.kshp[0] + 1 < 0 or
row + out_row - self.kshp[0] + 1 >= self.imshp[1]):
continue
col = 0
max_col = self.kshp[1]
img_col = out_col - self.kshp[1] + 1
max_col = min(max_col, self.imshp[2] - img_col)
if img_col < 0:
col = -img_col
img_col += col
while col < max_col: # loop over kern col
self.flops += 2
col += 1
# for all outputs images#n_stack==self.imshp[0]
self.flops *= self.imshp[0] * self.nkern * self.bsize
assert self.flops == self.bsize * self.nkern * self.imshp[0] * \
self.kshp[0] * self.kshp[1] * \
self.imshp[1] * self.imshp[2] * 2
flops *= out[2] * out[3]
# nb patch multiplied
flops *= images[1] * kerns[0] * images[0]
else:
flops = (images[0] * kerns[0] * images[1] *
kerns[2] * kerns[3] *
images[2] * images[3] * 2)
return flops
def make_node(self, inputs, kerns):
# TODO: find a way to make ConvOp work for N-D (after NIPS09)
......@@ -917,9 +896,6 @@ class ConvOp(OpenMPOp):
version=self.version,
verbose=self.verbose)
if hasattr(self, 'flops'):
dw.set_flops()
dw = dw(img, filters)
if all_shape:
......@@ -966,9 +942,6 @@ class ConvOp(OpenMPOp):
version=-1, # we we change the mode, we don't forward the version.
verbose=self.verbose)
if hasattr(self, 'flops'):
din.set_flops()
din = din(gz, filters)
assert (din.owner.op.outshp is None and self.imshp is None) or \
......
......@@ -52,7 +52,7 @@ def out2in(*local_opts, **kwargs):
name = (kwargs and kwargs.pop('name', None))
if len(local_opts) > 1:
# Don't wrap it uselessly if there is only 1 optimization.
local_opts = opt.LocalOptGroup(*local_opts),
local_opts = opt.LocalOptGroup(*local_opts)
else:
local_opts, = local_opts
if not name:
......@@ -71,7 +71,7 @@ def in2out(*local_opts, **kwargs):
name = (kwargs and kwargs.pop('name', None))
if len(local_opts) > 1:
# Don't wrap it uselessly if there is only 1 optimization.
local_opts = opt.LocalOptGroup(*local_opts),
local_opts = opt.LocalOptGroup(*local_opts)
else:
local_opts, = local_opts
if not name:
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论