提交 8c58dfb8 authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #4463 from nouiz/scan_reintroduced_benchmark

Scan reintroduced benchmark
......@@ -239,6 +239,14 @@ import theano and print the config variable, as in:
``False``, then we will gc the inner of scan after all
iterations. This is the default.
.. attribute:: config.scan.debug
Bool value, either ``True`` or ``False``
Default: ``False``
If True, we will print extra scan debug information.
.. attribute:: openmp
Bool value: either True or False
......@@ -995,3 +1003,17 @@ import theano and print the config variable, as in:
Bool value, default: False
If set to True, will preload the C module cache at import time
.. attribute:: config.traceback.limit
Int value, default: 8
The number of user stack levels to keep for variables.
.. attribute:: config.traceback.compile_limit
Int value, default: 0
The number of user stack levels to keep for variables during Theano
compilation. If higher than 0, Theano's internal stack trace is kept
as well.
......@@ -1492,7 +1492,7 @@ class FunctionMaker(object):
# optimize the fgraph
theano.config.compute_test_value = \
theano.config.compute_test_value_opt
theano.config.traceback.limit = 0
theano.config.traceback.limit = theano.config.traceback.compile_limit
start_optimizer = time.time()
# now optimize the graph
......@@ -1683,7 +1683,7 @@ class FunctionMaker(object):
start_import_time = theano.gof.cmodule.import_time
limit_orig = theano.config.traceback.limit
try:
theano.config.traceback.limit = 0
theano.config.traceback.limit = theano.config.traceback.compile_limit
_fn, _i, _o = self.linker.make_thunk(
input_storage=input_storage_lists, storage_map=storage_map)
finally:
......
......@@ -573,6 +573,17 @@ AddConfigVar(
IntParam(8),
in_c_key=False)
# Register the ``traceback.compile_limit`` option: an integer parameter whose
# default is 0. Elsewhere in this change, FunctionMaker copies this value into
# ``theano.config.traceback.limit`` while optimizing the graph and while
# building the thunk, and add_tag_trace clears its ``skips`` list when the
# value is greater than 0.
# NOTE(review): in_c_key=False presumably keeps this option out of the C
# module cache key -- confirm against AddConfigVar.
AddConfigVar(
'traceback.compile_limit',
"The number of stack to trace to keep during compilation. -1 mean all."
" If greater then 0, will also make us save Theano internal stack trace.",
IntParam(0),
in_c_key=False)
# Register the boolean ``experimental.mrg`` option (default False); per its
# help text it selects another random number generator that works on the GPU.
AddConfigVar('experimental.mrg',
"Another random number generator that work on the gpu",
BoolParam(False))
AddConfigVar('experimental.unpickle_gpu_on_cpu',
"Allow unpickling of pickled CudaNdarrays as numpy.ndarrays."
"This is useful, if you want to open a CudaNdarray without "
......@@ -1417,6 +1428,11 @@ AddConfigVar('scan.allow_output_prealloc',
BoolParam(True),
in_c_key=False)
# Register the boolean ``scan.debug`` option (default False). When enabled,
# ReplaceValidate counts the Scan nodes in the graph before and after a
# replacement and prints "Extra scan introduced" / "Scan removed" when the
# count changes.
# NOTE(review): in_c_key=False presumably keeps this option out of the C
# module cache key -- confirm against AddConfigVar.
AddConfigVar('scan.debug',
"If True, enable extra verbose output related to scan",
BoolParam(False),
in_c_key=False)
AddConfigVar('pycuda.init',
"""If True, always initialize PyCUDA when Theano want to
initilize the GPU. Currently, we must always initialize
......
......@@ -472,7 +472,7 @@ class FunctionGraph(utils.object2):
self.execute_callbacks('on_change_input', node, i,
r, new_r, reason=reason)
if prune:
self.__remove_clients__(r, [], True)
self.__remove_clients__(r, [], True, reason=reason)
# replace #
def replace(self, r, new_r, reason=None, verbose=None):
......
......@@ -2553,7 +2553,7 @@ def pre_greedy_local_optimizer(list_optimizations, out):
for opt in list_opt:
ret = opt.transform(node)
if ret is not False and ret is not None:
assert len(ret) == len(node.outputs)
assert len(ret) == len(node.outputs), opt
for k, v in zip(node.outputs, ret):
optimized_vars[k] = v
results = ret
......
......@@ -304,6 +304,9 @@ class ReplaceValidate(History, Validator):
chk = fgraph.checkpoint()
if verbose is None:
verbose = config.optimizer_verbose
if config.scan.debug:
scans = [n for n in fgraph.apply_nodes if isinstance(n.op, theano.scan_module.scan_op.Scan)]
for r, new_r in replacements:
try:
fgraph.replace(r, new_r, reason=reason, verbose=False)
......@@ -337,6 +340,14 @@ class ReplaceValidate(History, Validator):
if verbose:
print("validate failed on node %s.\n Reason: %s, %s" % (r, reason, e))
raise
if config.scan.debug:
scans2 = [n for n in fgraph.apply_nodes if isinstance(n.op, theano.scan_module.scan_op.Scan)]
nb = len(scans)
nb2 = len(scans2)
if nb2 > nb:
print("Extra scan introduced", nb, nb2, getattr(reason, 'name', reason), r, new_r)
elif nb2 < nb:
print("Scan removed", nb, nb2, getattr(reason, 'name', reason), r, new_r)
if verbose:
print(reason, r, new_r)
# The return is needed by replace_all_validate_remove
......
......@@ -102,6 +102,9 @@ def add_tag_trace(thing, user_line=None):
"theano/sparse/", "theano\\sparse\\",
"theano/typed_list/", "theano\\typed_list\\"]
if config.traceback.compile_limit > 0:
skips = []
tr = simple_extract_stack(limit=user_line, skips=skips)
# Different Python versions use different semantics for
# limit. Python 2.7 includes the call to extract_stack. The -1 get
......
......@@ -42,10 +42,11 @@ register_transfer(transfer)
def init_dev(dev, name=None):
v = pygpu.gpuarray.api_version()
if v[0] != -9998:
expected = -9998
if v[0] != expected:
raise RuntimeError("Wrong major API version for gpuarray:", v[0],
"Make sure Theano and libgpuarray/pygpu "
"are in sync.")
"are in sync. Expected", expected)
if v[1] < 0:
raise RuntimeError("Wrong minor API version for gpuarray:", v[1],
"Please update libgpuarray/pygpu.")
......
......@@ -159,6 +159,36 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
chosen_algo = CONV_ALGO;
}
// Debug aid: disabled by the constant-false guard. Change the 0 to 1 to
// print which forward convolution algorithm was selected.
if (0) {
  // Start from a valid string so printf never reads an indeterminate
  // pointer when chosen_algo matches none of the cases below (for example
  // when the version-guarded case is compiled out). The pointer is const
  // because it only ever refers to string literals.
  const char *a = "unknown";
  switch (chosen_algo) {
  case CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM:
    a = "implicit gemm (0)";
    break;
  case CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM:
    a = "precomp gemm (1)";
    break;
  case CUDNN_CONVOLUTION_FWD_ALGO_GEMM:
    a = "gemm (2)";
    break;
  case CUDNN_CONVOLUTION_FWD_ALGO_DIRECT:
    a = "direct (3)";
    break;
  case CUDNN_CONVOLUTION_FWD_ALGO_FFT:
    a = "fft (4)";
    break;
  case CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING:
    a = "fft tiling (5)";
    break;
#if CUDNN_VERSION > 5000
  case CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD:
    a = "winograd (6)";
    break;
#endif
  default:
    // Keep the "unknown" placeholder for any value not listed above.
    break;
  }
  printf("GpuDNNConv: algo %s\n", a);
}
// The FFT implementation (only in V3 and onward) does not support strides,
// 1x1 filters or inputs with a spatial dimension larger than 1024.
// The tiled-FFT implementation (only in V4 onward) does not support
......
......@@ -158,6 +158,30 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
chosen_algo = CONV_ALGO;
}
// Debug aid: disabled by the constant-false guard. Change the 0 to 1 to
// print which backward-data convolution algorithm was selected.
if (0) {
  // Start from a valid string so printf never reads an indeterminate
  // pointer when chosen_algo matches none of the cases below (for example
  // when the version-guarded case is compiled out). The pointer is const
  // because it only ever refers to string literals.
  const char *a = "unknown";
  switch (chosen_algo) {
  case CUDNN_CONVOLUTION_BWD_DATA_ALGO_0:
    a = "implicit gemm (0)";
    break;
  case CUDNN_CONVOLUTION_BWD_DATA_ALGO_1:
    a = "precomp gemm (1)";
    break;
  case CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT:
    a = "fft (2)";
    break;
  case CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING:
    a = "fft tiling (3)";
    break;
#if CUDNN_VERSION > 5000
  case CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD:
    a = "winograd (4)";
    break;
#endif
  default:
    // Keep the "unknown" placeholder for any value not listed above.
    break;
  }
  printf("GpuDNNConvGI: algo %s\n", a);
}
// The FFT implementation (only in V3 and onward) does not support strides,
// 1x1 filters or inputs with a spatial dimension larger than 1024.
// The tiled-FFT implementation (only in V4 onward) does not support
......
......@@ -158,6 +158,25 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
chosen_algo = CONV_ALGO;
}
// Debug aid: disabled by the constant-false guard. Change the 0 to 1 to
// print which backward-filter convolution algorithm was selected.
if (0) {
  // Start from a valid string so printf never reads an indeterminate
  // pointer when chosen_algo matches none of the cases below. The pointer
  // is const because it only ever refers to string literals.
  const char *a = "unknown";
  switch (chosen_algo) {
  case CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0:
    a = "algo 0 (0)";
    break;
  case CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1:
    a = "algo 1 (1)";
    break;
  case CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT:
    a = "fft (2)";
    break;
  case CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3:
    a = "algo 3 (3)";
    break;
  default:
    // Keep the "unknown" placeholder for any value not listed above.
    break;
  }
  printf("GpuDNNConvGW: algo %s\n", a);
}
// The FFT implementation (only in v3 and onward) does not support strides,
// 1x1 filters or inputs with a spatial dimension larger than 1024.
// If the chosen implementation is FFT, validate that it can be used
......
......@@ -4464,14 +4464,15 @@ class Reshape(Op):
return [requ]
else:
new_dims = [node.inputs[1][i] for i in xrange(self.ndim)]
# since new_dims has one negative value (-1), the
# since new_dims can have negative value (-1), the
# multiplication of all values should be negated
# to give a positive value.
# To avoid optimization complexity, we avoid checking
# for the case when there are two or more '-1' values.
if self.ndim:
rest_size = (mul(*ishapes[0]) // -mul(*new_dims))
return [tuple([switch(eq(new_dims[i], -1),
theano.tensor.mul(*ishapes[0]) //
(-theano.tensor.mul(*new_dims)),
rest_size,
new_dims[i])
for i in xrange(self.ndim)])]
......
......@@ -1512,8 +1512,8 @@ def local_elemwise_alloc_op(ElemwiseOP, AllocOP, DimShuffleOP):
# when i.owner.inputs[0].type == i.owner.outputs[0].type we
# will remove that alloc later
assert i.type.ndim == cmp_op.ndim
get_shape = node.fgraph.shape_feature.get_shape
if theano.config.experimental.local_alloc_elemwise_assert:
get_shape = node.fgraph.shape_feature.get_shape
cond = []
for idx in xrange(i.type.ndim):
if (not i.type.broadcastable[idx] and
......@@ -1731,7 +1731,7 @@ compile.optdb.register('local_alloc_empty_to_zeros',
@register_specialize
@register_canonicalize
@gof.local_optimizer([T.shape])
@gof.local_optimizer([T.Shape])
def local_shape_to_shape_i(node):
if node.op == T.shape:
# This optimization needs ShapeOpt and fgraph.shape_feature
......@@ -4759,6 +4759,10 @@ def local_useless_elemwise_comparison(node):
Elemwise[LT](add([anything that is shapes]), 0) -> Elemwise[zeros](X)
Elemwise[GE](add([anything that is shapes]), 0) -> Elemwise[ones](X)
# Shapes are never negative
# Needed by Reshape.infer_shape
Elemwise[EQ](Subtensor(Shape(x)), -N) -> Elemwise[zeros](X)
"""
if not isinstance(node.op, T.Elemwise):
return
......@@ -4834,6 +4838,41 @@ def local_useless_elemwise_comparison(node):
T.extract_constant(node.inputs[1], only_process_constants=True) == 0:
return [T.ones_like(node.inputs[0], dtype=node.outputs[0].dtype)]
# Elemwise[EQ](Subtensor(Shape(x)), -N)
# Elemwise[EQ](somegraph that only depend of shape, -N)
# TODO: handle the case where the -N is on either side
"""
|Elemwise{eq,no_inplace} [id B] ''
| |Subtensor{int64} [id C] ''
| | |Join [id D] ''
| | | |TensorConstant{0} [id E]
| | | |Subtensor{int64:int64:} [id F] ''
| | | | |Shape [id G] ''
"""
def investigate(node):
    """Return True if the values produced by `node` are shapes, so >= 0.

    An apply node qualifies when its op is ``Shape`` or ``Shape_i``, or when
    it is a ``Subtensor``, ``Join`` or ``MakeVector`` whose relevant inputs
    are all produced by qualifying nodes. Any other node yields False.

    Parameters
    ----------
    node
        The apply node to inspect; its ``op`` and ``inputs`` are read.

    Returns
    -------
    bool
        True only when every value is known to come from shape
        information; False otherwise.
    """
    if isinstance(node.op, (T.Shape, Shape_i)):
        return True
    elif isinstance(node.op, Subtensor) and node.inputs[0].owner:
        # Only the indexed variable (first input) decides the result.
        return investigate(node.inputs[0].owner)
    elif isinstance(node.op, T.Join):
        # The first input is skipped (presumably the join axis -- confirm);
        # every remaining input must itself come from a shape-only graph.
        return all(v.owner and
                   investigate(v.owner) for v in node.inputs[1:])
    elif isinstance(node.op, MakeVector):
        return all(v.owner and
                   investigate(v.owner) for v in node.inputs)
    # Explicit negative answer instead of implicitly returning None.
    return False
if (isinstance(node.op.scalar_op, scalar.EQ) and
node.inputs[0].owner and
investigate(node.inputs[0].owner)):
try:
cst = get_scalar_constant_value(node.inputs[1],
only_process_constants=True)
if cst < 0:
return [T.zeros_like(node.inputs[0],
dtype=node.outputs[0].dtype)]
except NotScalarConstantError:
pass
return
......
......@@ -3409,7 +3409,7 @@ class Test_local_useless_elemwise_comparison(unittest.TestCase):
sequences=[X],
non_sequences=None)
Z = X_sum + Y
theano.printing.debugprint(Z)
# theano.printing.debugprint(Z)
# here is the output for the debug print:
"""
Elemwise{add,no_inplace} [id A] ''
......@@ -3436,7 +3436,7 @@ class Test_local_useless_elemwise_comparison(unittest.TestCase):
mode = theano.compile.get_default_mode().excluding('fusion')
f = theano.function([X, Y], Z, mode=mode)
theano.printing.debugprint(f, print_type=True)
# theano.printing.debugprint(f, print_type=True)
# here is the output for the debug print:
"""
Elemwise{Add}[(0, 0)] [id A] <TensorType(float64, vector)> '' 7
......@@ -3465,14 +3465,19 @@ class Test_local_useless_elemwise_comparison(unittest.TestCase):
> |X[t] [id O] <TensorType(float64, vector)> -> [id E]
"""
def assert_eqs_const(self, f, val):
def assert_eqs_const(self, f, val, op=deep_copy_op):
topo = f.maker.fgraph.toposort()
elem = topo[0]
assert len(topo) == 1, topo
assert elem.op == deep_copy_op, elem.op
assert len(elem.inputs) == 1, elem.inputs
assert isinstance(elem.inputs[0], T.TensorConstant), elem
assert T.extract_constant(elem.inputs[0]) == val, val
assert elem.op == op, elem.op
if op == deep_copy_op:
assert len(elem.inputs) == 1, elem.inputs
assert isinstance(elem.inputs[0], T.TensorConstant), elem
assert T.extract_constant(elem.inputs[0]) == val, val
else:
assert len(elem.inputs) == 2, elem.inputs
assert isinstance(elem.inputs[0], T.TensorConstant), elem
assert T.extract_constant(elem.inputs[0]) == val, val
def assert_identity(self, f):
topo = f.maker.fgraph.toposort()
......@@ -3552,6 +3557,33 @@ class Test_local_useless_elemwise_comparison(unittest.TestCase):
f = theano.function([x, y], T.ge(x.shape[0]+y.shape[0], 0), mode=mode)
self.assert_eqs_const(f, 1)
def test_equality_shapes(self):
    """Check ``eq`` when one side is built only from shape information.

    Comparing such a value with 0 must still be evaluated at run time,
    while comparing it with -1 must be optimized to a constant 0.
    """
    if theano.config.mode == "FAST_COMPILE":
        raise SkipTest("Skip opt test as the opt is disabled")

    vec = T.vector('x', dtype=config.floatX)

    # Scalar case: two ways of asking for the first dimension.
    scalar_shapes = (vec.shape[0], Shape_i(0)(vec))
    for dim0 in scalar_shapes:
        is_zero = theano.function([vec], T.eq(dim0, 0))
        assert is_zero([3, 3]) == 0
        assert is_zero([]) == 1

        is_minus_one = theano.function([vec], T.eq(dim0, -1))
        self.assert_eqs_const(is_minus_one, 0)
        assert is_minus_one([3, 3]) == 0

    # Vector case: a join of shape slices.
    # todo test reshape, dimshuffle
    joined = join(0, vec.shape[0:], vec.shape[0:1])

    is_zero = theano.function([vec], T.eq(joined, 0))
    assert (is_zero([3, 3]) == 0).all()
    assert (is_zero([]) == 1).all()

    is_minus_one = theano.function([vec], T.eq(joined, -1))
    self.assert_eqs_const(is_minus_one, 0, op=T.alloc)
    assert (is_minus_one([3, 3]) == 0).all()
def test_and(self):
mode = theano.compile.get_default_mode().including('canonicalize')
......
......@@ -908,8 +908,9 @@ class TensorConstant(_tensor_py_operators, Constant):
return TensorConstantSignature((self.type, self.data))
def equals(self, other):
# Override Contant.equals to allow to compare with numpy.ndarray
if isinstance(other, numpy.ndarray):
# Override Constant.equals to allow comparison with
# numpy.ndarray and Python numeric types (int, float).
if isinstance(other, (numpy.ndarray, int, float)):
# Make a TensorConstant to be able to compare
other = theano.tensor.basic.constant(other)
return (isinstance(other, TensorConstant) and
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论