提交 8c58dfb8 authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #4463 from nouiz/scan_reintroduced_benchmark

Scan reintroduced benchmark
...@@ -239,6 +239,14 @@ import theano and print the config variable, as in: ...@@ -239,6 +239,14 @@ import theano and print the config variable, as in:
``False``, then we will gc the inner of scan after all ``False``, then we will gc the inner of scan after all
iterations. This is the default. iterations. This is the default.
.. attribute:: config.scan.debug
Bool value, either ``True`` or ``False``
Default: ``False``
If True, we will print extra scan debug information.
.. attribute:: openmp .. attribute:: openmp
Bool value: either True or False Bool value: either True or False
...@@ -995,3 +1003,17 @@ import theano and print the config variable, as in: ...@@ -995,3 +1003,17 @@ import theano and print the config variable, as in:
Bool value, default: False Bool value, default: False
If set to True, will preload the C module cache at import time If set to True, will preload the C module cache at import time
.. attribute:: config.traceback.limit
Int value, default: 8
The number of user stack levels to keep for variables.
.. attribute:: config.traceback.compile_limit
Int value, default: 0
The number of user stack levels to keep for variables during Theano
compilation. If higher than 0, Theano's internal stack trace is kept
as well.
...@@ -1492,7 +1492,7 @@ class FunctionMaker(object): ...@@ -1492,7 +1492,7 @@ class FunctionMaker(object):
# optimize the fgraph # optimize the fgraph
theano.config.compute_test_value = \ theano.config.compute_test_value = \
theano.config.compute_test_value_opt theano.config.compute_test_value_opt
theano.config.traceback.limit = 0 theano.config.traceback.limit = theano.config.traceback.compile_limit
start_optimizer = time.time() start_optimizer = time.time()
# now optimize the graph # now optimize the graph
...@@ -1683,7 +1683,7 @@ class FunctionMaker(object): ...@@ -1683,7 +1683,7 @@ class FunctionMaker(object):
start_import_time = theano.gof.cmodule.import_time start_import_time = theano.gof.cmodule.import_time
limit_orig = theano.config.traceback.limit limit_orig = theano.config.traceback.limit
try: try:
theano.config.traceback.limit = 0 theano.config.traceback.limit = theano.config.traceback.compile_limit
_fn, _i, _o = self.linker.make_thunk( _fn, _i, _o = self.linker.make_thunk(
input_storage=input_storage_lists, storage_map=storage_map) input_storage=input_storage_lists, storage_map=storage_map)
finally: finally:
......
...@@ -573,6 +573,17 @@ AddConfigVar( ...@@ -573,6 +573,17 @@ AddConfigVar(
IntParam(8), IntParam(8),
in_c_key=False) in_c_key=False)
# Register the ``traceback.compile_limit`` option: an integer parameter with
# a default of 0, registered with in_c_key=False. FunctionMaker assigns this
# value to ``traceback.limit`` while it optimizes and links a graph.
AddConfigVar(
'traceback.compile_limit',
"The number of stack to trace to keep during compilation. -1 mean all."
" If greater then 0, will also make us save Theano internal stack trace.",
IntParam(0),
in_c_key=False)
# Register the boolean ``experimental.mrg`` option (default False). Its help
# text describes it as another random number generator usable on the GPU.
AddConfigVar('experimental.mrg',
"Another random number generator that work on the gpu",
BoolParam(False))
AddConfigVar('experimental.unpickle_gpu_on_cpu', AddConfigVar('experimental.unpickle_gpu_on_cpu',
"Allow unpickling of pickled CudaNdarrays as numpy.ndarrays." "Allow unpickling of pickled CudaNdarrays as numpy.ndarrays."
"This is useful, if you want to open a CudaNdarray without " "This is useful, if you want to open a CudaNdarray without "
...@@ -1417,6 +1428,11 @@ AddConfigVar('scan.allow_output_prealloc', ...@@ -1417,6 +1428,11 @@ AddConfigVar('scan.allow_output_prealloc',
BoolParam(True), BoolParam(True),
in_c_key=False) in_c_key=False)
# Register the boolean ``scan.debug`` option (default False), with
# in_c_key=False. When it is set, ReplaceValidate counts the Scan nodes
# before and after a batch of replacements and prints a message if the
# count changed.
AddConfigVar('scan.debug',
"If True, enable extra verbose output related to scan",
BoolParam(False),
in_c_key=False)
AddConfigVar('pycuda.init', AddConfigVar('pycuda.init',
"""If True, always initialize PyCUDA when Theano want to """If True, always initialize PyCUDA when Theano want to
initilize the GPU. Currently, we must always initialize initilize the GPU. Currently, we must always initialize
......
...@@ -472,7 +472,7 @@ class FunctionGraph(utils.object2): ...@@ -472,7 +472,7 @@ class FunctionGraph(utils.object2):
self.execute_callbacks('on_change_input', node, i, self.execute_callbacks('on_change_input', node, i,
r, new_r, reason=reason) r, new_r, reason=reason)
if prune: if prune:
self.__remove_clients__(r, [], True) self.__remove_clients__(r, [], True, reason=reason)
# replace # # replace #
def replace(self, r, new_r, reason=None, verbose=None): def replace(self, r, new_r, reason=None, verbose=None):
......
...@@ -2553,7 +2553,7 @@ def pre_greedy_local_optimizer(list_optimizations, out): ...@@ -2553,7 +2553,7 @@ def pre_greedy_local_optimizer(list_optimizations, out):
for opt in list_opt: for opt in list_opt:
ret = opt.transform(node) ret = opt.transform(node)
if ret is not False and ret is not None: if ret is not False and ret is not None:
assert len(ret) == len(node.outputs) assert len(ret) == len(node.outputs), opt
for k, v in zip(node.outputs, ret): for k, v in zip(node.outputs, ret):
optimized_vars[k] = v optimized_vars[k] = v
results = ret results = ret
......
...@@ -304,6 +304,9 @@ class ReplaceValidate(History, Validator): ...@@ -304,6 +304,9 @@ class ReplaceValidate(History, Validator):
chk = fgraph.checkpoint() chk = fgraph.checkpoint()
if verbose is None: if verbose is None:
verbose = config.optimizer_verbose verbose = config.optimizer_verbose
if config.scan.debug:
scans = [n for n in fgraph.apply_nodes if isinstance(n.op, theano.scan_module.scan_op.Scan)]
for r, new_r in replacements: for r, new_r in replacements:
try: try:
fgraph.replace(r, new_r, reason=reason, verbose=False) fgraph.replace(r, new_r, reason=reason, verbose=False)
...@@ -337,6 +340,14 @@ class ReplaceValidate(History, Validator): ...@@ -337,6 +340,14 @@ class ReplaceValidate(History, Validator):
if verbose: if verbose:
print("validate failed on node %s.\n Reason: %s, %s" % (r, reason, e)) print("validate failed on node %s.\n Reason: %s, %s" % (r, reason, e))
raise raise
if config.scan.debug:
scans2 = [n for n in fgraph.apply_nodes if isinstance(n.op, theano.scan_module.scan_op.Scan)]
nb = len(scans)
nb2 = len(scans2)
if nb2 > nb:
print("Extra scan introduced", nb, nb2, getattr(reason, 'name', reason), r, new_r)
elif nb2 < nb:
print("Scan removed", nb, nb2, getattr(reason, 'name', reason), r, new_r)
if verbose: if verbose:
print(reason, r, new_r) print(reason, r, new_r)
# The return is needed by replace_all_validate_remove # The return is needed by replace_all_validate_remove
......
...@@ -102,6 +102,9 @@ def add_tag_trace(thing, user_line=None): ...@@ -102,6 +102,9 @@ def add_tag_trace(thing, user_line=None):
"theano/sparse/", "theano\\sparse\\", "theano/sparse/", "theano\\sparse\\",
"theano/typed_list/", "theano\\typed_list\\"] "theano/typed_list/", "theano\\typed_list\\"]
if config.traceback.compile_limit > 0:
skips = []
tr = simple_extract_stack(limit=user_line, skips=skips) tr = simple_extract_stack(limit=user_line, skips=skips)
# Different python version use different sementic for # Different python version use different sementic for
# limit. python 2.7 include the call to extrack_stack. The -1 get # limit. python 2.7 include the call to extrack_stack. The -1 get
......
...@@ -42,10 +42,11 @@ register_transfer(transfer) ...@@ -42,10 +42,11 @@ register_transfer(transfer)
def init_dev(dev, name=None): def init_dev(dev, name=None):
v = pygpu.gpuarray.api_version() v = pygpu.gpuarray.api_version()
if v[0] != -9998: expected = -9998
if v[0] != expected:
raise RuntimeError("Wrong major API version for gpuarray:", v[0], raise RuntimeError("Wrong major API version for gpuarray:", v[0],
"Make sure Theano and libgpuarray/pygpu " "Make sure Theano and libgpuarray/pygpu "
"are in sync.") "are in sync. Expected", expected)
if v[1] < 0: if v[1] < 0:
raise RuntimeError("Wrong minor API version for gpuarray:", v[1], raise RuntimeError("Wrong minor API version for gpuarray:", v[1],
"Please update libgpuarray/pygpu.") "Please update libgpuarray/pygpu.")
......
...@@ -159,6 +159,36 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns, ...@@ -159,6 +159,36 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
chosen_algo = CONV_ALGO; chosen_algo = CONV_ALGO;
} }
// Debug aid: prints the name of the selected forward convolution algorithm.
// The constant-false condition keeps it disabled; change it to 1 to enable.
if (0){
  // Initialized so that an enum value not listed below (for example one
  // added by a different cuDNN release) prints a placeholder instead of
  // passing an indeterminate pointer to printf. const because it only ever
  // points at string literals.
  const char * a = "unknown";
  switch(chosen_algo){
  case CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM:
    a = "implicit gemm (0)";
    break;
  case CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM:
    a = "precomp gemm (1)";
    break;
  case CUDNN_CONVOLUTION_FWD_ALGO_GEMM:
    a = "gemm (2)";
    break;
  case CUDNN_CONVOLUTION_FWD_ALGO_DIRECT:
    a = "direct (3)";
    break;
  case CUDNN_CONVOLUTION_FWD_ALGO_FFT:
    a = "fft (4)";
    break;
  case CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING:
    a = "fft tiling (5)";
    break;
#if CUDNN_VERSION > 5000
  case CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD:
    a = "winograd (6)";
    break;
#endif
  }
  printf("GpuDNNConv: algo %s\n", a);
}
// The FFT implementation (only in V3 and onward) does not support strides, // The FFT implementation (only in V3 and onward) does not support strides,
// 1x1 filters or inputs with a spatial dimension larger than 1024. // 1x1 filters or inputs with a spatial dimension larger than 1024.
// The tiled-FFT implementation (only in V4 onward) does not support // The tiled-FFT implementation (only in V4 onward) does not support
......
...@@ -158,6 +158,30 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output, ...@@ -158,6 +158,30 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
chosen_algo = CONV_ALGO; chosen_algo = CONV_ALGO;
} }
// Debug aid: prints the name of the selected backward-data convolution
// algorithm. The constant-false condition keeps it disabled; change it to 1
// to enable.
if (0){
  // Initialized so that an enum value not listed below prints a placeholder
  // instead of passing an indeterminate pointer to printf. const because it
  // only ever points at string literals.
  const char * a = "unknown";
  switch(chosen_algo){
  case CUDNN_CONVOLUTION_BWD_DATA_ALGO_0:
    a = "implicit gemm (0)";
    break;
  case CUDNN_CONVOLUTION_BWD_DATA_ALGO_1:
    a = "precomp gemm (1)";
    break;
  case CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT:
    a = "fft (2)";
    break;
  case CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING:
    a = "fft tiling (3)";
    break;
#if CUDNN_VERSION > 5000
  case CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD:
    a = "winograd (4)";
    break;
#endif
  }
  printf("GpuDNNConvGI: algo %s\n", a);
}
// The FFT implementation (only in V3 and onward) does not support strides, // The FFT implementation (only in V3 and onward) does not support strides,
// 1x1 filters or inputs with a spatial dimension larger than 1024. // 1x1 filters or inputs with a spatial dimension larger than 1024.
// The tiled-FFT implementation (only in V4 onward) does not support // The tiled-FFT implementation (only in V4 onward) does not support
......
...@@ -158,6 +158,25 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output, ...@@ -158,6 +158,25 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
chosen_algo = CONV_ALGO; chosen_algo = CONV_ALGO;
} }
// Debug aid: prints the name of the selected backward-filter convolution
// algorithm. The constant-false condition keeps it disabled; change it to 1
// to enable.
if (0){
  // Initialized so that an enum value not listed below prints a placeholder
  // instead of passing an indeterminate pointer to printf. const because it
  // only ever points at string literals.
  const char * a = "unknown";
  switch(chosen_algo){
  case CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0:
    a = "algo 0 (0)";
    break;
  case CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1:
    a = "algo 1 (1)";
    break;
  case CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT:
    a = "fft (2)";
    break;
  case CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3:
    a = "algo 3 (3)";
    break;
  }
  printf("GpuDNNConvGW: algo %s\n", a);
}
// The FFT implementation (only in v3 and onward) does not support strides, // The FFT implementation (only in v3 and onward) does not support strides,
// 1x1 filters or inputs with a spatial dimension larger than 1024. // 1x1 filters or inputs with a spatial dimension larger than 1024.
// If the chosen implementation is FFT, validate that it can be used // If the chosen implementation is FFT, validate that it can be used
......
...@@ -4464,14 +4464,15 @@ class Reshape(Op): ...@@ -4464,14 +4464,15 @@ class Reshape(Op):
return [requ] return [requ]
else: else:
new_dims = [node.inputs[1][i] for i in xrange(self.ndim)] new_dims = [node.inputs[1][i] for i in xrange(self.ndim)]
# since new_dims has one negative value (-1), the # since new_dims can have negative value (-1), the
# multiplication of all values should be negated # multiplication of all values should be negated
# to give a positive value. # to give a positive value.
# To avoid optimization complexity, we avoid checking # To avoid optimization complexity, we avoid checking
# for the case when there are two or more '-1' values. # for the case when there are two or more '-1' values.
if self.ndim:
rest_size = (mul(*ishapes[0]) // -mul(*new_dims))
return [tuple([switch(eq(new_dims[i], -1), return [tuple([switch(eq(new_dims[i], -1),
theano.tensor.mul(*ishapes[0]) // rest_size,
(-theano.tensor.mul(*new_dims)),
new_dims[i]) new_dims[i])
for i in xrange(self.ndim)])] for i in xrange(self.ndim)])]
......
...@@ -1512,8 +1512,8 @@ def local_elemwise_alloc_op(ElemwiseOP, AllocOP, DimShuffleOP): ...@@ -1512,8 +1512,8 @@ def local_elemwise_alloc_op(ElemwiseOP, AllocOP, DimShuffleOP):
# when i.owner.inputs[0].type == i.owner.outputs[0].type we # when i.owner.inputs[0].type == i.owner.outputs[0].type we
# will remove that alloc later # will remove that alloc later
assert i.type.ndim == cmp_op.ndim assert i.type.ndim == cmp_op.ndim
get_shape = node.fgraph.shape_feature.get_shape
if theano.config.experimental.local_alloc_elemwise_assert: if theano.config.experimental.local_alloc_elemwise_assert:
get_shape = node.fgraph.shape_feature.get_shape
cond = [] cond = []
for idx in xrange(i.type.ndim): for idx in xrange(i.type.ndim):
if (not i.type.broadcastable[idx] and if (not i.type.broadcastable[idx] and
...@@ -1731,7 +1731,7 @@ compile.optdb.register('local_alloc_empty_to_zeros', ...@@ -1731,7 +1731,7 @@ compile.optdb.register('local_alloc_empty_to_zeros',
@register_specialize @register_specialize
@register_canonicalize @register_canonicalize
@gof.local_optimizer([T.shape]) @gof.local_optimizer([T.Shape])
def local_shape_to_shape_i(node): def local_shape_to_shape_i(node):
if node.op == T.shape: if node.op == T.shape:
# This optimization needs ShapeOpt and fgraph.shape_feature # This optimization needs ShapeOpt and fgraph.shape_feature
...@@ -4759,6 +4759,10 @@ def local_useless_elemwise_comparison(node): ...@@ -4759,6 +4759,10 @@ def local_useless_elemwise_comparison(node):
Elemwise[LT](add([anything that is shapes]), 0) -> Elemwise[zeros](X) Elemwise[LT](add([anything that is shapes]), 0) -> Elemwise[zeros](X)
Elemwise[GE](add([anything that is shapes]), 0) -> Elemwise[ones](X) Elemwise[GE](add([anything that is shapes]), 0) -> Elemwise[ones](X)
# Shapes are never negative
# Needed by Reshape.infer_shape
Elemwise[EQ](Subtensor(Shape(x)), -N) -> Elemwise[zeros](X)
""" """
if not isinstance(node.op, T.Elemwise): if not isinstance(node.op, T.Elemwise):
return return
...@@ -4834,6 +4838,41 @@ def local_useless_elemwise_comparison(node): ...@@ -4834,6 +4838,41 @@ def local_useless_elemwise_comparison(node):
T.extract_constant(node.inputs[1], only_process_constants=True) == 0: T.extract_constant(node.inputs[1], only_process_constants=True) == 0:
return [T.ones_like(node.inputs[0], dtype=node.outputs[0].dtype)] return [T.ones_like(node.inputs[0], dtype=node.outputs[0].dtype)]
# Elemwise[EQ](Subtensor(Shape(x)), -N)
# Elemwise[EQ](somegraph that only depend of shape, -N)
# TODO: handle the case where the -N is on either side
"""
|Elemwise{eq,no_inplace} [id B] ''
| |Subtensor{int64} [id C] ''
| | |Join [id D] ''
| | | |TensorConstant{0} [id E]
| | | |Subtensor{int64:int64:} [id F] ''
| | | | |Shape [id G] ''
"""
def investigate(node):
    """Return True if the values computed by `node` are shape entries.

    Shape entries are never negative, which lets the caller fold an
    equality test against a negative constant to zeros.

    The walk goes upward through the graph:

    - a ``Shape`` or ``Shape_i`` node qualifies directly;
    - a ``Subtensor`` qualifies when the node producing its first input
      does;
    - a ``Join`` qualifies when every input after the first has an owner
      that qualifies;
    - a ``MakeVector`` qualifies when every input has an owner that
      qualifies.

    Any other op, or an input without an owner, gives False.
    """
    op = node.op
    if isinstance(op, (T.Shape, Shape_i)):
        return True
    if isinstance(op, Subtensor) and node.inputs[0].owner:
        return investigate(node.inputs[0].owner)
    if isinstance(op, T.Join):
        # inputs[0] is skipped -- presumably the join axis, as in the
        # example graph above where it is TensorConstant{0}.
        return all(v.owner is not None and investigate(v.owner)
                   for v in node.inputs[1:])
    if isinstance(op, MakeVector):
        return all(v.owner is not None and investigate(v.owner)
                   for v in node.inputs)
    # Explicit negative answer instead of falling off the end with None.
    return False
if (isinstance(node.op.scalar_op, scalar.EQ) and
node.inputs[0].owner and
investigate(node.inputs[0].owner)):
try:
cst = get_scalar_constant_value(node.inputs[1],
only_process_constants=True)
if cst < 0:
return [T.zeros_like(node.inputs[0],
dtype=node.outputs[0].dtype)]
except NotScalarConstantError:
pass
return return
......
...@@ -3409,7 +3409,7 @@ class Test_local_useless_elemwise_comparison(unittest.TestCase): ...@@ -3409,7 +3409,7 @@ class Test_local_useless_elemwise_comparison(unittest.TestCase):
sequences=[X], sequences=[X],
non_sequences=None) non_sequences=None)
Z = X_sum + Y Z = X_sum + Y
theano.printing.debugprint(Z) # theano.printing.debugprint(Z)
# here is the output for the debug print: # here is the output for the debug print:
""" """
Elemwise{add,no_inplace} [id A] '' Elemwise{add,no_inplace} [id A] ''
...@@ -3436,7 +3436,7 @@ class Test_local_useless_elemwise_comparison(unittest.TestCase): ...@@ -3436,7 +3436,7 @@ class Test_local_useless_elemwise_comparison(unittest.TestCase):
mode = theano.compile.get_default_mode().excluding('fusion') mode = theano.compile.get_default_mode().excluding('fusion')
f = theano.function([X, Y], Z, mode=mode) f = theano.function([X, Y], Z, mode=mode)
theano.printing.debugprint(f, print_type=True) # theano.printing.debugprint(f, print_type=True)
# here is the output for the debug print: # here is the output for the debug print:
""" """
Elemwise{Add}[(0, 0)] [id A] <TensorType(float64, vector)> '' 7 Elemwise{Add}[(0, 0)] [id A] <TensorType(float64, vector)> '' 7
...@@ -3465,14 +3465,19 @@ class Test_local_useless_elemwise_comparison(unittest.TestCase): ...@@ -3465,14 +3465,19 @@ class Test_local_useless_elemwise_comparison(unittest.TestCase):
> |X[t] [id O] <TensorType(float64, vector)> -> [id E] > |X[t] [id O] <TensorType(float64, vector)> -> [id E]
""" """
def assert_eqs_const(self, f, val): def assert_eqs_const(self, f, val, op=deep_copy_op):
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
elem = topo[0] elem = topo[0]
assert len(topo) == 1, topo assert len(topo) == 1, topo
assert elem.op == deep_copy_op, elem.op assert elem.op == op, elem.op
assert len(elem.inputs) == 1, elem.inputs if op == deep_copy_op:
assert isinstance(elem.inputs[0], T.TensorConstant), elem assert len(elem.inputs) == 1, elem.inputs
assert T.extract_constant(elem.inputs[0]) == val, val assert isinstance(elem.inputs[0], T.TensorConstant), elem
assert T.extract_constant(elem.inputs[0]) == val, val
else:
assert len(elem.inputs) == 2, elem.inputs
assert isinstance(elem.inputs[0], T.TensorConstant), elem
assert T.extract_constant(elem.inputs[0]) == val, val
def assert_identity(self, f): def assert_identity(self, f):
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
...@@ -3552,6 +3557,33 @@ class Test_local_useless_elemwise_comparison(unittest.TestCase): ...@@ -3552,6 +3557,33 @@ class Test_local_useless_elemwise_comparison(unittest.TestCase):
f = theano.function([x, y], T.ge(x.shape[0]+y.shape[0], 0), mode=mode) f = theano.function([x, y], T.ge(x.shape[0]+y.shape[0], 0), mode=mode)
self.assert_eqs_const(f, 1) self.assert_eqs_const(f, 1)
def test_equality_shapes(self):
# Check equality comparisons in which one side is built only from
# shape-related expressions.
if theano.config.mode == "FAST_COMPILE":
raise SkipTest("Skip opt test as the opt is disabled")
x = T.vector('x', dtype=config.floatX)
# Two ways of obtaining the first shape entry of x.
for g in [x.shape[0],
Shape_i(0)(x)]:
# Comparing against 0 must still be computed at run time: it is true
# only for an empty input.
f = theano.function([x], T.eq(g, 0))
assert f([3, 3]) == 0
assert f([]) == 1
# Comparing against -1: assert_eqs_const checks that the compiled graph
# is a single node fed by the constant 0.
f = theano.function([x], T.eq(g, -1))
self.assert_eqs_const(f, 0)
assert f([3, 3]) == 0
# Same checks on a vector joined from two shape slices.
g = join(0,
x.shape[0:], # TODO: test reshape, dimshuffle
x.shape[0:1])
f = theano.function([x], T.eq(g, 0))
assert (f([3, 3]) == 0).all()
assert (f([]) == 1).all()
# For the vector case the single remaining node is expected to be an
# alloc whose first input is the constant 0.
f = theano.function([x], T.eq(g, -1))
self.assert_eqs_const(f, 0, op=T.alloc)
assert (f([3, 3]) == 0).all()
def test_and(self): def test_and(self):
mode = theano.compile.get_default_mode().including('canonicalize') mode = theano.compile.get_default_mode().including('canonicalize')
......
...@@ -908,8 +908,9 @@ class TensorConstant(_tensor_py_operators, Constant): ...@@ -908,8 +908,9 @@ class TensorConstant(_tensor_py_operators, Constant):
return TensorConstantSignature((self.type, self.data)) return TensorConstantSignature((self.type, self.data))
def equals(self, other): def equals(self, other):
# Override Contant.equals to allow to compare with numpy.ndarray # Override Contant.equals to allow to compare with
if isinstance(other, numpy.ndarray): # numpy.ndarray, and python type.
if isinstance(other, (numpy.ndarray, int, float)):
# Make a TensorConstant to be able to compare # Make a TensorConstant to be able to compare
other = theano.tensor.basic.constant(other) other = theano.tensor.basic.constant(other)
return (isinstance(other, TensorConstant) and return (isinstance(other, TensorConstant) and
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论