Commit f0bd940e authored by Pascal Lamblin

Merge pull request #3477 from nouiz/crash_gpu

Crash gpu and opt speed up
......@@ -212,11 +212,11 @@ optimization you wrote. For example, consider the following:
Nothing happened here. The reason is: ``add(y, z) != add(y,
z)``. That is the case for efficiency reasons. To fix this problem we
first need to merge the parts of the graph that represent the same
computation, using the ``merge_optimizer`` defined in
computation, using the ``MergeOptimizer`` defined in
``theano.gof.opt``.
>>> from theano.gof.opt import merge_optimizer
>>> merge_optimizer.optimize(e) # doctest: +ELLIPSIS
>>> from theano.gof.opt import MergeOptimizer
>>> MergeOptimizer().optimize(e) # doctest: +ELLIPSIS
(0, ..., None, None, {}, 1, 0)
>>> e
[true_div(mul(*1 -> add(y, z), x), *1)]
......
......@@ -198,8 +198,17 @@ optdb.register('merge1', gof.MergeOptimizer(),
0, 'fast_run', 'fast_compile', 'merge')
# rearranges elemwise expressions
optdb.register('canonicalize', gof.EquilibriumDB(),
optdb.register('canonicalize', gof.EquilibriumDB(ignore_newtrees=False),
1, 'fast_run', 'fast_compile')
# Register the merge optimizer in the canonicalize EquilibriumDB as a
# cleanup opt. Without this, since the equilibrium has
# ignore_newtrees=False, we won't merge all nodes if the merge optimizer
# is set as a global optimizer with final_opt=True.
# We need a new instance of MergeOptimizer so that its name is not
# changed by other usages of it.
optdb['canonicalize'].register("merge", gof.opt.MergeOptimizer(), 'fast_run',
"fast_compile", cleanup=True)
optdb.register('merge1.2', gof.MergeOptimizer(),
1.2, 'fast_run', 'fast_compile', 'merge')
......
......@@ -547,6 +547,7 @@ class CLinker(link.Linker):
if no_recycling is None:
no_recycling = []
if self.fgraph is not None and self.fgraph is not fgraph:
# A linker can be tied to only one FunctionGraph.
return type(self)(self.schedule).accept(fgraph, no_recycling)
self.fgraph = fgraph
self.fetch_variables()
......@@ -1750,14 +1751,13 @@ class OpWiseCLinker(link.LocalLinker):
if no_recycling is None:
no_recycling = []
if self.fgraph is not None and self.fgraph is not fgraph:
# A linker can be tied to only one FunctionGraph.
return type(self)(
fallback_on_perform=self.fallback_on_perform,
allow_gc=self.allow_gc,
nice_errors=self.nice_errors,
schedule=self.schedule,
).accept(fgraph, no_recycling)
# raise Exception("Cannot accept from a Linker that is
# already tied to another FunctionGraph.")
self.fgraph = fgraph
self.no_recycling = no_recycling
return self
......
Diff collapsed.
......@@ -268,28 +268,35 @@ class EquilibriumDB(DB):
super(EquilibriumDB, self).__init__()
self.ignore_newtrees = ignore_newtrees
self.__final__ = {}
self.__cleanup__ = {}
def register(self, name, obj, *tags, **kwtags):
if 'final_opt' in kwtags:
final_opt = kwtags['final_opt']
kwtags.pop('final_opt', None)
else:
final_opt = False
final_opt = kwtags.pop('final_opt', False)
cleanup = kwtags.pop('cleanup', False)
# An opt should not be both a final opt and a cleanup opt
assert not (final_opt and cleanup)
super(EquilibriumDB, self).register(name, obj, *tags, **kwtags)
self.__final__[name] = final_opt
self.__cleanup__[name] = cleanup
def query(self, *tags, **kwtags):
_opts = super(EquilibriumDB, self).query(*tags, **kwtags)
final_opts = [o for o in _opts if self.__final__.get(o.name, False)]
opts = [o for o in _opts if o not in final_opts]
cleanup_opts = [o for o in _opts if self.__cleanup__.get(o.name,
False)]
opts = [o for o in _opts
if o not in final_opts and o not in cleanup_opts]
if len(final_opts) == 0:
final_opts = None
if len(cleanup_opts) == 0:
cleanup_opts = None
return opt.EquilibriumOptimizer(
opts,
max_use_ratio=config.optdb.max_use_ratio,
ignore_newtrees=self.ignore_newtrees,
failure_callback=opt.NavigatorOptimizer.warn_inplace,
final_optimizers=final_opts)
final_optimizers=final_opts,
cleanup_optimizers=cleanup_opts)
class SequenceDB(DB):
......
......@@ -3622,7 +3622,7 @@ class GpuAllocEmpty(GpuOp):
const_shp = tensor.get_scalar_constant_value(s)
except tensor.NotScalarConstantError:
const_shp = None
bcast.append(numpy.all(1 == const_shp))
bcast.append(1 == const_shp)
otype = CudaNdarrayType(dtype='float32', broadcastable=bcast)
output = otype()
return sh, output
......
......@@ -48,7 +48,7 @@ cudnnSetTensorNdDescriptor(
int nbDims,
const int dimA[],
const int strideA[]) {
if (ndDims != 4) return CUDNN_STATUS_NOT_SUPPORTED;
if (nbDims != 4) return CUDNN_STATUS_NOT_SUPPORTED;
return cudnnSetTensor4dDescriptorEx(
tensorDesc, dataType,
dimA[0], dimA[1], dimA[2], dimA[3],
......@@ -204,7 +204,7 @@ cudnnSetPoolingNdDescriptor(
int nbDims,
const int windowDimA[],
const int paddingA[],
const in strideA[]) {
const int strideA[]) {
if (nbDims != 2) return CUDNN_STATUS_NOT_SUPPORTED;
if (paddingA[0] != 0 || paddingA[1] != 0) return CUDNN_STATUS_NOT_SUPPORTED;
return cudnnSetPoolingDescriptor(poolingDesc, mode,
......@@ -223,7 +223,7 @@ cudnnGetPoolingNdDescriptor(
int strideA[]) {
int win0, win1, str0, str1;
cudnnStatus_t err;
if (ndDimsRequested < 2) return CUDNN_STATUS_NOT_SUPPORTED;
if (nbDimsRequested < 2) return CUDNN_STATUS_NOT_SUPPORTED;
err = cudnnGetPoolingDescriptor(poolingDesc, mode, &win0, &win1,
&str0, &str1);
if (err != CUDNN_STATUS_SUCCESS) return err;
......
......@@ -1760,7 +1760,7 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
Subsampling stride (default: (1, 1)).
mode : {'max', 'average_inc_pad', 'average_exc_pad'}
pad
(pad_h, pad_w) padding information.
(pad_h, pad_w) padding information.
pad_h is the number of zero-valued pixels added to each of the top and
bottom borders.
pad_w is the number of zero-valued pixels added to each of the left
......
......@@ -104,7 +104,7 @@ optdb.register('gpu_after_fusion',
'gpu')
# Register merge_optimizer as a global opt
gpu_optimizer.register('gpu_merge', theano.gof.opt.merge_optimizer,
gpu_optimizer.register('gpu_merge', theano.gof.opt.MergeOptimizer(),
'fast_run', 'fast_compile', final_opt=True)
......
......@@ -81,7 +81,7 @@ class CudaNdarrayType(Type):
raise TypeError('%s only supports dtype float32 for now. Tried '
'using dtype %s for variable %s' %
(self.__class__.__name__, dtype, name))
self.broadcastable = tuple(broadcastable)
self.broadcastable = tuple(bool(b) for b in broadcastable)
self.name = name
self.dtype_specs() # error checking is done there
......
......@@ -2673,7 +2673,7 @@ class Alloc(gof.Op):
const_shp = get_scalar_constant_value(s)
except NotScalarConstantError:
const_shp = None
bcast.append(numpy.all(1 == const_shp))
bcast.append(1 == const_shp)
return sh, bcast
def make_node(self, value, *shape):
......@@ -6037,7 +6037,7 @@ class AllocEmpty(gof.Op):
const_shp = get_scalar_constant_value(s)
except NotScalarConstantError:
const_shp = None
bcast.append(numpy.all(1 == const_shp))
bcast.append(1 == const_shp)
otype = TensorType(dtype=self.dtype, broadcastable=bcast)
output = otype()
return sh, output
......
Diff collapsed.
......@@ -256,7 +256,10 @@ class DownsampleFactorMax(Op):
raise TypeError()
# TODO: consider restricting the dtype?
x = tensor.as_tensor_variable(x)
return gof.Apply(self, [x], [x.type()])
# If the input shapes are broadcastable, we can have 0 in the output shape
broad = x.broadcastable[:2] + (False, False)
out = tensor.TensorType(x.dtype, broad)
return gof.Apply(self, [x], [out()])
def perform(self, node, inp, out):
x, = inp
......
......@@ -801,6 +801,16 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
[image_val, maxout_val, gz_val],
MaxPoolGrad,
warn=False)
# checking with broadcastable input
image = tensor.tensor(dtype='float64',
broadcastable=(False, False, True, True))
image_val = rng.rand(4, 6, 1, 1)
self._compile_and_check(
[image],
[DownsampleFactorMax((2, 2),
ignore_border=True,
padding=(0, 0))(image)],
[image_val], DownsampleFactorMax)
def test_opt_max_to_average(self):
im = theano.tensor.tensor4()
......
......@@ -481,7 +481,7 @@ class test_canonize(unittest.TestCase):
mode = compile.mode.get_default_mode()
opt = gof.Query(["canonicalize"])
opt = opt.including('ShapeOpt')
opt = opt.including('ShapeOpt', 'local_fill_to_alloc')
opt = opt.excluding(
'local_elemwise_fusion')
mode = mode.__class__(linker=mode.linker, optimizer=opt)
......@@ -4021,7 +4021,8 @@ class T_Rebroadcast(unittest.TestCase):
class T_useless_elemwise(unittest.TestCase):
def setUp(self):
self.mode = theano.compile.get_default_mode().including('canonicalize')
self.mode = theano.compile.get_default_mode().including(
'canonicalize', 'local_fill_to_alloc')
def test_eq(self):
x = T.dmatrix()
......@@ -4545,7 +4546,7 @@ class T_local_erfc(unittest.TestCase):
# test that we work without the mul
f = theano.function([x], T.exp(T.neg(T.sqr(x))) / T.erfc(x), mode=mode)
assert len(f.maker.fgraph.apply_nodes) == 23, len(f.maker.fgraph.apply_nodes)
assert len(f.maker.fgraph.apply_nodes) == 22, len(f.maker.fgraph.apply_nodes)
assert f.maker.fgraph.outputs[0].dtype == theano.config.floatX
assert all(numpy.isfinite(f(val)))
......@@ -4558,7 +4559,7 @@ class T_local_erfc(unittest.TestCase):
# test that we work without the sqr and neg
f = theano.function([x], T.exp(T.mul(-1, x, x)) / T.erfc(x), mode=mode)
assert len(f.maker.fgraph.apply_nodes) == 22, len(f.maker.fgraph.apply_nodes)
assert len(f.maker.fgraph.apply_nodes) == 21, len(f.maker.fgraph.apply_nodes)
assert f.maker.fgraph.outputs[0].dtype == theano.config.floatX
assert all(numpy.isfinite(f(val)))
......
Markdown format
0%
You are adding 0 people to this discussion. Please proceed carefully.
Please finish editing this comment first!
Register or sign in to comment