提交 f0bd940e authored 作者: Pascal Lamblin's avatar Pascal Lamblin

Merge pull request #3477 from nouiz/crash_gpu

Crash gpu and opt speed up
...@@ -212,11 +212,11 @@ optimization you wrote. For example, consider the following: ...@@ -212,11 +212,11 @@ optimization you wrote. For example, consider the following:
Nothing happened here. The reason is: ``add(y, z) != add(y, Nothing happened here. The reason is: ``add(y, z) != add(y,
z)``. That is the case for efficiency reasons. To fix this problem we z)``. That is the case for efficiency reasons. To fix this problem we
first need to merge the parts of the graph that represent the same first need to merge the parts of the graph that represent the same
computation, using the ``merge_optimizer`` defined in computation, using the ``MergeOptimizer`` defined in
``theano.gof.opt``. ``theano.gof.opt``.
>>> from theano.gof.opt import merge_optimizer >>> from theano.gof.opt import MergeOptimizer
>>> merge_optimizer.optimize(e) # doctest: +ELLIPSIS >>> MergeOptimizer().optimize(e) # doctest: +ELLIPSIS
(0, ..., None, None, {}, 1, 0) (0, ..., None, None, {}, 1, 0)
>>> e >>> e
[true_div(mul(*1 -> add(y, z), x), *1)] [true_div(mul(*1 -> add(y, z), x), *1)]
......
...@@ -198,8 +198,17 @@ optdb.register('merge1', gof.MergeOptimizer(), ...@@ -198,8 +198,17 @@ optdb.register('merge1', gof.MergeOptimizer(),
0, 'fast_run', 'fast_compile', 'merge') 0, 'fast_run', 'fast_compile', 'merge')
# rearranges elemwise expressions # rearranges elemwise expressions
optdb.register('canonicalize', gof.EquilibriumDB(), optdb.register('canonicalize', gof.EquilibriumDB(ignore_newtrees=False),
1, 'fast_run', 'fast_compile') 1, 'fast_run', 'fast_compile')
# Register the merge opt in the canonizer Equilibrium as a cleanup opt.
# Without this, as the equilibrium has ignore_newtrees=False, we
# won't merge all nodes if it is set as a global optimizer with
# final_opt=True.
# We need a new instance of MergeOptimizer so that its name is not
# changed by other usages of it.
optdb['canonicalize'].register("merge", gof.opt.MergeOptimizer(), 'fast_run',
"fast_compile", cleanup=True)
optdb.register('merge1.2', gof.MergeOptimizer(), optdb.register('merge1.2', gof.MergeOptimizer(),
1.2, 'fast_run', 'fast_compile', 'merge') 1.2, 'fast_run', 'fast_compile', 'merge')
......
...@@ -547,6 +547,7 @@ class CLinker(link.Linker): ...@@ -547,6 +547,7 @@ class CLinker(link.Linker):
if no_recycling is None: if no_recycling is None:
no_recycling = [] no_recycling = []
if self.fgraph is not None and self.fgraph is not fgraph: if self.fgraph is not None and self.fgraph is not fgraph:
# A linker can be tied to only one FunctionGraph.
return type(self)(self.schedule).accept(fgraph, no_recycling) return type(self)(self.schedule).accept(fgraph, no_recycling)
self.fgraph = fgraph self.fgraph = fgraph
self.fetch_variables() self.fetch_variables()
...@@ -1750,14 +1751,13 @@ class OpWiseCLinker(link.LocalLinker): ...@@ -1750,14 +1751,13 @@ class OpWiseCLinker(link.LocalLinker):
if no_recycling is None: if no_recycling is None:
no_recycling = [] no_recycling = []
if self.fgraph is not None and self.fgraph is not fgraph: if self.fgraph is not None and self.fgraph is not fgraph:
# A linker can be tied to only one FunctionGraph.
return type(self)( return type(self)(
fallback_on_perform=self.fallback_on_perform, fallback_on_perform=self.fallback_on_perform,
allow_gc=self.allow_gc, allow_gc=self.allow_gc,
nice_errors=self.nice_errors, nice_errors=self.nice_errors,
schedule=self.schedule, schedule=self.schedule,
).accept(fgraph, no_recycling) ).accept(fgraph, no_recycling)
# raise Exception("Cannot accept from a Linker that is
# already tied to another FunctionGraph.")
self.fgraph = fgraph self.fgraph = fgraph
self.no_recycling = no_recycling self.no_recycling = no_recycling
return self return self
......
差异被折叠。
...@@ -268,28 +268,35 @@ class EquilibriumDB(DB): ...@@ -268,28 +268,35 @@ class EquilibriumDB(DB):
super(EquilibriumDB, self).__init__() super(EquilibriumDB, self).__init__()
self.ignore_newtrees = ignore_newtrees self.ignore_newtrees = ignore_newtrees
self.__final__ = {} self.__final__ = {}
self.__cleanup__ = {}
def register(self, name, obj, *tags, **kwtags): def register(self, name, obj, *tags, **kwtags):
if 'final_opt' in kwtags: final_opt = kwtags.pop('final_opt', False)
final_opt = kwtags['final_opt'] cleanup = kwtags.pop('cleanup', False)
kwtags.pop('final_opt', None) # An opt should not be final and clean up
else: assert not (final_opt and cleanup)
final_opt = False
super(EquilibriumDB, self).register(name, obj, *tags, **kwtags) super(EquilibriumDB, self).register(name, obj, *tags, **kwtags)
self.__final__[name] = final_opt self.__final__[name] = final_opt
self.__cleanup__[name] = cleanup
def query(self, *tags, **kwtags): def query(self, *tags, **kwtags):
_opts = super(EquilibriumDB, self).query(*tags, **kwtags) _opts = super(EquilibriumDB, self).query(*tags, **kwtags)
final_opts = [o for o in _opts if self.__final__.get(o.name, False)] final_opts = [o for o in _opts if self.__final__.get(o.name, False)]
opts = [o for o in _opts if o not in final_opts] cleanup_opts = [o for o in _opts if self.__cleanup__.get(o.name,
False)]
opts = [o for o in _opts
if o not in final_opts and o not in cleanup_opts]
if len(final_opts) == 0: if len(final_opts) == 0:
final_opts = None final_opts = None
if len(cleanup_opts) == 0:
cleanup_opts = None
return opt.EquilibriumOptimizer( return opt.EquilibriumOptimizer(
opts, opts,
max_use_ratio=config.optdb.max_use_ratio, max_use_ratio=config.optdb.max_use_ratio,
ignore_newtrees=self.ignore_newtrees, ignore_newtrees=self.ignore_newtrees,
failure_callback=opt.NavigatorOptimizer.warn_inplace, failure_callback=opt.NavigatorOptimizer.warn_inplace,
final_optimizers=final_opts) final_optimizers=final_opts,
cleanup_optimizers=cleanup_opts)
class SequenceDB(DB): class SequenceDB(DB):
......
...@@ -3622,7 +3622,7 @@ class GpuAllocEmpty(GpuOp): ...@@ -3622,7 +3622,7 @@ class GpuAllocEmpty(GpuOp):
const_shp = tensor.get_scalar_constant_value(s) const_shp = tensor.get_scalar_constant_value(s)
except tensor.NotScalarConstantError: except tensor.NotScalarConstantError:
const_shp = None const_shp = None
bcast.append(numpy.all(1 == const_shp)) bcast.append(1 == const_shp)
otype = CudaNdarrayType(dtype='float32', broadcastable=bcast) otype = CudaNdarrayType(dtype='float32', broadcastable=bcast)
output = otype() output = otype()
return sh, output return sh, output
......
...@@ -48,7 +48,7 @@ cudnnSetTensorNdDescriptor( ...@@ -48,7 +48,7 @@ cudnnSetTensorNdDescriptor(
int nbDims, int nbDims,
const int dimA[], const int dimA[],
const int strideA[]) { const int strideA[]) {
if (ndDims != 4) return CUDNN_STATUS_NOT_SUPPORTED; if (nbDims != 4) return CUDNN_STATUS_NOT_SUPPORTED;
return cudnnSetTensor4dDescriptorEx( return cudnnSetTensor4dDescriptorEx(
tensorDesc, dataType, tensorDesc, dataType,
dimA[0], dimA[1], dimA[2], dimA[3], dimA[0], dimA[1], dimA[2], dimA[3],
...@@ -204,7 +204,7 @@ cudnnSetPoolingNdDescriptor( ...@@ -204,7 +204,7 @@ cudnnSetPoolingNdDescriptor(
int nbDims, int nbDims,
const int windowDimA[], const int windowDimA[],
const int paddingA[], const int paddingA[],
const in strideA[]) { const int strideA[]) {
if (nbDims != 2) return CUDNN_STATUS_NOT_SUPPORTED; if (nbDims != 2) return CUDNN_STATUS_NOT_SUPPORTED;
if (paddingA[0] != 0 || paddingA[1] != 0) return CUDNN_STATUS_NOT_SUPPORTED; if (paddingA[0] != 0 || paddingA[1] != 0) return CUDNN_STATUS_NOT_SUPPORTED;
return cudnnSetPoolingDescriptor(poolingDesc, mode, return cudnnSetPoolingDescriptor(poolingDesc, mode,
...@@ -223,7 +223,7 @@ cudnnGetPoolingNdDescriptor( ...@@ -223,7 +223,7 @@ cudnnGetPoolingNdDescriptor(
int strideA[]) { int strideA[]) {
int win0, win1, str0, str1; int win0, win1, str0, str1;
cudnnStatus_t err; cudnnStatus_t err;
if (ndDimsRequested < 2) return CUDNN_STATUS_NOT_SUPPORTED; if (nbDimsRequested < 2) return CUDNN_STATUS_NOT_SUPPORTED;
err = cudnnGetPoolingDescriptor(poolingDesc, mode, &win0, &win1, err = cudnnGetPoolingDescriptor(poolingDesc, mode, &win0, &win1,
&str0, &str1); &str0, &str1);
if (err != CUDNN_STATUS_SUCCESS) return err; if (err != CUDNN_STATUS_SUCCESS) return err;
......
...@@ -1760,7 +1760,7 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)): ...@@ -1760,7 +1760,7 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
Subsampling stride (default: (1, 1)). Subsampling stride (default: (1, 1)).
mode : {'max', 'average_inc_pad', 'average_exc_pad'} mode : {'max', 'average_inc_pad', 'average_exc_pad'}
pad pad
(pad_h, pad_w) padding information. (pad_h, pad_w) padding information.
pad_h is the number of zero-valued pixels added to each of the top and pad_h is the number of zero-valued pixels added to each of the top and
bottom borders. bottom borders.
pad_w is the number of zero-valued pixels added to each of the left pad_w is the number of zero-valued pixels added to each of the left
......
...@@ -104,7 +104,7 @@ optdb.register('gpu_after_fusion', ...@@ -104,7 +104,7 @@ optdb.register('gpu_after_fusion',
'gpu') 'gpu')
# Register merge_optimizer as a global opt # Register merge_optimizer as a global opt
gpu_optimizer.register('gpu_merge', theano.gof.opt.merge_optimizer, gpu_optimizer.register('gpu_merge', theano.gof.opt.MergeOptimizer(),
'fast_run', 'fast_compile', final_opt=True) 'fast_run', 'fast_compile', final_opt=True)
......
...@@ -81,7 +81,7 @@ class CudaNdarrayType(Type): ...@@ -81,7 +81,7 @@ class CudaNdarrayType(Type):
raise TypeError('%s only supports dtype float32 for now. Tried ' raise TypeError('%s only supports dtype float32 for now. Tried '
'using dtype %s for variable %s' % 'using dtype %s for variable %s' %
(self.__class__.__name__, dtype, name)) (self.__class__.__name__, dtype, name))
self.broadcastable = tuple(broadcastable) self.broadcastable = tuple(bool(b) for b in broadcastable)
self.name = name self.name = name
self.dtype_specs() # error checking is done there self.dtype_specs() # error checking is done there
......
...@@ -2673,7 +2673,7 @@ class Alloc(gof.Op): ...@@ -2673,7 +2673,7 @@ class Alloc(gof.Op):
const_shp = get_scalar_constant_value(s) const_shp = get_scalar_constant_value(s)
except NotScalarConstantError: except NotScalarConstantError:
const_shp = None const_shp = None
bcast.append(numpy.all(1 == const_shp)) bcast.append(1 == const_shp)
return sh, bcast return sh, bcast
def make_node(self, value, *shape): def make_node(self, value, *shape):
...@@ -6037,7 +6037,7 @@ class AllocEmpty(gof.Op): ...@@ -6037,7 +6037,7 @@ class AllocEmpty(gof.Op):
const_shp = get_scalar_constant_value(s) const_shp = get_scalar_constant_value(s)
except NotScalarConstantError: except NotScalarConstantError:
const_shp = None const_shp = None
bcast.append(numpy.all(1 == const_shp)) bcast.append(1 == const_shp)
otype = TensorType(dtype=self.dtype, broadcastable=bcast) otype = TensorType(dtype=self.dtype, broadcastable=bcast)
output = otype() output = otype()
return sh, output return sh, output
......
差异被折叠。
...@@ -256,7 +256,10 @@ class DownsampleFactorMax(Op): ...@@ -256,7 +256,10 @@ class DownsampleFactorMax(Op):
raise TypeError() raise TypeError()
# TODO: consider restricting the dtype? # TODO: consider restricting the dtype?
x = tensor.as_tensor_variable(x) x = tensor.as_tensor_variable(x)
return gof.Apply(self, [x], [x.type()]) # If the input dimensions are broadcastable, we can have 0 in the output shape
broad = x.broadcastable[:2] + (False, False)
out = tensor.TensorType(x.dtype, broad)
return gof.Apply(self, [x], [out()])
def perform(self, node, inp, out): def perform(self, node, inp, out):
x, = inp x, = inp
......
...@@ -801,6 +801,16 @@ class TestDownsampleFactorMax(utt.InferShapeTester): ...@@ -801,6 +801,16 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
[image_val, maxout_val, gz_val], [image_val, maxout_val, gz_val],
MaxPoolGrad, MaxPoolGrad,
warn=False) warn=False)
# checking with broadcastable input
image = tensor.tensor(dtype='float64',
broadcastable=(False, False, True, True))
image_val = rng.rand(4, 6, 1, 1)
self._compile_and_check(
[image],
[DownsampleFactorMax((2, 2),
ignore_border=True,
padding=(0, 0))(image)],
[image_val], DownsampleFactorMax)
def test_opt_max_to_average(self): def test_opt_max_to_average(self):
im = theano.tensor.tensor4() im = theano.tensor.tensor4()
......
...@@ -481,7 +481,7 @@ class test_canonize(unittest.TestCase): ...@@ -481,7 +481,7 @@ class test_canonize(unittest.TestCase):
mode = compile.mode.get_default_mode() mode = compile.mode.get_default_mode()
opt = gof.Query(["canonicalize"]) opt = gof.Query(["canonicalize"])
opt = opt.including('ShapeOpt') opt = opt.including('ShapeOpt', 'local_fill_to_alloc')
opt = opt.excluding( opt = opt.excluding(
'local_elemwise_fusion') 'local_elemwise_fusion')
mode = mode.__class__(linker=mode.linker, optimizer=opt) mode = mode.__class__(linker=mode.linker, optimizer=opt)
...@@ -4021,7 +4021,8 @@ class T_Rebroadcast(unittest.TestCase): ...@@ -4021,7 +4021,8 @@ class T_Rebroadcast(unittest.TestCase):
class T_useless_elemwise(unittest.TestCase): class T_useless_elemwise(unittest.TestCase):
def setUp(self): def setUp(self):
self.mode = theano.compile.get_default_mode().including('canonicalize') self.mode = theano.compile.get_default_mode().including(
'canonicalize', 'local_fill_to_alloc')
def test_eq(self): def test_eq(self):
x = T.dmatrix() x = T.dmatrix()
...@@ -4545,7 +4546,7 @@ class T_local_erfc(unittest.TestCase): ...@@ -4545,7 +4546,7 @@ class T_local_erfc(unittest.TestCase):
# test that we work without the mul # test that we work without the mul
f = theano.function([x], T.exp(T.neg(T.sqr(x))) / T.erfc(x), mode=mode) f = theano.function([x], T.exp(T.neg(T.sqr(x))) / T.erfc(x), mode=mode)
assert len(f.maker.fgraph.apply_nodes) == 23, len(f.maker.fgraph.apply_nodes) assert len(f.maker.fgraph.apply_nodes) == 22, len(f.maker.fgraph.apply_nodes)
assert f.maker.fgraph.outputs[0].dtype == theano.config.floatX assert f.maker.fgraph.outputs[0].dtype == theano.config.floatX
assert all(numpy.isfinite(f(val))) assert all(numpy.isfinite(f(val)))
...@@ -4558,7 +4559,7 @@ class T_local_erfc(unittest.TestCase): ...@@ -4558,7 +4559,7 @@ class T_local_erfc(unittest.TestCase):
# test that we work without the sqr and neg # test that we work without the sqr and neg
f = theano.function([x], T.exp(T.mul(-1, x, x)) / T.erfc(x), mode=mode) f = theano.function([x], T.exp(T.mul(-1, x, x)) / T.erfc(x), mode=mode)
assert len(f.maker.fgraph.apply_nodes) == 22, len(f.maker.fgraph.apply_nodes) assert len(f.maker.fgraph.apply_nodes) == 21, len(f.maker.fgraph.apply_nodes)
assert f.maker.fgraph.outputs[0].dtype == theano.config.floatX assert f.maker.fgraph.outputs[0].dtype == theano.config.floatX
assert all(numpy.isfinite(f(val))) assert all(numpy.isfinite(f(val)))
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论