Commit f0bd940e authored by Pascal Lamblin

Merge pull request #3477 from nouiz/crash_gpu

Crash gpu and opt speed up
......@@ -212,11 +212,11 @@ optimization you wrote. For example, consider the following:
Nothing happened here. The reason is: ``add(y, z) != add(y,
z)``. That is the case for efficiency reasons. To fix this problem we
first need to merge the parts of the graph that represent the same
computation, using the ``merge_optimizer`` defined in
computation, using the ``MergeOptimizer`` defined in
``theano.gof.opt``.
>>> from theano.gof.opt import merge_optimizer
>>> merge_optimizer.optimize(e) # doctest: +ELLIPSIS
>>> from theano.gof.opt import MergeOptimizer
>>> MergeOptimizer().optimize(e) # doctest: +ELLIPSIS
(0, ..., None, None, {}, 1, 0)
>>> e
[true_div(mul(*1 -> add(y, z), x), *1)]
......
......@@ -198,8 +198,17 @@ optdb.register('merge1', gof.MergeOptimizer(),
0, 'fast_run', 'fast_compile', 'merge')
# rearranges elemwise expressions
optdb.register('canonicalize', gof.EquilibriumDB(),
optdb.register('canonicalize', gof.EquilibriumDB(ignore_newtrees=False),
1, 'fast_run', 'fast_compile')
# Register the merge optimizer in the canonicalize EquilibriumDB as a
# cleanup opt. Without this, since the equilibrium has
# ignore_newtrees=False, we won't merge all nodes if the merge optimizer
# is set as a global optimizer with final_opt=True.
# We need a new instance of MergeOptimizer so that its name is not
# changed by other usages of it.
optdb['canonicalize'].register("merge", gof.opt.MergeOptimizer(), 'fast_run',
"fast_compile", cleanup=True)
optdb.register('merge1.2', gof.MergeOptimizer(),
1.2, 'fast_run', 'fast_compile', 'merge')
......
......@@ -547,6 +547,7 @@ class CLinker(link.Linker):
if no_recycling is None:
no_recycling = []
if self.fgraph is not None and self.fgraph is not fgraph:
# A linker can be tied to only one FunctionGraph.
return type(self)(self.schedule).accept(fgraph, no_recycling)
self.fgraph = fgraph
self.fetch_variables()
......@@ -1750,14 +1751,13 @@ class OpWiseCLinker(link.LocalLinker):
if no_recycling is None:
no_recycling = []
if self.fgraph is not None and self.fgraph is not fgraph:
# A linker can be tied to only one FunctionGraph.
return type(self)(
fallback_on_perform=self.fallback_on_perform,
allow_gc=self.allow_gc,
nice_errors=self.nice_errors,
schedule=self.schedule,
).accept(fgraph, no_recycling)
# raise Exception("Cannot accept from a Linker that is
# already tied to another FunctionGraph.")
self.fgraph = fgraph
self.no_recycling = no_recycling
return self
......
Diff collapsed.
......@@ -268,28 +268,35 @@ class EquilibriumDB(DB):
super(EquilibriumDB, self).__init__()
self.ignore_newtrees = ignore_newtrees
self.__final__ = {}
self.__cleanup__ = {}
def register(self, name, obj, *tags, **kwtags):
if 'final_opt' in kwtags:
final_opt = kwtags['final_opt']
kwtags.pop('final_opt', None)
else:
final_opt = False
final_opt = kwtags.pop('final_opt', False)
cleanup = kwtags.pop('cleanup', False)
# An opt should not be both a final opt and a cleanup opt
assert not (final_opt and cleanup)
super(EquilibriumDB, self).register(name, obj, *tags, **kwtags)
self.__final__[name] = final_opt
self.__cleanup__[name] = cleanup
def query(self, *tags, **kwtags):
_opts = super(EquilibriumDB, self).query(*tags, **kwtags)
final_opts = [o for o in _opts if self.__final__.get(o.name, False)]
opts = [o for o in _opts if o not in final_opts]
cleanup_opts = [o for o in _opts if self.__cleanup__.get(o.name,
False)]
opts = [o for o in _opts
if o not in final_opts and o not in cleanup_opts]
if len(final_opts) == 0:
final_opts = None
if len(cleanup_opts) == 0:
cleanup_opts = None
return opt.EquilibriumOptimizer(
opts,
max_use_ratio=config.optdb.max_use_ratio,
ignore_newtrees=self.ignore_newtrees,
failure_callback=opt.NavigatorOptimizer.warn_inplace,
final_optimizers=final_opts)
final_optimizers=final_opts,
cleanup_optimizers=cleanup_opts)
class SequenceDB(DB):
......
......@@ -3622,7 +3622,7 @@ class GpuAllocEmpty(GpuOp):
const_shp = tensor.get_scalar_constant_value(s)
except tensor.NotScalarConstantError:
const_shp = None
bcast.append(numpy.all(1 == const_shp))
bcast.append(1 == const_shp)
otype = CudaNdarrayType(dtype='float32', broadcastable=bcast)
output = otype()
return sh, output
......
......@@ -48,7 +48,7 @@ cudnnSetTensorNdDescriptor(
int nbDims,
const int dimA[],
const int strideA[]) {
if (ndDims != 4) return CUDNN_STATUS_NOT_SUPPORTED;
if (nbDims != 4) return CUDNN_STATUS_NOT_SUPPORTED;
return cudnnSetTensor4dDescriptorEx(
tensorDesc, dataType,
dimA[0], dimA[1], dimA[2], dimA[3],
......@@ -204,7 +204,7 @@ cudnnSetPoolingNdDescriptor(
int nbDims,
const int windowDimA[],
const int paddingA[],
const in strideA[]) {
const int strideA[]) {
if (nbDims != 2) return CUDNN_STATUS_NOT_SUPPORTED;
if (paddingA[0] != 0 || paddingA[1] != 0) return CUDNN_STATUS_NOT_SUPPORTED;
return cudnnSetPoolingDescriptor(poolingDesc, mode,
......@@ -223,7 +223,7 @@ cudnnGetPoolingNdDescriptor(
int strideA[]) {
int win0, win1, str0, str1;
cudnnStatus_t err;
if (ndDimsRequested < 2) return CUDNN_STATUS_NOT_SUPPORTED;
if (nbDimsRequested < 2) return CUDNN_STATUS_NOT_SUPPORTED;
err = cudnnGetPoolingDescriptor(poolingDesc, mode, &win0, &win1,
&str0, &str1);
if (err != CUDNN_STATUS_SUCCESS) return err;
......
......@@ -1760,7 +1760,7 @@ def dnn_pool(img, ws, stride=(1, 1), mode='max', pad=(0, 0)):
Subsampling stride (default: (1, 1)).
mode : {'max', 'average_inc_pad', 'average_exc_pad'}
pad
(pad_h, pad_w) padding information.
(pad_h, pad_w) padding information.
pad_h is the number of zero-valued pixels added to each of the top and
bottom borders.
pad_w is the number of zero-valued pixels added to each of the left
......
......@@ -104,7 +104,7 @@ optdb.register('gpu_after_fusion',
'gpu')
# Register merge_optimizer as a global opt
gpu_optimizer.register('gpu_merge', theano.gof.opt.merge_optimizer,
gpu_optimizer.register('gpu_merge', theano.gof.opt.MergeOptimizer(),
'fast_run', 'fast_compile', final_opt=True)
......
......@@ -81,7 +81,7 @@ class CudaNdarrayType(Type):
raise TypeError('%s only supports dtype float32 for now. Tried '
'using dtype %s for variable %s' %
(self.__class__.__name__, dtype, name))
self.broadcastable = tuple(broadcastable)
self.broadcastable = tuple(bool(b) for b in broadcastable)
self.name = name
self.dtype_specs() # error checking is done there
......
......@@ -2673,7 +2673,7 @@ class Alloc(gof.Op):
const_shp = get_scalar_constant_value(s)
except NotScalarConstantError:
const_shp = None
bcast.append(numpy.all(1 == const_shp))
bcast.append(1 == const_shp)
return sh, bcast
def make_node(self, value, *shape):
......@@ -6037,7 +6037,7 @@ class AllocEmpty(gof.Op):
const_shp = get_scalar_constant_value(s)
except NotScalarConstantError:
const_shp = None
bcast.append(numpy.all(1 == const_shp))
bcast.append(1 == const_shp)
otype = TensorType(dtype=self.dtype, broadcastable=bcast)
output = otype()
return sh, output
......
Diff collapsed.
......@@ -256,7 +256,10 @@ class DownsampleFactorMax(Op):
raise TypeError()
# TODO: consider restricting the dtype?
x = tensor.as_tensor_variable(x)
return gof.Apply(self, [x], [x.type()])
# If the input shapes are broadcastable, we can have 0 in the output shape
broad = x.broadcastable[:2] + (False, False)
out = tensor.TensorType(x.dtype, broad)
return gof.Apply(self, [x], [out()])
def perform(self, node, inp, out):
x, = inp
......
......@@ -801,6 +801,16 @@ class TestDownsampleFactorMax(utt.InferShapeTester):
[image_val, maxout_val, gz_val],
MaxPoolGrad,
warn=False)
# checking with broadcastable input
image = tensor.tensor(dtype='float64',
broadcastable=(False, False, True, True))
image_val = rng.rand(4, 6, 1, 1)
self._compile_and_check(
[image],
[DownsampleFactorMax((2, 2),
ignore_border=True,
padding=(0, 0))(image)],
[image_val], DownsampleFactorMax)
def test_opt_max_to_average(self):
im = theano.tensor.tensor4()
......
......@@ -481,7 +481,7 @@ class test_canonize(unittest.TestCase):
mode = compile.mode.get_default_mode()
opt = gof.Query(["canonicalize"])
opt = opt.including('ShapeOpt')
opt = opt.including('ShapeOpt', 'local_fill_to_alloc')
opt = opt.excluding(
'local_elemwise_fusion')
mode = mode.__class__(linker=mode.linker, optimizer=opt)
......@@ -4021,7 +4021,8 @@ class T_Rebroadcast(unittest.TestCase):
class T_useless_elemwise(unittest.TestCase):
def setUp(self):
self.mode = theano.compile.get_default_mode().including('canonicalize')
self.mode = theano.compile.get_default_mode().including(
'canonicalize', 'local_fill_to_alloc')
def test_eq(self):
x = T.dmatrix()
......@@ -4545,7 +4546,7 @@ class T_local_erfc(unittest.TestCase):
# test that we work without the mul
f = theano.function([x], T.exp(T.neg(T.sqr(x))) / T.erfc(x), mode=mode)
assert len(f.maker.fgraph.apply_nodes) == 23, len(f.maker.fgraph.apply_nodes)
assert len(f.maker.fgraph.apply_nodes) == 22, len(f.maker.fgraph.apply_nodes)
assert f.maker.fgraph.outputs[0].dtype == theano.config.floatX
assert all(numpy.isfinite(f(val)))
......@@ -4558,7 +4559,7 @@ class T_local_erfc(unittest.TestCase):
# test that we work without the sqr and neg
f = theano.function([x], T.exp(T.mul(-1, x, x)) / T.erfc(x), mode=mode)
assert len(f.maker.fgraph.apply_nodes) == 22, len(f.maker.fgraph.apply_nodes)
assert len(f.maker.fgraph.apply_nodes) == 21, len(f.maker.fgraph.apply_nodes)
assert f.maker.fgraph.outputs[0].dtype == theano.config.floatX
assert all(numpy.isfinite(f(val)))
......
Markdown format
0%
You are adding 0 people to this discussion. Please proceed carefully.
Please finish editing this comment first!
Register or sign in to comment