提交 028459c3 authored 作者: abergeron's avatar abergeron

Merge pull request #3768 from nouiz/nanguardmode

Don't let scan AllocEmpty cause false alarm by NanGuardMode
...@@ -913,6 +913,23 @@ documentation: ...@@ -913,6 +913,23 @@ documentation:
.. automodule:: theano.misc.doubleop .. automodule:: theano.misc.doubleop
:members: :members:
NanGuardMode and AllocEmpty
---------------------------
NanGuardMode helps users find where in the graph NaNs appear. But
sometimes, we want some variables not to be checked. For example, in
the old GPU back-end, we use a float32 CudaNdarray to store the MRG
random number generator state (they are integers). So if NanGuardMode
checked it, it would generate false positives. Another case is related to
[Gpu]AllocEmpty or some computation on it (like done by Scan).
You can tell NanGuardMode not to check a variable with:
``variable.tag.nan_guard_mode_check``. Also, this tag automatically
follows that variable during optimization. This means if you tag a
variable that gets replaced by an inplace version, it will keep that
tag.
Final Note Final Note
---------- ----------
......
...@@ -199,7 +199,7 @@ def std_fgraph(input_specs, output_specs, accept_inplace=False): ...@@ -199,7 +199,7 @@ def std_fgraph(input_specs, output_specs, accept_inplace=False):
return fgraph, list(map(SymbolicOutput, updates)) return fgraph, list(map(SymbolicOutput, updates))
std_fgraph.features = [gof.toolbox.PreserveNames] std_fgraph.features = [gof.toolbox.PreserveVariableAttributes]
class AliasedMemoryError(Exception): class AliasedMemoryError(Exception):
......
...@@ -416,7 +416,7 @@ def get_mode(orig_string): ...@@ -416,7 +416,7 @@ def get_mode(orig_string):
elif string == 'NanGuardMode': elif string == 'NanGuardMode':
# need to import later to break circular dependency. # need to import later to break circular dependency.
from .nanguardmode import NanGuardMode from .nanguardmode import NanGuardMode
# DebugMode use its own linker. # NanGuardMode use its own linker.
ret = NanGuardMode(True, True, True, optimizer=config.optimizer) ret = NanGuardMode(True, True, True, optimizer=config.optimizer)
else: else:
# This might be required if the string is 'ProfileMode' # This might be required if the string is 'ProfileMode'
......
...@@ -297,12 +297,14 @@ class NanGuardMode(Mode): ...@@ -297,12 +297,14 @@ class NanGuardMode(Mode):
# If the input is the result of computation, then we # If the input is the result of computation, then we
# don't need to check it. It is already done after the # don't need to check it. It is already done after the
# computation. # computation.
if var.owner is not None: if (var.owner is None and
getattr(var.tag, 'nan_guard_mode_check', True)):
do_check_on(x[0], node, fn, True) do_check_on(x[0], node, fn, True)
fn() fn()
outputs = fn.outputs outputs = fn.outputs
for x in outputs: for x, var in zip(outputs, node.outputs):
do_check_on(x[0], node, fn, False) if getattr(var.tag, 'nan_guard_mode_check', True):
do_check_on(x[0], node, fn, False)
wrap_linker = theano.gof.WrapLinker([theano.gof.OpWiseCLinker()], wrap_linker = theano.gof.WrapLinker([theano.gof.OpWiseCLinker()],
nan_check) nan_check)
......
...@@ -455,10 +455,28 @@ class PrintListener(Feature): ...@@ -455,10 +455,28 @@ class PrintListener(Feature):
class PreserveNames(Feature): class PreserveNames(Feature):
"""
This preserves some variable names during optimization.
Deprecated. We need to keep it to allow unpickling.
"""
def on_change_input(self, fgraph, node, i, r, new_r, reason=None):
if r.name is not None and new_r.name is None:
new_r.name = r.name
class PreserveVariableAttributes(Feature):
"""
This preserves some variable attributes and tags during optimization.
"""
def on_change_input(self, fgraph, node, i, r, new_r, reason=None): def on_change_input(self, fgraph, node, i, r, new_r, reason=None):
if r.name is not None and new_r.name is None: if r.name is not None and new_r.name is None:
new_r.name = r.name new_r.name = r.name
if getattr(r.tag, 'nan_guard_mode_check', False) and getattr(
new_r.tag, 'nan_guard_mode_check', False) is False:
new_r.tag.nan_guard_mode_check = r.tag.nan_guard_mode_check
class NoOutputFromInplace(Feature): class NoOutputFromInplace(Feature):
......
...@@ -51,8 +51,8 @@ def mysend(subject, file): ...@@ -51,8 +51,8 @@ def mysend(subject, file):
# Open the files in binary mode. Let the MIMEImage class automatically # Open the files in binary mode. Let the MIMEImage class automatically
# guess the specific image type. # guess the specific image type.
fp = open(file, 'rb') with open(file, 'rb') as fp:
s=fp.read() s=fp.read()
failures=0 failures=0
errors=0 errors=0
ran=False ran=False
...@@ -115,7 +115,6 @@ def mysend(subject, file): ...@@ -115,7 +115,6 @@ def mysend(subject, file):
s = ("Summary of the output:\n\n" + filter_output(open(file)) + s = ("Summary of the output:\n\n" + filter_output(open(file)) +
"\n\nFull output:\n\n" + s) "\n\nFull output:\n\n" + s)
img = MIMEText(s) img = MIMEText(s)
fp.close()
msg.attach(img) msg.attach(img)
# Send the email via our own SMTP server. # Send the email via our own SMTP server.
......
...@@ -2436,7 +2436,7 @@ class GpuReshape(tensor.Reshape, GpuOp): ...@@ -2436,7 +2436,7 @@ class GpuReshape(tensor.Reshape, GpuOp):
""" """
# __hash__, __eq__, __str__ come from tensor.Subtensor # __hash__, __eq__, __str__ come from tensor.Reshape
def make_node(self, x, shp): def make_node(self, x, shp):
x = as_cuda_ndarray_variable(x) x = as_cuda_ndarray_variable(x)
shp = tensor.as_tensor_variable(shp) shp = tensor.as_tensor_variable(shp)
...@@ -3680,6 +3680,7 @@ class GpuAllocEmpty(GpuOp): ...@@ -3680,6 +3680,7 @@ class GpuAllocEmpty(GpuOp):
# The outut can contain nan/inf. output.type is a new # The outut can contain nan/inf. output.type is a new
# instance, so we can do this only for that variable. # instance, so we can do this only for that variable.
output.type.filter_checks_isfinite = False output.type.filter_checks_isfinite = False
output.tag.nan_guard_mode_check = False
return Apply(self, shape, [output]) return Apply(self, shape, [output])
def debug_perform(self, node, inputs, out_): def debug_perform(self, node, inputs, out_):
......
...@@ -1190,14 +1190,16 @@ def local_gpu_incsubtensor(node): ...@@ -1190,14 +1190,16 @@ def local_gpu_incsubtensor(node):
# The IncSubtensor upcast to float32 y, so we do it # The IncSubtensor upcast to float32 y, so we do it
# explicitly to move it to the GPU. # explicitly to move it to the GPU.
y = y.astype('float32') y = y.astype('float32')
ret = GpuIncSubtensor(
return [GpuIncSubtensor(
incsubt.idx_list, incsubt.idx_list,
inplace=incsubt.inplace, inplace=incsubt.inplace,
set_instead_of_inc=incsubt.set_instead_of_inc)( set_instead_of_inc=incsubt.set_instead_of_inc)(
as_cuda_ndarray_variable(x), as_cuda_ndarray_variable(x),
as_cuda_ndarray_variable(y), as_cuda_ndarray_variable(y),
*coords)] *coords)
ret.tag.nan_guard_mode_check = getattr(
host_output.tag, 'nan_guard_mode_check', True)
return [ret]
# Incrementing a float32 x results in a float32 # Incrementing a float32 x results in a float32
# output even if y is float64, so we can downcast # output even if y is float64, so we can downcast
# y to put it on GPU # y to put it on GPU
...@@ -1221,10 +1223,16 @@ def local_gpu_incsubtensor(node): ...@@ -1221,10 +1223,16 @@ def local_gpu_incsubtensor(node):
y = tensor.cast(y, 'float32') y = tensor.cast(y, 'float32')
gpu_y = as_cuda_ndarray_variable(y) gpu_y = as_cuda_ndarray_variable(y)
if go_gpu: if go_gpu:
return [host_from_gpu(GpuIncSubtensor( ret = GpuIncSubtensor(
node.op.idx_list, inplace=node.op.inplace, node.op.idx_list, inplace=node.op.inplace,
set_instead_of_inc=node.op.set_instead_of_inc)( set_instead_of_inc=node.op.set_instead_of_inc)(
gpu_x, gpu_y, *coords))] gpu_x, gpu_y, *coords)
val = getattr(node.outputs[0].tag, 'nan_guard_mode_check', True)
ret.tag.nan_guard_mode_check = val
ret = host_from_gpu(ret)
ret.tag.nan_guard_mode_check = val
return [ret]
return False return False
...@@ -2532,6 +2540,20 @@ def local_gpu_allocempty(node): ...@@ -2532,6 +2540,20 @@ def local_gpu_allocempty(node):
return False return False
# Don't register by default.
@gof.local_optimizer([GpuAllocEmpty])
def local_gpu_alloc_empty_to_zeros(node):
# We need the exact match as GpuAlloc inherit from GpuAllocEmpty.
if type(node.op) is GpuAllocEmpty:
return [gpu_alloc(theano.tensor.constant(0, dtype='float32'),
*node.inputs)]
optdb.register('local_gpu_alloc_empty_to_zeros',
theano.tensor.opt.in2out(local_gpu_alloc_empty_to_zeros),
# After move to gpu and merge2, before inplace.
49.3,
'alloc_empty_to_zeros',)
def typeInfer(node): def typeInfer(node):
return typeConstructor return typeConstructor
......
...@@ -721,6 +721,7 @@ class GpuAllocEmpty(HideC, Alloc): ...@@ -721,6 +721,7 @@ class GpuAllocEmpty(HideC, Alloc):
output.tag.values_eq_approx = tensor.type.values_eq_approx_always_true output.tag.values_eq_approx = tensor.type.values_eq_approx_always_true
# The outut can contain nan/inf. # The outut can contain nan/inf.
output.type.filter_checks_isfinite = False output.type.filter_checks_isfinite = False
output.tag.nan_guard_mode_check = False
return Apply(self, sh, [output]) return Apply(self, sh, [output])
def debug_perform(self, node, inputs, out_, ctx): def debug_perform(self, node, inputs, out_, ctx):
......
...@@ -300,6 +300,21 @@ def local_gpualloc_memset_0(node): ...@@ -300,6 +300,21 @@ def local_gpualloc_memset_0(node):
return [new_op(*node.inputs)] return [new_op(*node.inputs)]
# Don't register by default.
@gof.local_optimizer([GpuAllocEmpty])
def local_gpua_alloc_empty_to_zeros(node):
if isinstance(node.op, GpuAllocEmpty):
context_name = infer_context_name(*node.inputs)
z = numpy.asarray(0, dtype=node.outputs[0].dtype)
return [GpuAlloc()(as_gpuarray_variable(z, context_name),
*node.inputs)]
optdb.register('local_gpua_alloc_empty_to_zeros',
theano.tensor.opt.in2out(local_gpua_alloc_empty_to_zeros),
# After move to gpu and merge2, before inplace.
49.3,
'alloc_empty_to_zeros',)
@register_opt() @register_opt()
@local_optimizer([GpuContiguous]) @local_optimizer([GpuContiguous])
def local_gpu_contiguous_gpu_contiguous(node): def local_gpu_contiguous_gpu_contiguous(node):
...@@ -569,9 +584,13 @@ def local_gpua_subtensor(node, context_name): ...@@ -569,9 +584,13 @@ def local_gpua_subtensor(node, context_name):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.IncSubtensor]) @op_lifter([tensor.IncSubtensor])
def local_gpua_incsubtensor(node, context_name): def local_gpua_incsubtensor(node, context_name):
return GpuIncSubtensor(node.op.idx_list, node.op.inplace, op = GpuIncSubtensor(node.op.idx_list, node.op.inplace,
node.op.set_instead_of_inc, node.op.set_instead_of_inc,
node.op.destroyhandler_tolerate_aliased) node.op.destroyhandler_tolerate_aliased)
ret = op(*node.inputs)
val = getattr(node.outputs[0].tag, 'nan_guard_mode_check', True)
ret.tag.nan_guard_mode_check = val
return ret
@register_opt('fast_compile') @register_opt('fast_compile')
......
...@@ -620,7 +620,9 @@ def expand_empty(tensor_var, size): ...@@ -620,7 +620,9 @@ def expand_empty(tensor_var, size):
new_shape = [size + shapes[0]] + shapes[1:] new_shape = [size + shapes[0]] + shapes[1:]
empty = tensor.AllocEmpty(tensor_var.dtype)(*new_shape) empty = tensor.AllocEmpty(tensor_var.dtype)(*new_shape)
return tensor.set_subtensor(empty[:shapes[0]], tensor_var) ret = tensor.set_subtensor(empty[:shapes[0]], tensor_var)
ret.tag.nan_guard_mode_check = False
return ret
def equal_computations(xs, ys, in_xs=None, in_ys=None): def equal_computations(xs, ys, in_xs=None, in_ys=None):
......
...@@ -6241,6 +6241,13 @@ class AllocEmpty(gof.Op): ...@@ -6241,6 +6241,13 @@ class AllocEmpty(gof.Op):
# The outut can contain nan/inf. output.type is a new # The outut can contain nan/inf. output.type is a new
# instance, so we can do this only for that variable. # instance, so we can do this only for that variable.
output.type.filter_checks_isfinite = False output.type.filter_checks_isfinite = False
# We can't reuse filter_checks_isfinite as by default it is
# False and it is set to true only in DebugMode.
# We can't set it in the type as other make_node can reuse the type.
# We can't set it in the variable as it isn't copied when we copy
# the variable. So we set it in the tag.
output.tag.nan_guard_mode_check = False
return Apply(self, shape, [output]) return Apply(self, shape, [output])
def debug_perform(self, node, inputs, out_): def debug_perform(self, node, inputs, out_):
......
...@@ -1733,6 +1733,26 @@ def local_useless_alloc(node): ...@@ -1733,6 +1733,26 @@ def local_useless_alloc(node):
return [node.inputs[0]] return [node.inputs[0]]
# Don't register by default.
@gof.local_optimizer([T.AllocEmpty])
def local_alloc_empty_to_zeros(node):
"""This convert AllocEmpty to Alloc of 0.
This help investigate NaN with NanGuardMode. Not registered by
default. To activate it, use the Theano flag
optimizer_including=alloc_empty_to_zeros. This also enable
the GPU version of this optimizations.
"""
if isinstance(node.op, T.AllocEmpty):
return [T.zeros(node.inputs, dtype=node.outputs[0].dtype)]
compile.optdb.register('local_alloc_empty_to_zeros',
in2out(local_alloc_empty_to_zeros),
# After move to gpu and merge2, before inplace.
49.3,
'alloc_empty_to_zeros',)
@register_specialize @register_specialize
@register_canonicalize @register_canonicalize
@gof.local_optimizer([T.shape]) @gof.local_optimizer([T.shape])
...@@ -3043,6 +3063,9 @@ def local_inplace_setsubtensor(node): ...@@ -3043,6 +3063,9 @@ def local_inplace_setsubtensor(node):
set_instead_of_inc=node.op.set_instead_of_inc, set_instead_of_inc=node.op.set_instead_of_inc,
destroyhandler_tolerate_aliased=dta) destroyhandler_tolerate_aliased=dta)
new_node = new_op(*node.inputs) new_node = new_op(*node.inputs)
val = getattr(node.outputs[0].tag, 'nan_guard_mode_check', True)
new_node.tag.nan_guard_mode_check = val
# Copy stacktrace from original outputs to new outputs. # Copy stacktrace from original outputs to new outputs.
# This is sensible, because the new operation is the # This is sensible, because the new operation is the
# same as the old one, but now with different attributes. # same as the old one, but now with different attributes.
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论