提交 51b39ada authored 作者: lamblin's avatar lamblin

Merge pull request #1441 from nouiz/fix_opt_crash

Fix opt crash of local_gpu_lazy_ifelse.
...@@ -1636,7 +1636,8 @@ class _Linker(gof.link.LocalLinker): ...@@ -1636,7 +1636,8 @@ class _Linker(gof.link.LocalLinker):
if not isinstance(node.op, gof.op.Op): if not isinstance(node.op, gof.op.Op):
raise utils.MethodNotDefined() raise utils.MethodNotDefined()
e = FunctionGraph(*graph.clone(node.inputs, node.outputs)) e = FunctionGraph(*graph.clone(node.inputs, node.outputs))
e.toposort = lambda: e.apply_nodes # WARNING: STOCHASTIC ORDER # The toposort isn't a stochastic order as it contain only one node.
e.toposort = lambda: list(e.apply_nodes)
# Specifically... e.nodes is a set, but of only 1 element # Specifically... e.nodes is a set, but of only 1 element
cl = CLinker().accept(e, [r for r, r2 in zip(e.outputs, cl = CLinker().accept(e, [r for r, r2 in zip(e.outputs,
...@@ -1679,6 +1680,8 @@ class _Linker(gof.link.LocalLinker): ...@@ -1679,6 +1680,8 @@ class _Linker(gof.link.LocalLinker):
storage_map, storage_map,
compute_map, compute_map,
no_recycling) no_recycling)
thunk.inputs = [storage_map[v] for v in node.inputs]
thunk.outputs = [storage_map[v] for v in node.outputs]
# Right now there is no op that when called check if # Right now there is no op that when called check if
# its ouputs are computed and don't recompute itself. # its ouputs are computed and don't recompute itself.
......
...@@ -1498,6 +1498,9 @@ class OpWiseCLinker(link.LocalLinker): ...@@ -1498,6 +1498,9 @@ class OpWiseCLinker(link.LocalLinker):
storage_map, storage_map,
compute_map, compute_map,
no_recycling)] no_recycling)]
thunks[-1].inputs = [storage_map[v] for v in node.inputs]
thunks[-1].outputs = [storage_map[v] for v in node.outputs]
finally: finally:
node.op._op_use_c_code = old_value node.op._op_use_c_code = old_value
......
...@@ -112,29 +112,37 @@ def raise_with_op(op, thunk=None, exc_info=None): ...@@ -112,29 +112,37 @@ def raise_with_op(op, thunk=None, exc_info=None):
if raise_with_op.print_thunk_trace: if raise_with_op.print_thunk_trace:
log_thunk_trace(exc_value) log_thunk_trace(exc_value)
if theano.config.exception_verbosity == 'high': detailed_err_msg = "\nApply node that caused the error: " + str(op)
f = StringIO.StringIO()
theano.printing.debugprint(op, file=f, stop_on_name=True) if thunk is not None:
if thunk is not None: if hasattr(thunk, 'inputs'):
shapes = [getattr(ipt[0], 'shape', 'No shapes') shapes = [getattr(ipt[0], 'shape', 'No shapes')
for ipt in thunk.inputs] for ipt in thunk.inputs]
strides = [getattr(ipt[0], 'strides', 'No strides') strides = [getattr(ipt[0], 'strides', 'No strides')
for ipt in thunk.inputs] for ipt in thunk.inputs]
detailed_err_msg = ("\nInputs shapes: %s \n" % shapes +
"Inputs strides: %s \n" % strides +
"Debugprint of the apply node: \n" +
f.getvalue())
else: else:
detailed_err_msg = "\nDebugprint of the apply node: \n" + f.getvalue() shapes = "The thunk don't have an inputs attributes."
strides = "So we can't access the storage inputs value"
types = [getattr(ipt, 'type', 'No type')
for ipt in op.inputs]
detailed_err_msg += ("\nInputs shapes: %s" % shapes +
"\nInputs strides: %s" % strides +
"\nInputs types: %s" % types)
else:
detailed_err_msg += ("\nUse another linker then the c linker to"
" have the inputs shapes and strides printed.")
if theano.config.exception_verbosity == 'high':
f = StringIO.StringIO()
theano.printing.debugprint(op, file=f, stop_on_name=True,
print_type=True)
detailed_err_msg += "\nDebugprint of the apply node: \n" + f.getvalue()
else: else:
detailed_err_msg = ("\nUse the Theano flag" detailed_err_msg += ("\nUse the Theano flag 'exception_verbosity=high'"
" 'exception_verbosity=high' for more" " for a debugprint of this apply node.")
" information on the inputs of this apply"
" node.") exc_value = exc_type(str(exc_value) + detailed_err_msg)
exc_value = exc_type(str(exc_value) +
"\nApply node that caused the error: " + str(op) +
detailed_err_msg)
raise exc_type, exc_value, exc_trace raise exc_type, exc_value, exc_trace
raise_with_op.print_thunk_trace = False raise_with_op.print_thunk_trace = False
...@@ -523,6 +531,8 @@ class PerformLinker(LocalLinker): ...@@ -523,6 +531,8 @@ class PerformLinker(LocalLinker):
storage_map, storage_map,
compute_map, compute_map,
no_recycling)] no_recycling)]
thunks[-1].inputs = [storage_map[v] for v in node.inputs]
thunks[-1].outputs = [storage_map[v] for v in node.outputs]
finally: finally:
node.op._op_use_c_code = old_value node.op._op_use_c_code = old_value
......
...@@ -431,6 +431,8 @@ class PureOp(object): ...@@ -431,6 +431,8 @@ class PureOp(object):
# compute output value once with test inputs to validate graph # compute output value once with test inputs to validate graph
thunk = node.op.make_thunk(node, storage_map, compute_map, thunk = node.op.make_thunk(node, storage_map, compute_map,
no_recycling=[]) no_recycling=[])
thunk.inputs = [storage_map[v] for v in node.inputs]
thunk.outputs = [storage_map[v] for v in node.outputs]
required = thunk() required = thunk()
assert not required # We provided all inputs assert not required # We provided all inputs
......
...@@ -843,6 +843,9 @@ class VM_Linker(link.LocalLinker): ...@@ -843,6 +843,9 @@ class VM_Linker(link.LocalLinker):
compute_map, compute_map,
no_recycling) no_recycling)
for node in order] for node in order]
for node, thunk in zip(order, thunks):
thunk.inputs = [storage_map[v] for v in node.inputs]
thunk.outputs = [storage_map[v] for v in node.outputs]
computed, last_user = link.gc_helper(order) computed, last_user = link.gc_helper(order)
if self.allow_gc: if self.allow_gc:
......
...@@ -652,7 +652,7 @@ class GpuConv(GpuOp): ...@@ -652,7 +652,7 @@ class GpuConv(GpuOp):
def c_code_cache_version(self): def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files # raise this whenever modifying any of the support_code_files
return (0, 19) return (0, 20)
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of # REMEMBER TO RAISE c_code_cache_version when changing any of
...@@ -704,6 +704,7 @@ class GpuConv(GpuOp): ...@@ -704,6 +704,7 @@ class GpuConv(GpuOp):
return NULL; return NULL;
} }
// TODO, make out be decref before we alloc out2!
CudaNdarray * out2 = (CudaNdarray *)CudaNdarray_Conv(%(img)s, %(kern)s, CudaNdarray * out2 = (CudaNdarray *)CudaNdarray_Conv(%(img)s, %(kern)s,
%(out)s, mode, %(out)s, mode,
dx, dy, dx, dy,
...@@ -711,6 +712,10 @@ class GpuConv(GpuOp): ...@@ -711,6 +712,10 @@ class GpuConv(GpuOp):
%(max_threads_dim0)s); %(max_threads_dim0)s);
Py_XDECREF(%(out)s); Py_XDECREF(%(out)s);
%(out)s = out2; %(out)s = out2;
if (%(out)s==NULL){
%(fail)s
}
""" % sub """ % sub
......
...@@ -1811,9 +1811,10 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t ...@@ -1811,9 +1811,10 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
{ {
PyErr_Format( PyErr_Format(
PyExc_RuntimeError, PyExc_RuntimeError,
"Cuda error: %s: %s.\n", "Cuda error: %s: %s. n_block=(%ld,%ld) n_threads=%ld\n",
"k4", "k5 with loop over k4",
cudaGetErrorString(err)); cudaGetErrorString(err),
(long) n_blocks.x, (long) n_blocks.y, (long) n_threads.x);
Py_XDECREF(new_other); Py_XDECREF(new_other);
return -1; return -1;
} }
...@@ -1831,14 +1832,17 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t ...@@ -1831,14 +1832,17 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
); );
while (n_blocks.x * n_blocks.y > NUM_VECTOR_OP_BLOCKS) while (n_blocks.x * n_blocks.y > NUM_VECTOR_OP_BLOCKS)
n_blocks.y /= 2; n_blocks.y /= 2;
while (n_blocks.x * n_blocks.y * n_blocks.z > NUM_VECTOR_OP_BLOCKS) // GTX285(compute capabilities 1.3) don't support n_blocks.z > 1
n_blocks.z /= 2; // (compute capabilities 2.0) support 65535 for n_blocks.z
//while (n_blocks.x * n_blocks.y * n_blocks.z > NUM_VECTOR_OP_BLOCKS)
// n_blocks.z /= 2;
n_blocks.z = 1;
dim3 n_threads( dim3 n_threads(
std::min( std::min(
CudaNdarray_HOST_DIMS(self)[3], CudaNdarray_HOST_DIMS(self)[3],
NUM_VECTOR_OP_THREADS_PER_BLOCK) NUM_VECTOR_OP_THREADS_PER_BLOCK)
//TODO: DON"T YOU NEED OT PUT DIMS[4] in here??? //TODO: DON'T YOU NEED TO PUT DIMS[4] in here???
//TODO: DON"T YOU NEED OT PUT DIMS[5] in here??? //TODO: DON'T YOU NEED TO PUT DIMS[5] in here???
); );
k6<<<n_blocks, n_threads>>>( k6<<<n_blocks, n_threads>>>(
CudaNdarray_HOST_DIMS(self)[0], CudaNdarray_HOST_DIMS(self)[0],
...@@ -1867,9 +1871,11 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t ...@@ -1867,9 +1871,11 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
{ {
PyErr_Format( PyErr_Format(
PyExc_RuntimeError, PyExc_RuntimeError,
"Cuda error: %s: %s.\n", "Cuda error: %s: %s. n_blocks=(%ld, %ld, %ld) n_threads=(%ld)\n",
"k4", "k6",
cudaGetErrorString(err)); cudaGetErrorString(err),
(long) n_blocks.x, (long) n_blocks.y, (long) n_blocks.z,
(long) n_threads.x);
Py_XDECREF(new_other); Py_XDECREF(new_other);
return -1; return -1;
} }
......
...@@ -403,7 +403,12 @@ def local_gpu_lazy_ifelse(node): ...@@ -403,7 +403,12 @@ def local_gpu_lazy_ifelse(node):
host_input = node.inputs[0] host_input = node.inputs[0]
if (host_input.owner and if (host_input.owner and
isinstance(host_input.owner.op, theano.ifelse.IfElse) and isinstance(host_input.owner.op, theano.ifelse.IfElse) and
not host_input.owner.op.gpu): not host_input.owner.op.gpu and
# If there is more then 1 outputs, we can't replace it
# here with a local optimizer as we replace the
# GpuFromHost node and the other output of the if won't be
# replaced.
host_input.owner.op.n_outs == 1):
gpu_ifelse = theano.ifelse.IfElse(host_input.owner.op.n_outs, gpu_ifelse = theano.ifelse.IfElse(host_input.owner.op.n_outs,
gpu=True) gpu=True)
......
import sys, time, unittest import sys
import numpy import numpy
# Skip test if cuda_ndarray is not available. # Skip test if cuda_ndarray is not available.
...@@ -7,7 +7,7 @@ from nose.plugins.skip import SkipTest ...@@ -7,7 +7,7 @@ from nose.plugins.skip import SkipTest
import theano import theano
from theano.compile.pfunc import pfunc from theano.compile.pfunc import pfunc
from theano import config, tensor from theano import config, tensor
import theano.sandbox.linalg.tests import theano.sandbox.linalg.tests.test_linalg
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
...@@ -48,28 +48,29 @@ def test_int_pow(): ...@@ -48,28 +48,29 @@ def test_int_pow():
op_names = [n.op.__class__.__name__ for n in f.maker.fgraph.toposort()] op_names = [n.op.__class__.__name__ for n in f.maker.fgraph.toposort()]
assert op_names == ['GpuCAReduce', 'GpuElemwise', 'HostFromGpu'] assert op_names == ['GpuCAReduce', 'GpuElemwise', 'HostFromGpu']
f = theano.function([a], tensor.pow(a,4).sum(), mode=mode_with_gpu) f = theano.function([a], tensor.pow(a, 4).sum(), mode=mode_with_gpu)
op_names = [n.op.__class__.__name__ for n in f.maker.fgraph.toposort()] op_names = [n.op.__class__.__name__ for n in f.maker.fgraph.toposort()]
assert op_names == ['GpuElemwise', 'GpuCAReduce', 'HostFromGpu'] assert op_names == ['GpuElemwise', 'GpuCAReduce', 'HostFromGpu']
#theano.printing.debugprint(f) #theano.printing.debugprint(f)
def test_gpualloc(): def test_gpualloc():
''' '''
This tests tries to catch the scenario when, due to infer_shape, This tests tries to catch the scenario when, due to infer_shape,
the input of the alloc changes from tesnor scalar to a constant the input of the alloc changes from tensor scalar to a constant
1. In this case the original constracted broadcastable pattern will 1. In this case the original constracted broadcastable pattern will
have a False for that dimension, but the new broadcastable pattern have a False for that dimension, but the new broadcastable pattern
that will be inserted by gpualloc will have a True since it knows the that will be inserted by gpualloc will have a True since it knows the
dimension is 1 and therefore broadcastable. dimension is 1 and therefore broadcastable.
''' '''
x = theano.shared(numpy.ones(3,dtype='float32'), 'x') x = theano.shared(numpy.ones(3, dtype='float32'), 'x')
m = (x).dimshuffle(['x',0]) m = (x).dimshuffle(['x', 0])
v = tensor.alloc(1., *m.shape) v = tensor.alloc(1., *m.shape)
f = theano.function([], v+x) f = theano.function([], v + x, mode=mode_with_gpu)
l = f.maker.fgraph.toposort() l = f.maker.fgraph.toposort()
assert numpy.any(ininstance(x.op, cuda.GpuAlloc) for x in l ) assert numpy.any([isinstance(x.op, cuda.GpuAlloc) for x in l])
def test_alloc_memset_0(): def test_alloc_memset_0():
......
...@@ -159,6 +159,49 @@ class test_ifelse(unittest.TestCase, utt.TestOptimizationMixin): ...@@ -159,6 +159,49 @@ class test_ifelse(unittest.TestCase, utt.TestOptimizationMixin):
assert numpy.all(outs_0[2] == 1.) assert numpy.all(outs_0[2] == 1.)
assert numpy.all(outs_0[3] == 1.) assert numpy.all(outs_0[3] == 1.)
def test_multiple_out_crash(self):
    """Regression test for an optimizer crash with a multi-output ifelse.

    This scenario failed up to commit 2faeb62c38: building and running a
    function whose updates come from an ``ifelse`` with 4 outputs.  The
    test only checks that compilation succeeds, that the graph contains
    the expected ifelse node, and that calling the function doesn't crash.
    """
    rand = numpy.random.random
    shapes = [[4, 8], [8], [8, 3], [3]]

    # One shared "position" variable per shape.
    p = [self.shared(numpy.asarray(rand(shp), dtype=self.dtype))
         for shp in shapes]

    # Symbolic sequences standing in for the outputs of a scan.
    ften0 = tensor.tensor3('ft0', dtype=self.dtype)
    fmat1 = tensor.matrix('fm1', dtype=self.dtype)
    ften2 = tensor.tensor3('ft2', dtype=self.dtype)
    fmat3 = tensor.matrix('fm3', dtype=self.dtype)

    # Keep only the last step of each sequence.
    fsub = [seq[-1] for seq in (ften0, fmat1, ften2, fmat3)]

    acc = theano.tensor.constant(1, 'int8') >= 0
    new_positions = theano.ifelse.ifelse(acc, fsub, p)
    new_updates = [(p[0], new_positions[0])]
    f = theano.function([ften0, fmat1, ften2, fmat3], [],
                        updates=new_updates, mode=self.mode)
    self.assertFunctionContains1(f, self.get_ifelse(4))

    # Concrete inputs: each sequence has 19 steps of the matching shape.
    inputs = [numpy.asarray(rand([19] + shp), dtype=self.dtype)
              for shp in shapes]
    f(*inputs)
def test_dtype_mismatch(self): def test_dtype_mismatch(self):
rng = numpy.random.RandomState(utt.fetch_seed()) rng = numpy.random.RandomState(utt.fetch_seed())
data = rng.rand(5).astype(self.dtype) data = rng.rand(5).astype(self.dtype)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论