Merge pull request #5902 from lamblin/fix_debugmode

[BUG, CRASH] Fixes in DebugMode for GPU

Merge pull request #5902 from lamblin/fix_debugmode
26254645 · Frédéric Bastien · GitHub · fbb066c4 · be375941 · 26254645
--- a/.jenkins/jenkins_buildbot_python2_debug.sh
+++ b/.jenkins/jenkins_buildbot_python2_debug.sh
@@ -9,6 +9,40 @@ export PATH=/usr/local/cuda/bin:$PATH
 export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
 export LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH
+GPUARRAY_CONFIG="Release"
+DEVICE=cuda0
+LIBDIR=${WORKSPACE}/local
+# Make a fresh, shallow clone of libgpuarray (no history, since we don't need it)
+rm -rf libgpuarray
+git clone --depth 1 "https://github.com/Theano/libgpuarray.git"
+# Clean up previous installs (to make sure no old files are left)
+rm -rf $LIBDIR
+mkdir $LIBDIR
+# Build libgpuarray
+mkdir libgpuarray/build
+(cd libgpuarray/build && cmake .. -DCMAKE_BUILD_TYPE=${GPUARRAY_CONFIG} -DCMAKE_INSTALL_PREFIX=$LIBDIR && make)
+# Finally install
+(cd libgpuarray/build && make install)
+# Export paths
+export CPATH=$CPATH:$LIBDIR/include
+export LIBRARY_PATH=$LIBRARY_PATH:$LIBDIR/lib
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$LIBDIR/lib
+# Build the pygpu modules
+(cd libgpuarray && python setup.py build_ext --inplace -I$LIBDIR/include -L$LIBDIR/lib)
+ls $LIBDIR
+mkdir $LIBDIR/lib/python
+export PYTHONPATH=${PYTHONPATH}:$LIBDIR/lib/python
+# Then install
+(cd libgpuarray && python setup.py install --home=$LIBDIR)
+python -c 'import pygpu; print(pygpu.__file__)'
 # nosetests xunit for test profiling
 XUNIT="--with-xunit --xunit-file="

--- a/theano/gpuarray/blas.py
+++ b/theano/gpuarray/blas.py
@@ -73,8 +73,12 @@ class GpuGemv(BlasOp):
        inplace = self.inplace
        if inplace and y.strides[0] < 0:
            inplace = False
-        out_storage[0][0] = blas.gemv(alpha, A, x, beta, y,
+        if A.shape[1] == 0:
-                                      overwrite_y=inplace)
+            out_storage[0][0] = pygpu.zeros(y.shape, dtype=y.dtype,
+                                            context=y.context)
+        else:
+            out_storage[0][0] = blas.gemv(alpha, A, x, beta, y,
+                                          overwrite_y=inplace)
    def c_code(self, node, name, inp, out, sub):
        vars = dict(out=out[0], y=inp[0], alpha=inp[1], A=inp[2], x=inp[3],
@@ -119,11 +123,11 @@ class GpuGemv(BlasOp):
            if (%(A)s->ga.flags & GA_C_CONTIGUOUS) {
                ssize_t a_stride0 = %(A)s->ga.strides[0];
                %(A)s->ga.strides[0] = %(A)s->ga.strides[1];
-                if (pygpu_blas_rdot(%(x)s, %(A)s, %(y)s, 0) == -1) {
+                if (pygpu_blas_rdot(%(x)s, %(A)s, %(out)s, 0) == -1) {
                    %(fail)s
                }
                %(A)s->ga.strides[0] = a_stride0;
-            } else if (pygpu_blas_rdot(%(x)s, %(A)s, %(y)s, 0) == -1) {
+            } else if (pygpu_blas_rdot(%(x)s, %(A)s, %(out)s, 0) == -1) {
                %(fail)s
            }
            %(out)s->ga.nd = 1;
@@ -145,7 +149,7 @@ class GpuGemv(BlasOp):
        return code
    def c_code_cache_version(self):
-        return (6,)
+        return (7,)
 gpugemv_no_inplace = GpuGemv(inplace=False)
 gpugemv_inplace = GpuGemv(inplace=True)

--- a/theano/gpuarray/multinomial.py
+++ b/theano/gpuarray/multinomial.py
@@ -309,39 +309,29 @@ KERNEL void k_multi_warp_multinomial_wor(
    if (n < nb_multi)
    {
+        // Sum of the remaining p_vals in global_pvals_copy[n]
+        float pvals_sum = 1.;
        for (int c = 0; c < n_samples; ++c)
        {
            float cummul = 0.;
-            bool done = false;
+            const float unis_n = global_unis[(c * nb_multi + n)*unis_stride] * pvals_sum;
-            const float unis_n = global_unis[(c * nb_multi + n)*unis_stride];
            for (ga_size m = 0; m < nb_outcomes; ++m)
            {
                float pvals_nm = global_pvals_copy[m * pvals_col_stride + n * pvals_row_stride];
                cummul += pvals_nm;
-                if (!done && unis_n < cummul)
+                if (unis_n < cummul)
                {
-                    //write out transposed for speed.
+                    // write out transposed for speed.
                    global_outs[n * outs_col_stride +
                                c * outs_row_stride] = m;
                    if (! %(replace)s )
                    {
                        global_pvals_copy[m * pvals_col_stride + n * pvals_row_stride] = 0.0;
-                        cummul -= pvals_nm;
+                        pvals_sum -= pvals_nm;
                    }
-                    done = true;
+                    break;
-                }
-            }
-            // No need to renormalize after the last samples.
-            if (c == (n_samples - 1))
-                break;
-            if (! %(replace)s )
-            {
-                // parallel renormalize the multinomial
-                for (ga_int k = LID_1; k < nb_outcomes; k+=LDIM_1)
-                {
-                    global_pvals_copy[k * pvals_col_stride + n * pvals_row_stride] /= cummul;
                }
            }
        }
@@ -402,9 +392,12 @@ KERNEL void k_multi_warp_multinomial_wor(
        PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[0] * n");
        %(fail)s
    }
+    if (! %(replace)s) {
-    pvals_copy = pygpu_copy(pvals, GA_C_ORDER);
+        pvals_copy = pygpu_copy(pvals, GA_C_ORDER);
+    } else {
+        pvals_copy = pvals;
+        Py_INCREF(pvals_copy);
+    }
    dims[0] = n_samples;
    dims[1] = PyGpuArray_DIMS(pvals)[0];
@@ -466,18 +459,7 @@ KERNEL void k_multi_warp_multinomial_wor(
        args[9] = (void*)&strides[3];
        args[10] = (void*)&strides[4];
-        size_t nb_threads2[2], nb_blocks2[2];
+        err = GpuKernel_call(&%(kname)s, 1, &nb_blocks, &nb_threads, 0, args);
-        nb_threads2[0] = nb_threads;
-        nb_threads2[1] = 1;
-        // If we can't schedule enough threads parallelize the renormalization.
-        // I do this because we don't always use those extra threads.
-        if ((nb_threads * nb_blocks < 2048) && ! %(replace)d )
-            nb_threads2[1] = 1024 / nb_threads;
-        nb_blocks2[0] = nb_blocks;
-        nb_blocks2[1] = 1;
-        err = GpuKernel_call(&%(kname)s, 2, nb_blocks2, nb_threads2, 0, args);
        if (err != GA_NO_ERROR) {
           PyErr_Format(
                PyExc_RuntimeError,
@@ -495,7 +477,7 @@ KERNEL void k_multi_warp_multinomial_wor(
        return s
    def c_code_cache_version(self):
-        return (4,)
+        return (7,)
 @register_opt('fast_compile')
@@ -528,7 +510,7 @@ def local_gpua_multinomial_wor(op, context_name, inputs, outputs):
    p, u, n = inputs
    m, = outputs
    if ((p.dtype == u.dtype == 'float32') and (m.dtype == 'int64')):
-        gpu_op = GPUAChoiceFromUniform(op.odtype)
+        gpu_op = GPUAChoiceFromUniform(**op._props_dict())
        return GpuDimShuffle([False, False], [1, 0])(
            gpu_op(p, u, n))

--- a/theano/gpuarray/tests/test_basic_ops.py
+++ b/theano/gpuarray/tests/test_basic_ops.py
@@ -481,13 +481,12 @@ def test_hostfromgpu_shape_i():
 def test_Gpujoin_inplace():
-    """Test Gpujoin to work inplace.
+    # Test Gpujoin to work inplace.
+    #
-    This function tests the case when several elements are passed to the
+    # This function tests the case when several elements are passed to the
-    Gpujoin function but all except one of them are empty. In this case
+    # Gpujoin function but all except one of them are empty. In this case
-    Gpujoin should work inplace and the output should be the view of the
+    # Gpujoin should work inplace and the output should be the view of the
-    non-empty element.
+    # non-empty element.
-    """
    s = T.lscalar()
    data = np.array([3, 4, 5], dtype=theano.config.floatX)
    x = gpuarray_shared_constructor(data, borrow=True)
@@ -497,5 +496,6 @@ def test_Gpujoin_inplace():
    c = join(0, x, z)
    f = theano.function([s], theano.Out(c, borrow=True))
-    assert x.get_value(borrow=True, return_internal_type=True) is f(0)
+    if not isinstance(mode_with_gpu, theano.compile.DebugMode):
+        assert x.get_value(borrow=True, return_internal_type=True) is f(0)
    assert np.allclose(f(0), [3, 4, 5])
--- a/theano/gpuarray/tests/test_elemwise.py
+++ b/theano/gpuarray/tests/test_elemwise.py
 from __future__ import absolute_import, print_function, division
+from copy import copy
+from unittest import TestCase
 import numpy as np
 import scipy.special
 import theano
 from theano import scalar, gof, tensor
-from unittest import TestCase
+from theano.compile import DebugMode
 from theano.tests.unittest_tools import SkipTest, assert_allclose
 from theano.tensor.tests import test_elemwise
@@ -66,18 +69,32 @@ class TestMathErrorFunctions(TestCase):
    expected_erfinv_outputs = {}
    expected_erfcinv_outputs = {}
-    def setUp(self):
+    @classmethod
+    def setUpClass(cls):
        # NB: erfinv is defined in ]-1;1[, and erfcinv is defined in ]0;2[,
        # so we just take some values in an interval that covers both domains
        # (this will also allow to test some values outside the domains).
        # We take [-5;5[ by default and we concatenate it 1000 times
        # to have the GPU ops run on large data.
        default_array = [x / 10.0 for x in range(-50, 50)] * 1000
-        for dtype in self.dtypes:
+        for dtype in cls.dtypes:
            numpy_array = np.asarray(default_array, dtype=dtype)
-            self.default_arrays[dtype] = numpy_array
+            cls.default_arrays[dtype] = numpy_array
-            self.expected_erfinv_outputs[dtype] = scipy.special.erfinv(numpy_array)
+            cls.expected_erfinv_outputs[dtype] = scipy.special.erfinv(numpy_array)
-            self.expected_erfcinv_outputs[dtype] = scipy.special.erfcinv(numpy_array)
+            cls.expected_erfcinv_outputs[dtype] = scipy.special.erfcinv(numpy_array)
+        # Since there are infinite values, we need to disable the
+        # check_isfinite check when the mode is a DebugMode
+        if isinstance(mode_with_gpu, DebugMode):
+            cls.mode_with_gpu = copy(mode_with_gpu)
+            cls.mode_with_gpu.check_isfinite = False
+        else:
+            cls.mode_with_gpu = mode_with_gpu
+        if isinstance(mode_without_gpu, DebugMode):
+            cls.mode_without_gpu = copy(mode_without_gpu)
+            cls.mode_without_gpu.check_isfinite = False
+        else:
+            cls.mode_without_gpu = mode_without_gpu
    def check_gpu_scalar_op(self, theano_function, scalar_optype):
        for node in theano_function.maker.fgraph.apply_nodes:
@@ -90,8 +107,8 @@ class TestMathErrorFunctions(TestCase):
        for dtype in self.dtypes:
            vector = theano.tensor.vector(dtype=dtype)
            output = theano.tensor.erfinv(vector)
-            f_host = theano.function([vector], output, name='HOST/erfinv/' + dtype, mode=mode_without_gpu)
+            f_host = theano.function([vector], output, name='HOST/erfinv/' + dtype, mode=self.mode_without_gpu)
-            f_gpu = theano.function([vector], output, name='GPU/erfinv/' + dtype, mode=mode_with_gpu)
+            f_gpu = theano.function([vector], output, name='GPU/erfinv/' + dtype, mode=self.mode_with_gpu)
            assert len([n for n in f_host.maker.fgraph.apply_nodes if isinstance(n.op, GpuElemwise)]) == 0
            if not theano.config.device.startswith('opencl'):
                assert self.check_gpu_scalar_op(f_gpu, GpuErfinv), \
@@ -108,8 +125,8 @@ class TestMathErrorFunctions(TestCase):
        for dtype in self.dtypes:
            vector = theano.tensor.vector(dtype=dtype)
            output = theano.tensor.erfcinv(vector)
-            f_host = theano.function([vector], output, name='HOST/erfcinv/' + dtype, mode=mode_without_gpu)
+            f_host = theano.function([vector], output, name='HOST/erfcinv/' + dtype, mode=self.mode_without_gpu)
-            f_gpu = theano.function([vector], output, name='GPU/erfcinv/' + dtype, mode=mode_with_gpu)
+            f_gpu = theano.function([vector], output, name='GPU/erfcinv/' + dtype, mode=self.mode_with_gpu)
            assert len([n for n in f_host.maker.fgraph.apply_nodes if isinstance(n.op, GpuElemwise)]) == 0
            if not theano.config.device.startswith('opencl'):
                assert self.check_gpu_scalar_op(f_gpu, GpuErfcinv), \

--- a/theano/gpuarray/tests/test_extra_ops.py
+++ b/theano/gpuarray/tests/test_extra_ops.py
@@ -32,6 +32,17 @@ class TestGpuCumOp(theano.tensor.tests.test_extra_ops.TestCumOp):
        self.max_grid_size1 = test_ctx.maxgsize2
        self.op_class = CumOp
+        # The CPU implementation is not very accurate, which throws off DebugMode.
+        # Since propagating .tag.values_eq_approx to the output of every
+        # GpuFromHost seems overkill, we just relax the rtol for these tests.
+        self.old_rtol = theano.tensor.float32_rtol
+        theano.tensor.basic.float32_rtol *= 2
+    def tearDown(self):
+        super(TestGpuCumOp, self).tearDown()
+        # Restore rtol
+        theano.tensor.basic.float32_rtol = self.old_rtol
    @cum_modes
    def test_infer_shape(self, mode):
        # GpuCumOp is only defined for float32 for now, so we skip it

--- a/theano/gpuarray/tests/test_multinomial.py
+++ b/theano/gpuarray/tests/test_multinomial.py
@@ -327,30 +327,34 @@ def test_gpu_opt_wor():
    p = tensor.fmatrix()
    u = tensor.fvector()
    n = tensor.iscalar()
-    m = multinomial.ChoiceFromUniform(odtype='auto')(p, u, n)
+    for replace in [False, True]:
-    assert m.dtype == 'int64', m.dtype
+        m = multinomial.ChoiceFromUniform(odtype='auto',
+                                          replace=replace)(p, u, n)
-    f = function([p, u, n], m, allow_input_downcast=True, mode=mode_with_gpu)
+        assert m.dtype == 'int64', m.dtype
-    assert any([type(node.op) is GPUAChoiceFromUniform
-                for node in f.maker.fgraph.toposort()])
+        f = function([p, u, n], m, allow_input_downcast=True,
-    n_samples = 3
+                     mode=mode_with_gpu)
-    pval = np.arange(10000 * 4, dtype='float32').reshape((10000, 4)) + 0.1
+        assert any([type(node.op) is GPUAChoiceFromUniform
-    pval = pval / pval.sum(axis=1)[:, None]
+                    for node in f.maker.fgraph.toposort()])
-    uval = np.ones(pval.shape[0] * n_samples) * 0.5
+        n_samples = 3
-    f(pval, uval, n_samples)
+        pval = np.arange(10000 * 4, dtype='float32').reshape((10000, 4)) + 0.1
+        pval = pval / pval.sum(axis=1)[:, None]
+        uval = np.ones(pval.shape[0] * n_samples) * 0.5
+        f(pval, uval, n_samples)
-    # Test with a row, it was failing in the past.
+        # Test with a row, it was failing in the past.
-    r = tensor.frow()
+        r = tensor.frow()
-    m = multinomial.ChoiceFromUniform('auto')(r, u, n)
+        m = multinomial.ChoiceFromUniform('auto', replace=replace)(r, u, n)
-    assert m.dtype == 'int64', m.dtype
+        assert m.dtype == 'int64', m.dtype
-    f = function([r, u, n], m, allow_input_downcast=True, mode=mode_with_gpu)
+        f = function([r, u, n], m, allow_input_downcast=True,
-    assert any([type(node.op) is GPUAChoiceFromUniform
+                     mode=mode_with_gpu)
-                for node in f.maker.fgraph.toposort()])
+        assert any([type(node.op) is GPUAChoiceFromUniform
-    pval = np.arange(1 * 4, dtype='float32').reshape((1, 4)) + 0.1
+                    for node in f.maker.fgraph.toposort()])
-    pval = pval / pval.sum(axis=1)[:, None]
+        pval = np.arange(1 * 4, dtype='float32').reshape((1, 4)) + 0.1
-    uval = np.ones_like(pval[:, 0]) * 0.5
+        pval = pval / pval.sum(axis=1)[:, None]
-    f(pval, uval, 1)
+        uval = np.ones_like(pval[:, 0]) * 0.5
+        f(pval, uval, 1)
 def test_unpickle_legacy_op():

--- a/theano/sandbox/multinomial.py
+++ b/theano/sandbox/multinomial.py
@@ -212,7 +212,7 @@ class ChoiceFromUniform(MultinomialFromUniform):
    """
-    __props__ = ("replace",)
+    __props__ = ("odtype", "replace",)
    def __init__(self, odtype, replace=False, *args, **kwargs):
        self.replace = replace

--- a/theano/scan_module/tests/test_scan_checkpoints.py
+++ b/theano/scan_module/tests/test_scan_checkpoints.py
@@ -35,14 +35,14 @@ class TestScanCheckpoint(unittest.TestCase):
        self.grad_A_check = T.grad(self.result_check.sum(), self.A)
    def test_forward_pass(self):
-        """Test forward computation of A**k."""
+        # Test forward computation of A**k.
        f = theano.function(inputs=[self.A, self.k],
                            outputs=[self.result, self.result_check])
        out, out_check = f(range(10), 101)
        assert np.allclose(out, out_check)
    def test_backward_pass(self):
-        """Test gradient computation of A**k."""
+        # Test gradient computation of A**k.
        f = theano.function(inputs=[self.A, self.k],
                            outputs=[self.grad_A, self.grad_A_check])
        out, out_check = f(range(10), 101)
@@ -50,7 +50,7 @@ class TestScanCheckpoint(unittest.TestCase):
    @unittest.skipUnless(PYGPU_AVAILABLE, 'Requires pygpu.')
    def test_memory(self):
-        """Test that scan_checkpoint reduces memory usage."""
+        # Test that scan_checkpoint reduces memory usage.
        if None not in theano.gpuarray.type.list_contexts():
            return unittest.SkipTest('Requires gpuarray backend.')
        from theano.gpuarray.tests.config import mode_with_gpu  # noqa
@@ -63,9 +63,11 @@ class TestScanCheckpoint(unittest.TestCase):
        # Check that it works with the checkpoints
        f_check(data, 1000)
        # Check that the basic scan fails in that case
-        self.assertRaises(GpuArrayException, f, data, 1000)
+        # Skip that check in DebugMode, as it can fail in different ways
+        if not isinstance(mode_with_gpu, theano.compile.DebugMode):
+            self.assertRaises(GpuArrayException, f, data, 1000)
    def test_taps_error(self):
-        """Test that an error rises if we use taps in outputs_info."""
+        # Test that an error is raised if we use taps in outputs_info.
        self.assertRaises(RuntimeError, theano.scan_checkpoints,
                          lambda: None, [], {'initial': self.A, 'taps': [-2]})
--- a/theano/tensor/subtensor.py
+++ b/theano/tensor/subtensor.py
@@ -2001,7 +2001,7 @@ class AdvancedIncSubtensor1(Op):
        if self.set_instead_of_inc:
            x[idx] = y
        else:
-            if config.cxx:
+            if config.cxx and node.inputs[0].dtype != 'float16':
                increment = inplace_increment
            else:
                increment = self.inplace_increment1d_slow

--- a/theano/tensor/tests/test_elemwise.py
+++ b/theano/tensor/tests/test_elemwise.py
@@ -889,8 +889,10 @@ class T_reduce_dtype(unittest.TestCase):
                        (topo, output_dtype)
                    data = np.random.rand(3, 4) * 10
                    data = data.astype(input_dtype)
-                    if output_dtype == 'float16' and method == 'prod':
+                    if (method == 'prod' and output_dtype in
+                            ['float16', 'int8', 'uint8', 'int16', 'uint16']):
                        # We will likely get something infinite,
+                        # or the overflow will be different between CPU and GPU,
                        # and DebugMode will complain.
                        data = data[0:1]
                    f(data)