提交 5890e98e authored 作者: lamblin's avatar lamblin

Merge pull request #580 from nouiz/gpu_setsubtensor

Gpu setsubtensor
......@@ -9,6 +9,13 @@ Bug fixes
(both in Python and Cython) since April 2011. (Pascal L.)
* In Sparse sandbox, fix the grad of theano.sparse.sandbox.sp.row_scale.
It did not return the right number of elements. (Frederic B.)
* set_subtensor(x[int vector], new_value), when moved to the GPU,
was transformed into inc_subtensor on the GPU. Now we have a slow
GPU implementation.
Note: set_subtensor(x[slice[,...]], new_value) was working correctly
in all cases, as well as inc_subtensor(*, *).
Note2: If your code has this behavior, we print a warning by default.
(Frederic B.)
Documentation
* Added in the tutorial documentation on how to extend Theano.
......@@ -81,6 +88,8 @@ Crash Fix
element-wise fusion optimization when upcasting some inputs to
float32 (to compute them on the GPU).
(Frederic B., reported by Sander Dieleman)
* GpuReshape in some particular case when the input is not contiguous
(Frederic B., reported by Sander Dieleman)
* GpuSoftmaxWithBias with shape (0, N) with N > 1.
(Frédéric B., reported by Razvan P.)
* Fix crash under 64-bit Windows, when taking subtensors of the form a[n:]
......@@ -89,6 +98,7 @@ Crash Fix
dimensions, which could typically result in optimization crashes (Olivier D.)
* Fixed crash when concatenating some arrays with specific broadcasting
patterns (Olivier D.)
* Work around a known issue with nvcc 4.1 on MacOS X. (Graham Taylor)
=============
Release Notes
......
......@@ -315,6 +315,13 @@ AddConfigVar('warn.subtensor_merge_bug',
BoolParam(warn_default('0.5')),
in_c_key=False)
AddConfigVar('warn.gpu_set_subtensor1',
"Warn if previous versions of Theano (before 0.6) could have given "
"incorrect results when moving to the gpu "
"set_subtensor(x[int vector], new_value)",
BoolParam(warn_default('0.6')),
in_c_key=False)
AddConfigVar('compute_test_value',
("If 'True', Theano will run each op at graph build time, using "
"Constants, SharedVariables and the tag 'test_value' as inputs "
......
......@@ -1002,21 +1002,40 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
self->nd, other->nd);
}
//standard elemwise size checks
if (self->nd != other->nd)
//standard elemwise nb dim checks
if (self->nd < other->nd)
{
PyErr_Format(
PyExc_TypeError,
"CudaNdarray_inplace_elemwise: need same number of dims. Got %d and %d",
"CudaNdarray_inplace_elemwise: The destination needs more or the"
" same number of dimensions than the source. Got %d and %d.",
self->nd, other->nd);
return -1;
}
//broadcast to the same number of dimensions.
int other_dims[self->nd];
int other_strides[self->nd];
int added_dims = self->nd - other->nd;
// Add the added broadcasted dimensions
for (int i = 0; i< added_dims; ++i)
{
other_dims[i] = 1;
other_strides[i] = 0;
}
// Copy the existing dimensions
for (int i = 0; i< other->nd; ++i)
{
other_dims[i+added_dims] = CudaNdarray_HOST_DIMS(other)[i];
other_strides[i+added_dims] = CudaNdarray_HOST_STRIDES(other)[i];
}
//standard elemwise dim checks
unsigned int size = 1;
for (int i = 0; i< self->nd; ++i)
{
if ((CudaNdarray_HOST_DIMS(self)[i] != CudaNdarray_HOST_DIMS(other)[i])
&& (CudaNdarray_HOST_DIMS(other)[i] != 1))
if ((CudaNdarray_HOST_DIMS(self)[i] != other_dims[i])
&& (other_dims[i] != 1))
{
PyErr_SetString(
PyExc_ValueError,
......@@ -1024,8 +1043,8 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
return -1;
}
// if we're broadcasting other, then make sure it has stride 0
assert ((CudaNdarray_HOST_DIMS(self)[i] == CudaNdarray_HOST_DIMS(other)[i])
|| (CudaNdarray_HOST_STRIDES(other)[i] == 0));
assert ((CudaNdarray_HOST_DIMS(self)[i] == other_dims[i])
|| (other_strides[i] == 0));
size *= (unsigned int) CudaNdarray_HOST_DIMS(self)[i];
}
......@@ -1090,7 +1109,7 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
CudaNdarray_DEV_DATA(other),
1, //strides
1,
CudaNdarray_HOST_STRIDES(other)[0]);
other_strides[0]);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if (cudaSuccess != err)
......@@ -1126,8 +1145,8 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
CudaNdarray_HOST_STRIDES(self)[1],
CudaNdarray_DEV_DATA(other),
1,
CudaNdarray_HOST_STRIDES(other)[0],
CudaNdarray_HOST_STRIDES(other)[1]);
other_strides[0],
other_strides[1]);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if (cudaSuccess != err)
......@@ -1165,9 +1184,9 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
CudaNdarray_HOST_STRIDES(self)[1],
CudaNdarray_HOST_STRIDES(self)[2],
CudaNdarray_DEV_DATA(other),
CudaNdarray_HOST_STRIDES(other)[0],
CudaNdarray_HOST_STRIDES(other)[1],
CudaNdarray_HOST_STRIDES(other)[2]);
other_strides[0],
other_strides[1],
other_strides[2]);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if (cudaSuccess != err)
......@@ -1208,10 +1227,10 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
CudaNdarray_HOST_STRIDES(self)[2],
CudaNdarray_HOST_STRIDES(self)[3],
CudaNdarray_DEV_DATA(other),
CudaNdarray_HOST_STRIDES(other)[0],
CudaNdarray_HOST_STRIDES(other)[1],
CudaNdarray_HOST_STRIDES(other)[2],
CudaNdarray_HOST_STRIDES(other)[3]);
other_strides[0],
other_strides[1],
other_strides[2],
other_strides[3]);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if (cudaSuccess != err)
......@@ -1252,11 +1271,11 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
CudaNdarray_HOST_STRIDES(self)[2],
CudaNdarray_HOST_STRIDES(self)[3],
CudaNdarray_HOST_STRIDES(self)[4],
CudaNdarray_DEV_DATA(other) + i * CudaNdarray_HOST_STRIDES(other)[0],
CudaNdarray_HOST_STRIDES(other)[1],
CudaNdarray_HOST_STRIDES(other)[2],
CudaNdarray_HOST_STRIDES(other)[3],
CudaNdarray_HOST_STRIDES(other)[4]);
CudaNdarray_DEV_DATA(other) + i * other_strides[0],
other_strides[1],
other_strides[2],
other_strides[3],
other_strides[4]);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
......@@ -1280,6 +1299,8 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
return -1;
}
}
if (verbose)
fprintf(stderr, "INPLACE ADD/DIV end\n");
return 0;
}
......@@ -2746,12 +2767,17 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * othe
//standard elemwise size checks
if (self->nd == -1)
{
PyErr_SetString(PyExc_TypeError, "can't copy into un-initialized CudaNdarray");
PyErr_SetString(PyExc_TypeError,
"can't copy into un-initialized CudaNdarray");
return -1;
}
if (self->nd != other->nd)
{
PyErr_Format(PyExc_NotImplementedError, "CudaNdarray_CopyFromCudaNdarray: need same number of dims. destination nd=%d, source nd=%d. No broadcasting implemented.", self->nd, other->nd);
PyErr_Format(PyExc_NotImplementedError,
"CudaNdarray_CopyFromCudaNdarray: need same number of"
" dims. destination nd=%d, source nd=%d."
" No broadcasting implemented.",
self->nd, other->nd);
return -1;
}
//standard elemwise dim checks (also compute total size)
......@@ -2762,8 +2788,11 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * othe
if ((CudaNdarray_HOST_DIMS(self)[i] != CudaNdarray_HOST_DIMS(other)[i])
&& (1!=CudaNdarray_HOST_DIMS(other)[i] || !unbroadcast) )
{
PyErr_Format(PyExc_ValueError, "need same dimensions for dim %d, destination=%d, source=%d",
i, CudaNdarray_HOST_DIMS(self)[i], CudaNdarray_HOST_DIMS(other)[i]);
PyErr_Format(PyExc_ValueError,
"need same dimensions for dim %d,"
" destination=%d, source=%d",
i, CudaNdarray_HOST_DIMS(self)[i],
CudaNdarray_HOST_DIMS(other)[i]);
return -1;
}
size *= (unsigned int) CudaNdarray_HOST_DIMS(self)[i];
......@@ -2773,12 +2802,15 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * othe
{
return 0; //nothing to copy, we're done.
}
if (CudaNdarray_is_c_contiguous(self) && CudaNdarray_is_c_contiguous(other) && size == size_source)
if (CudaNdarray_is_c_contiguous(self) &&
CudaNdarray_is_c_contiguous(other) &&
size == size_source)
{
if (verbose)
fprintf(stderr, "Copying contiguous vector with cublasScopy\n");
cublasScopy(size, CudaNdarray_DEV_DATA(other), 1, CudaNdarray_DEV_DATA(self), 1);
cublasScopy(size, CudaNdarray_DEV_DATA(other), 1,
CudaNdarray_DEV_DATA(self), 1);
CNDA_THREAD_SYNC;
if (CUBLAS_STATUS_SUCCESS != cublasGetError())
{
......@@ -2800,23 +2832,33 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * othe
{
if (verbose) fprintf(stderr, "Copying non-contiguous vector\n");
if (verbose) fprint_CudaNdarray(stderr, other);
unsigned int n_blocks = std::min(size, (unsigned int)NUM_VECTOR_OP_BLOCKS);
unsigned int n_threads = std::min(ceil_intdiv(size, n_blocks), (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
unsigned int n_blocks = std::min(size,
(unsigned int)NUM_VECTOR_OP_BLOCKS);
unsigned int n_threads = std::min(ceil_intdiv(size, n_blocks),
(unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
k_copy_1d<<<n_blocks, n_threads>>>(size,
CudaNdarray_DEV_DATA(other), CudaNdarray_HOST_STRIDES(other)[0],
CudaNdarray_DEV_DATA(self), CudaNdarray_HOST_STRIDES(self)[0]);
CudaNdarray_DEV_DATA(other),
CudaNdarray_HOST_STRIDES(other)[0],
CudaNdarray_DEV_DATA(self),
CudaNdarray_HOST_STRIDES(self)[0]);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s. (n_blocks=%i, n_threads_per_block=%i)\n", "k_copy_1d", cudaGetErrorString(err), n_blocks, n_threads);
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %s: %s. (n_blocks=%i,"
" n_threads_per_block=%i)\n", "k_copy_1d",
cudaGetErrorString(err), n_blocks, n_threads);
return -1;
}
}; break;
default:
{
assert (cudaSuccess == cudaGetLastError());
if (verbose) fprintf(stderr, "Copying with default version unbroadcast=%d\n", unbroadcast);
if (verbose)
fprintf(stderr,
"Copying with default version unbroadcast=%d\n",
unbroadcast);
// call worker routine
unsigned int threads_per_block = std::min(size,
(unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
......@@ -2830,18 +2872,27 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * othe
size,
(unsigned int)other->nd,
(const int *)CudaNdarray_DEV_DIMS(cuda_dims),
(const float*)CudaNdarray_DEV_DATA(other), (const int *)CudaNdarray_DEV_STRIDES(other),
CudaNdarray_DEV_DATA(self), (const int *)CudaNdarray_DEV_STRIDES(self));
(const float*)CudaNdarray_DEV_DATA(other),
(const int *)CudaNdarray_DEV_STRIDES(other),
CudaNdarray_DEV_DATA(self),
(const int *)CudaNdarray_DEV_STRIDES(self));
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if(verbose>1)
fprintf(stderr, "INFO k_elemwise_unary_rowmaj (n_blocks=%i, n_threads_per_block=%i)\n",
fprintf(stderr,
"INFO k_elemwise_unary_rowmaj (n_blocks=%i,"
" n_threads_per_block=%i)\n",
n_blocks, threads_per_block);
if( cudaSuccess != err)
{
//fprint_CudaNdarray(stderr, self);
//fprint_CudaNdarray(stderr, other);
PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s. (n_blocks=%i, n_threads_per_block=%i)\n", "k_elemwise_unary_rowmajor_copy", cudaGetErrorString(err), n_blocks, threads_per_block);
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %s: %s. (n_blocks=%i,"
" n_threads_per_block=%i)\n",
"k_elemwise_unary_rowmajor_copy",
cudaGetErrorString(err), n_blocks,
threads_per_block);
return -1;
}
}
......@@ -3832,7 +3883,7 @@ cnda_mark_dev_structure_dirty(CudaNdarray * self)
int
CudaNdarray_EqualAndIgnore(CudaNdarray *cnda1, CudaNdarray *cnda2, int ignoreSync, int ignoreBase)
{
int verbose = 1;
int verbose = 0;
if (!ignoreSync && cnda1->dev_structure_fresh != cnda2->dev_structure_fresh)
{
......
......@@ -82,17 +82,24 @@ class InputToGpuOptimizer(Optimizer):
def apply(self, env):
for input in env.inputs:
if not isinstance(input.type, CudaNdarrayType):
try:
new_input = host_from_gpu(gpu_from_host(input))
if isinstance(input.type, CudaNdarrayType):
return
if new_input.type == input.type:
env.replace_validate(input, new_input,
"InputToGpuOptimizer")
except TypeError, e:
#as we currently only support float32, this can fail.
#Using try except make that we won't need
pass
# This happens frequently as we do 2 passes of the gpu optimizations
if (len(input.clients) == 1 and
input.clients[0][0].op == gpu_from_host):
return
try:
new_input = host_from_gpu(gpu_from_host(input))
if new_input.type == input.type:
env.replace_validate(input, new_input,
"InputToGpuOptimizer")
except TypeError, e:
#as we currently only support float32, this can fail.
#Using try except make that we won't need
pass
# we register it before all other gpu optimizer to be sure that the input
# are on the gpu.
......@@ -753,11 +760,11 @@ def local_gpu_advanced_incsubtensor1(node):
warnings.warn(
'Although your current code is fine, please note that '
'Theano versions prior to 0.6 (more specifically, '
'prior to commit XXXX on DATE) may have '
'prior to commit d2240bddd on March 29, 2012) may have '
'yielded an incorrect result. To remove this warning, '
'either set the `warn.gpu_set_subtensor1` config '
'option to False, or `warn.ignore_bug_before` to at '
'least \'0.6\'.')
'least \'0.6\'.', stacklevel=1)
if set_instead_of_inc:
return
......@@ -787,7 +794,7 @@ def local_gpu_advanced_incsubtensor1(node):
warnings.warn(
'Although your current code is fine, please note that '
'Theano versions prior to 0.6 (more specifically, '
'prior to commit XXXX on DATE) may have '
'prior to commit d2240bddd on March 29, 2012) may have '
'yielded an incorrect result. To remove this warning, '
'either set the `warn.gpu_set_subtensor1` config '
'option to False, or `warn.ignore_bug_before` to at '
......
......@@ -2100,23 +2100,20 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
return super(T_subtensor, self).__init__(name)
def function(self, inputs, outputs, accept_inplace=False,
op=None, mode=None, N=1):
op=None, mode=None, N=1, N_fast=None):
""" wrapper around theano.function that also check the output
:param N: the number of op expected in the toposort
if tuple of length 2, (expected if fast_compile,
if not fast_compile)
"""
if isinstance(N, tuple):
assert len(N) == 2
if self.fast_compile:
N = N[0]
else:
N = N[1]
if self.fast_compile and N_fast is not None:
N = N_fast
if mode is None:
mode = self.mode
if op is None:
op = self.sub
f = theano.function(inputs, outputs, mode=mode,
accept_inplace=accept_inplace)
self.assertFunctionContainsClassN(f, op, N)
......@@ -2694,7 +2691,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
if idx is idxs[0]:
f = self.function([], [gn.shape, n[idx_].shape],
op=ops,
N=(2, 0))
N=0, N_fast=2)
f()
def test_wrong_exception_regression(self):
......@@ -2747,7 +2744,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
data = numpy.asarray(data, dtype=self.dtype)
n = self.shared(data)
t = n[idx]
f = self.function([], t.shape, op=self.ops, N=(1, 0))
f = self.function([], t.shape, op=self.ops, N=0, N_fast=1)
val = f()
self.assertTrue(numpy.allclose(val, data[idx].shape))
......@@ -2850,6 +2847,8 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
data_copy[idx] = inc_num
else:
data_copy[idx] += inc_num
data_var = theano.In(data_var, mutable=True)
# Remember data for the Theano function (see below).
all_inputs_var += [data_var, idx_var, inc_var]
all_inputs_num += [data_num, idx_num, inc_num]
......@@ -2869,9 +2868,16 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
assert (data_num == data_num_init).all()
# Actual test (we compile a single Theano function to make it faster).
f = self.function(all_inputs_var, all_outputs_var,
accept_inplace=True, op=self.adv_incsub1,
N=len(all_outputs_var))
orig_warn = theano.config.warn.gpu_set_subtensor1
try:
theano.config.warn.gpu_set_subtensor1 = False
f = self.function(all_inputs_var, all_outputs_var,
accept_inplace=True,
op=self.adv_incsub1,
N=len(all_outputs_var))
finally:
theano.config.warn.gpu_set_subtensor1 = orig_warn
f_outs = f(*all_inputs_num)
assert len(f_outs) == len(all_outputs_num)
for f_out, output_num in izip(f_outs, all_outputs_num):
......
......@@ -93,7 +93,8 @@ class TestOptimizationMixin(object):
def assertFunctionContains(self, f, op, min=1, max=sys.maxint):
toposort = f.maker.env.toposort()
matches = [node for node in toposort if node.op == op]
assert (min <= len(matches) <= max), (toposort, matches, str(op), min, max)
assert (min <= len(matches) <= max), (toposort, matches,
str(op), len(matches), min, max)
def assertFunctionContains0(self, f, op):
return self.assertFunctionContains(f, op, min=0, max=0)
......@@ -104,6 +105,15 @@ class TestOptimizationMixin(object):
def assertFunctionContainsN(self, f, op, N):
return self.assertFunctionContains(f, op, min=N, max=N)
def assertFunctionContainsClass(self, f, op, min=1, max=sys.maxint):
toposort = f.maker.env.toposort()
matches = [node for node in toposort if isinstance(node.op, op)]
assert (min <= len(matches) <= max), (toposort, matches,
str(op), len(matches), min, max)
def assertFunctionContainsClassN(self, f, op, N):
return self.assertFunctionContainsClass(f, op, min=N, max=N)
def SkipTest(self, msg='Skip this test'):
raise SkipTest(msg)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论