Commit 2e7d332f authored by Pascal Lamblin

Merge pull request #1538 from nouiz/doc

Better error handling and buildbot
......@@ -123,10 +123,11 @@ The user ``lisa`` runs a cronjob on the computer ``ceylon``, this
happens nightly. (To have the crontab executed, the ``lisa`` user must
be logged into ``ceylon``, Fred leaves a shell open for that.)
The cronjob executes the scripts
``~/nightly_build/do_nightly_build_{theano,pylearn,deeplearning}``.
These scripts perform an update of theano (and pylearn, and
DeepLearningTutorials too), and execute theano-nose (in various settings).
The cronjob executes a script that downloads/updates the repos of Theano,
Pylearn, Pylearn2 and the Deep Learning Tutorials, then runs their test
scripts under ``*/misc/do_nightly_build``. Those scripts test each
project under various conditions. The cron job also runs some tests under
Python 2.4 and Python 3.3 for Theano.
The output is emailed automatically to the `theano-buildbot`_ mailing list.
......
......@@ -808,7 +808,7 @@ class ProfileStats(object):
elif self.fct_callcount > 0:
print >> file, (" No execution time accumulated "
"(hint: try config profiling.time_thunks=1)")
if config.profile_memory:
if self.variable_shape or self.variable_strides:
self.summary_memory(file, n_apply_to_print)
if self.optimizer_profile:
print "Optimizer Profile"
......
......@@ -201,31 +201,31 @@ if __name__ == "__main__":
Test time in float32
cuda version 5.0 4.2 4.1 4.0 3.2 3.0 # note
cuda version 5.5 5.0 4.2 4.1 4.0 3.2 3.0 # note
gpu
K20m/ECC 0.07s
K20/NOECC 0.07s
M2070 0.25s 0.27s 0.32s
M2050(Amazon) 0.25s
C2075 0.25s
C1060 0.46s
GTX Titan(D15U-50)0.06s don't work
GTX 680 0.12s 0.154s 0.218s
GTX 580 0.16s 0.164s 0.203s
GTX 480 0.19s 0.192s 0.237s 0.27s
GTX 470 0.23s 0.238s 0.297s 0.34s
GTX 660 0.20s 0.23s
GTX 560 0.30s
GTX 650 Ti 0.27s
GTX 460 0.37s 0.45s
GTX 285 0.452s 0.452s 0.40s # cuda 3.0 seems faster? driver version?
GTX 550 Ti 0.57s
GT 520 2.68s 3.06s
520M 2.44s 3.19s # with bumblebee on Ubuntu 12.04
GT 220 3.80s
GT 210 6.35s
8500 GT 10.68s
K20m/ECC 0.07s
K20/NOECC 0.07s
M2070 0.25s 0.27s 0.32s
M2050(Amazon) 0.25s
C2075 0.25s
C1060 0.46s
GTX Titan(D15U-50)0.06s 0.06s don't work
GTX 680 0.12s 0.154s 0.218s
GTX 580 0.16s 0.16s 0.164s 0.203s
GTX 480 0.19s 0.19s 0.192s 0.237s 0.27s
GTX 470 0.23s 0.23s 0.238s 0.297s 0.34s
GTX 660 0.18s 0.20s 0.23s
GTX 560 0.30s
GTX 650 Ti 0.27s
GTX 460 0.37s 0.45s
GTX 285 0.42s 0.452s 0.452s 0.40s # cuda 3.0 seems faster? driver version?
GTX 550 Ti 0.57s
GT 520 2.68s 3.06s
520M 2.44s 3.19s # with bumblebee on Ubuntu 12.04
GT 220 3.80s
GT 210 6.35s
8500 GT 10.68s
"""
t, impl = execute(not options.print_only, not options.quiet,
......
......@@ -54,14 +54,6 @@ if [ "$RELEASE" ]; then
echo
fi
# with --batch=1000" # The buildbot freezes sometimes when collecting the tests to run
echo "Executing tests with mode=FAST_COMPILE"
echo "THEANO_FLAGS=${FLAGS},mode=FAST_COMPILE ${NOSETESTS} ${ARGS}"
THEANO_FLAGS=${FLAGS},mode=FAST_COMPILE ${NOSETESTS} ${ARGS}
echo "Number of elements in the compiledir:"
ls ${COMPILEDIR}|wc -l
echo
echo "Executing tests with mode=FAST_RUN"
echo "THEANO_FLAGS=cmodule.warn_no_version=True,${FLAGS},mode=FAST_RUN ${NOSETESTS} ${PROFILING} ${ARGS}"
THEANO_FLAGS=cmodule.warn_no_version=True,${FLAGS},mode=FAST_RUN ${NOSETESTS} ${PROFILING} ${ARGS}
......@@ -89,6 +81,16 @@ echo "Executing tests with mode=DEBUG_MODE with seed of the day $seed"
echo "THEANO_FLAGS=${FLAGS},unittests.rseed=$seed,mode=DEBUG_MODE,DebugMode.check_strides=0,DebugMode.patience=3,DebugMode.check_preallocated_output= ${NOSETESTS} ${ARGS}"
THEANO_FLAGS=${FLAGS},unittests.rseed=$seed,mode=DEBUG_MODE,DebugMode.check_strides=0,DebugMode.patience=3,DebugMode.check_preallocated_output= ${NOSETESTS} ${ARGS}
# We put this at the end as it has a tendency to loop infinitely.
# Until we fix the root of the problem, we let the rest run; then we can kill this one in the morning.
# with --batch=1000" # The buildbot freezes sometimes when collecting the tests to run
echo "Executing tests with mode=FAST_COMPILE"
echo "THEANO_FLAGS=${FLAGS},mode=FAST_COMPILE ${NOSETESTS} ${ARGS}"
THEANO_FLAGS=${FLAGS},mode=FAST_COMPILE ${NOSETESTS} ${ARGS}
echo "Number of elements in the compiledir:"
ls ${COMPILEDIR}|wc -l
echo
echo "Number of elements in the compiledir:"
ls ${COMPILEDIR}|wc -l
echo
......
......@@ -179,6 +179,7 @@ if compile_cuda_ndarray and cuda_available:
del compile_cuda_ndarray
if cuda_available:
global cuda_initialization_error_message
# The module should be compiled.
from cuda_ndarray.cuda_ndarray import *
......
......@@ -21,7 +21,8 @@
//If true, we do error checking at the start of functions, to make sure there
//is not a pre-existing error when the function is called.
//You probably need to set the environment variable
//CUDA_LAUNCH_BLOCKING=1
//CUDA_LAUNCH_BLOCKING=1, and/or modify the CNDA_THREAD_SYNC
//preprocessor macro in cuda_ndarray.cuh
//if you want this to work.
#define PRECHECK_ERROR 0
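
The PRECHECK_ERROR machinery above follows a standard CUDA pattern: query the sticky runtime error at function entry so that a failure left behind by an earlier asynchronous launch is not blamed on the current call. A minimal sketch of that pattern, using only the plain CUDA runtime API (illustrative, not code from this commit):

    #include <cstdio>
    #include <cuda_runtime.h>

    // Report (and consume) any CUDA error that was pending before we ran.
    // Note: cudaGetLastError() also resets the sticky error to cudaSuccess.
    static int precheck_cuda_error(const char *where)
    {
        cudaError_t prev = cudaGetLastError();
        if (cudaSuccess != prev)
        {
            fprintf(stderr, "%s: pre-existing CUDA error: %s\n",
                    where, cudaGetErrorString(prev));
            return -1;
        }
        return 0;
    }

With CUDA_LAUNCH_BLOCKING=1 set, kernel launches become synchronous, so the error reported here points at the launch that actually failed rather than at some later call.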
......@@ -69,7 +70,10 @@ void * device_malloc(size_t size, int verbose)
cudaError_t prevError = cudaGetLastError();
if (cudaSuccess != prevError)
{
fprintf(stderr, "Error existed before calling device_malloc.\n");
fprintf(stderr,
"Error existed before calling device_malloc. %s\n",
cudaGetErrorString(prevError)
);
}
#endif
void * rval=NULL;
......@@ -155,7 +159,10 @@ int device_free(void *ptr)
cudaError_t prevError = cudaGetLastError();
if (cudaSuccess != prevError)
{
fprintf(stderr, "Error existed before calling device_free.\n");
fprintf(stderr,
"Error existed before calling device_free. %s\n",
cudaGetErrorString(prevError)
);
}
#endif
#if PRINT_FREE_MALLOC
......@@ -232,6 +239,14 @@ int device_free(void *ptr)
ptr,
cudaGetErrorString(err), free, total);
#endif
if (NULL != PyErr_Occurred()){
fprintf(stderr,
"device_free: cudaFree() returned an error, but a Python error"
" was already set. This happens during clean-up when a first error"
" left the CUDA driver in such a bad state that it no longer works."
" We keep the previous error set to help debug it.");
return -1;
}
PyErr_Format(PyExc_MemoryError,
"error freeing device pointer %p (%s)",
ptr,
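
The guard added above encodes a general rule for clean-up paths in CPython extensions: never overwrite an exception that is already propagating, since the first error is usually the informative one. A hedged sketch of that rule as a reusable helper (the helper name is invented for illustration, not part of this commit):

    #include <Python.h>
    #include <stdio.h>

    // Report a clean-up failure without clobbering an in-flight exception.
    static int report_cleanup_error(const char *msg)
    {
        if (NULL != PyErr_Occurred())
        {
            // An exception is already set: log to stderr only, so the
            // original, more informative error keeps propagating.
            fprintf(stderr, "%s (previous Python error kept)\n", msg);
            return -1;
        }
        PyErr_SetString(PyExc_RuntimeError, msg);
        return -1;
    }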
......@@ -3878,6 +3893,22 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
return -1;
}
#if PRECHECK_ERROR
cublasStatus prevErr = cublasGetError();
if (CUBLAS_STATUS_SUCCESS != prevErr)
{
//I don't know why, but we need to clear the cuda error too.
//Otherwise, the clean up before raising the Python error causes an error
//too, so we never see this Python error.
fprintf(stderr,
"CudaNdarray_sgemm: Prev cublas error %s",
cublasGetErrorString(prevErr));
PyErr_Format(PyExc_RuntimeError,
"CudaNdarray_sgemm: Prev cublas error %s",
cublasGetErrorString(prevErr));
return -1;
}
#endif
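
The comment in the block above notes that the sticky CUDA runtime error has to be cleared as well, or the clean-up triggered while raising the Python error fails in turn. A one-function sketch of the usual way to do that (an assumption for illustration, not code from this commit):

    #include <cuda_runtime.h>

    // Reading the sticky CUDA runtime error also resets it to cudaSuccess,
    // so later clean-up code starts from a clean slate.
    static void clear_cuda_error(void)
    {
        (void) cudaGetLastError();
    }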
// We must allow dimensions to be zeros.
if ((CudaNdarray_HOST_DIMS(A)[1] != CudaNdarray_HOST_DIMS(B)[0])
|| (CudaNdarray_HOST_DIMS(A)[0] != CudaNdarray_HOST_DIMS(C)[0])
......@@ -4035,8 +4066,14 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
if (CUBLAS_STATUS_SUCCESS != err)
{
PyErr_Format(PyExc_RuntimeError,
"cublasSgemm failed (%i)",
err);
"cublasSgemm failed (%i) %s\n"
" unit=%x N=%d, c.dims=[%d %d], a.dim=[%d %d], alpha=%f, beta=%f, a=%f, b=%f, c=%f"
" sa_0=%d, sa_1=%d, sb_0=%d, sb_1=%d, sc_0=%d, sc_1=%d",
err, cublasGetErrorString(err),
unit, N, CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(C)[1],
CudaNdarray_HOST_DIMS(A)[0], CudaNdarray_HOST_DIMS(A)[1],
alpha, beta, a, b, c, sa_0, sa_1, sb_0, sb_1, sc_0, sc_1);
return -1;
}
return 0;
......
......@@ -386,8 +386,6 @@ static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const i
CudaNdarray_set_nd(self, -1);
self->data_allocated = 0;
self->devdata = 0;
PyErr_SetString(PyExc_RuntimeError,
"Could not allocate memory on device");
return -1;
}
if (0)
......@@ -530,6 +528,25 @@ DllExport int CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_othe
DllExport int CudaNdarray_prep_output(CudaNdarray ** arr, int nd,
const int * dims);
DllExport const char* cublasGetErrorString(cublasStatus err){
if(CUBLAS_STATUS_SUCCESS == err)
return "success";
else if(CUBLAS_STATUS_NOT_INITIALIZED == err)
return "the library was not initialized";
else if(CUBLAS_STATUS_ALLOC_FAILED == err)
return "the resource allocation failed";
else if(CUBLAS_STATUS_INVALID_VALUE == err)
return "the parameters n<0 or incx,incy=0";
else if(CUBLAS_STATUS_MAPPING_ERROR == err)
return "an access to GPU memory space failed";
else if(CUBLAS_STATUS_EXECUTION_FAILED == err)
return "the function failed to launch on the GPU";
else if(CUBLAS_STATUS_INTERNAL_ERROR == err)
return "an internal operation failed";
else
return "unknow code";
}
#endif
/*
Local Variables:
......
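
For reference, a sketch of how the cublasGetErrorString helper added above pairs with the cuBLAS v1 deferred-error API that this file uses; the wrapper function is illustrative, not part of the commit:

    #include <cstdio>
    #include <cublas.h>  // legacy cuBLAS v1 API, as used by cuda_ndarray.cu

    extern const char* cublasGetErrorString(cublasStatus err);  // from this commit

    // Query the status deferred by the last cuBLAS call and make it readable.
    static int check_cublas_status(const char *op)
    {
        cublasStatus err = cublasGetError();
        if (CUBLAS_STATUS_SUCCESS != err)
        {
            fprintf(stderr, "%s failed: %s\n", op, cublasGetErrorString(err));
            return -1;
        }
        return 0;
    }

It would be called right after a cuBLAS call such as cublasSgemm(...), which is the pattern CudaNdarray_gemm follows above.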