提交 85db6f61 authored 作者: lamblin's avatar lamblin

Merge pull request #1442 from nouiz/mixed2

[WIP] Use the new grad interface.
......@@ -124,6 +124,27 @@ Do like in the section "Updating Theano", but use
.. _install_ubuntu_gpu:
Manual OpenBLAS instructions
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The OpenBLAS included in Ubuntu is limited to 2 threads. If you want
to use more cores at the same time, you will need to compile it
yourself. Here is some code that will help you.
.. code-block:: bash
# remove openblas if you installed it
sudo apt-get remove libopenblas-base
# Download the development version of OpenBLAS
git clone git://github.com/xianyi/OpenBLAS
cd OpenBLAS
make FC=gfortran
sudo make PREFIX=/usr/local/ install
cd /usr/local/lib
ln -s libopenblas.so /usr/lib/libblas.so
ln -s libopenblas.so.0 /usr/lib/libblas.so.3gf
Contributed GPU instruction
~~~~~~~~~~~~~~~~~~~~~~~~~~~
......
差异被折叠。
......@@ -41,7 +41,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
float * sm_data, int sms0, int sms1,
float * am_data, int ams0)
{
const int row = blockIdx.x;
for (int row = blockIdx.x; row < M; row += gridDim.x){
const float * x = x_data + xs0 * row;
const int y_idx = (int)y_idx_data[row * y_idxs0];
......@@ -83,6 +83,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
+ log(sum);
}
am_data[row*ams0] = row_max_j;
}
}
"""
......@@ -168,7 +169,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
}
}
{
int n_blocks = CudaNdarray_HOST_DIMS(%(sm)s)[0];
int n_blocks = std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],
NUM_VECTOR_OP_BLOCKS);
//TODO: launch more threads per row and do parallel sum and max reductions
int n_threads = 1;
int n_shared_bytes = 0; //n_threads * sizeof(float);
......@@ -195,8 +197,11 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
if (cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %(classname)s %(nodename)s: %%s.\\n",
cudaGetErrorString(err));
"Cuda error: %(classname)s %(nodename)s: %%s.\\n"
"The kernel was launched with %%d threads,"
" %%d blocks and %%d shared memory\\n",
cudaGetErrorString(err),
n_threads, n_blocks, n_shared_bytes);
// no need to decref output vars the cleanup code will do it
%(fail)s;
}
......@@ -206,7 +211,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
def c_code_cache_version(self):
#return ()
return (3,)
return (4,)
gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias()
......@@ -235,7 +240,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
def c_code_cache_version(self):
#return ()
return (5,)
return (6,)
def c_code(self, node, nodename, inp, out, sub):
dnll, sm, y_idx = inp
......@@ -283,11 +288,12 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
}
}
{
int n_blocks = std::min(CudaNdarray_HOST_DIMS(%(dx)s)[0],
NUM_VECTOR_OP_BLOCKS);
int n_threads = std::min(CudaNdarray_HOST_DIMS(%(dx)s)[1],256);
kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s
<<<
CudaNdarray_HOST_DIMS(%(dx)s)[0],
std::min(CudaNdarray_HOST_DIMS(%(dx)s)[1],256)
>>>(
<<<n_blocks, n_threads>>>(
CudaNdarray_HOST_DIMS(%(dx)s)[0],
CudaNdarray_HOST_DIMS(%(dx)s)[1],
......@@ -310,9 +316,11 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s.\\n",
"Cuda error: %%s: %%s.\\n"
"The kernel was launched with %%d threads and"
" %%d blocks\\n",
"kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s",
cudaGetErrorString(err));
cudaGetErrorString(err), n_threads, n_blocks);
%(fail)s;
}
}
......
......@@ -25,7 +25,6 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
This is a basic test for GpuCrossentropySoftmaxArgmax1HotWithBias
We check that we loop when there are too many threads
TODO: check that we loop when there are too many blocks (>32*1024)
"""
......@@ -100,13 +99,16 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx():
This is a basic test for GpuCrossentropySoftmax1HotWithBiasDx
We check that we loop when there are too many threads
TODO: check that we loop when there are too many blocks (>32*1024)
"""
n_in = 1000
batch_size = 4097
n_out = 1250
if not isinstance(mode_with_gpu, theano.compile.DebugMode):
n_in = 4098
n_out = 4099
# Seed numpy.random with config.unittests.rseed
utt.seed_rng()
......
......@@ -715,10 +715,9 @@ class ExtractDiag(Op):
implemented our own. """
x, = ins
z, = outs
# zero-dimensional matrices ...
if x.shape[0] == 0 or x.shape[1] == 0:
z[0] = numpy.zeros(0, dtype=x.dtype)
z[0] = node.outputs[0].type.value_zeros((0,))
return
if x.shape[0] < x.shape[1]:
......
......@@ -204,8 +204,8 @@ def test_rop_lop():
rop_f = function([mx, mv], yv)
sy, _ = theano.scan(lambda i, y, x, v: (tensor.grad(y[i], x) * v).sum(),
sequences=tensor.arange(y.shape[0]),
non_sequences=[y, mx, mv])
sequences=tensor.arange(y.shape[0]),
non_sequences=[y, mx, mv])
scan_f = function([mx, mv], sy)
rng = numpy.random.RandomState(utt.fetch_seed())
......@@ -561,6 +561,7 @@ class test_Eigh(test_Eig):
class test_Eigh_float32(test_Eigh):
dtype = 'float32'
def test_matrix_inverse_solve():
if not imported_scipy:
raise SkipTest("Scipy needed for the Solve op.")
......
......@@ -144,7 +144,17 @@ class ArgSortOp(theano.Op):
def grad(self, inputs, output_grads):
        #No grad defined for integers.
return [None, None]
inp, axis = inputs
inp_grad = theano.gradient.grad_not_implemented(
self, 0, axis,
"I'm not sure if argsort should have its gradient"
" implemented or is should be marked as undefined."
" So I mark it as not implemented for now.")
axis_grad = theano.gradient.grad_undefined(
self, 1, axis,
"argsort is not defined for non-integer axes so"
" argsort(x, axis+eps) is undefined")
return [inp_grad, axis_grad]
"""
def R_op(self, inputs, eval_points):
# R_op can receive None as eval_points.
......
......@@ -185,7 +185,9 @@ def run(stdout, stderr, argv, theano_nose, batch_size, time_profile,
subprocess_extra_args.update(dict(
stdout=dummy_out.fileno(),
stderr=dummy_out.fileno()))
t0 = time.time()
subprocess.call(cmd, **subprocess_extra_args)
t1 = time.time()
# Recover failed test indices from the 'failed' field of the
# '.noseids' file. We need to do it after each batch because
# otherwise this field may get erased. We use a set because it
......@@ -193,8 +195,8 @@ def run(stdout, stderr, argv, theano_nose, batch_size, time_profile,
# to avoid duplicates.
failed = failed.union(cPickle.load(open(noseids_file, 'rb'))
['failed'])
print '%s%% done (failed: %s)' % ((test_range[-1] * 100) //
n_tests, len(failed))
print '%s%% done in %.3fs (failed: %s)' % (
(test_range[-1] * 100) // n_tests, t1 - t0, len(failed))
# Sort for cosmetic purpose only.
failed = sorted(failed)
if failed:
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论