Merge pull request #2153 from nouiz/cuda_tests

[BUILDBOT CRASH] Fix test in FAST_COMPILE.

Merge pull request #2153 from nouiz/cuda_tests
9da67d20 · abergeron · 7d286d04 · 86814d96 · 9da67d20 · 9da67d20
--- a/doc/tutorial/profiling.txt
+++ b/doc/tutorial/profiling.txt
@@ -65,6 +65,20 @@ Here is an example output when we disable some Theano optimizations to
 give you a better idea of the difference between sections. With all
 optimizations enabled, there would be only one op left in the graph.
+.. note::
+    To profile the peak memory usage on the GPU you need to do::
+        * In the file theano/sandbox/cuda/cuda_ndarray.cu, set the macro
+          COMPUTE_GPU_MEM_USED to 1.
+        * Then call theano.sandbox.cuda.theano_allocated().
+          It returns a tuple with two ints. The first is the GPU memory
+          currently allocated by Theano. The second is the peak GPU memory
+          that was allocated by Theano.
+    Do not leave this enabled all the time, as it slows down memory
+    allocation and deallocation. Because it slows down the computation, it
+    also skews speed profiling, so don't use both at the same time.
 to run the example:

--- a/theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
+++ b/theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
@@ -563,7 +563,9 @@ def _test_valid(cls, mode=None, extra_shapes=[], version=[-1]):
 def test_valid():
-    for t in _test_valid(None, version=[-2, -1, 6]):
+    for t in _test_valid(None,
+                         mode=theano_mode,
+                         version=[-2, -1, 6]):
        yield t
@@ -648,7 +650,9 @@ def _test_full(cls, mode=None, version=[-1], extra_shapes=[]):
 def test_full():
-    for t in _test_full(None, version=[-2, -1, 0, 1, 2, 3, 4, 5]):
+    for t in _test_full(None,
+                        mode=theano_mode,
+                        version=[-2, -1, 0, 1, 2, 3, 4, 5]):
        yield t

--- a/theano/tensor/nnet/Conv3D.py
+++ b/theano/tensor/nnet/Conv3D.py
@@ -551,7 +551,8 @@ def conv3D(V, W, b, d):
           This is for optimization.
    :note: The GPU implementation is very slow. You should use
-           :func:`conv3d2d <theano.tensor.nnet.conv3d2d.conv3d>` for a
+           :func:`conv3d2d <theano.tensor.nnet.conv3d2d.conv3d>` or
+           :func:`conv3d_fft <theano.sandbox.cuda.fftconv.conv3d_fft>` for a
           GPU graph instead.
    :see: Someone made a script that shows how to swap the axes

--- a/theano/tensor/nnet/conv3d2d.py
+++ b/theano/tensor/nnet/conv3d2d.py
@@ -176,7 +176,8 @@ def conv3d(signals, filters,
    :note: Another way to define signals: (batch,  time, in channel, row, column)
           Another way to define filters: (out channel,time,in channel, row, column)
-    :note: See the `conv3d_fft`_ or `conv3d2d`_ for GPU implementations.
+    :note: For the GPU, you can use this implementation or
+           :func:`conv3d_fft <theano.sandbox.cuda.fftconv.conv3d_fft>`.
    :see: Someone made a script that shows how to swap the axes between
          both 3d convolution implementations in Theano. See the last

--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
@@ -339,7 +339,7 @@ def register_specialize(lopt, *tags, **kwargs):
    else:
        name = (kwargs and kwargs.pop('name')) or lopt.__name__
        compile.optdb['specialize'].register(name, lopt, 'fast_run',
-                                             'fast_compile_gpu', *tags)
+                                             *tags)
        return lopt

--- a/theano/tensor/tests/test_basic.py
+++ b/theano/tensor/tests/test_basic.py
@@ -6430,7 +6430,8 @@ class TestInferShape(utt.InferShapeTester):
        self._compile_and_check([advec, bdvec],
                                [Dot()(advec, bdvec)],
                                [advec_val, bdvec_val],
-                                (Dot, tensor.blas.Gemv, tensor.blas_c.CGemv))
+                                (Dot, tensor.blas.Dot22,
+                                 tensor.blas.Gemv, tensor.blas_c.CGemv))
        #mat/mat
        admat = dmatrix()
@@ -6447,14 +6448,16 @@ class TestInferShape(utt.InferShapeTester):
        self._compile_and_check([advec, bdmat],
                                [Dot()(advec, bdmat)],
                                [advec_val, bdmat_val],
-                                (Dot, tensor.blas.Gemv, tensor.blas_c.CGemv))
+                                (Dot, tensor.blas.Dot22,
+                                 tensor.blas.Gemv, tensor.blas_c.CGemv))
        #mat/vec
        admat_val = rand(5, 4)
        self._compile_and_check([admat, bdvec],
                                [Dot()(admat, bdvec)],
                                [admat_val, bdvec_val],
-                                (Dot, tensor.blas.Gemv, tensor.blas_c.CGemv))
+                                (Dot, tensor.blas.Dot22,
+                                 tensor.blas.Gemv, tensor.blas_c.CGemv))
        # Split
        aivec = ivector()