testgroup / pytensor · Commits

Commit 8deed652
Authored Dec 02, 2014 by abergeron

Merge pull request #2314 from nouiz/mixed

Fix test, compile less, doc, add tests.

Parents: dce45cf8, 96a7b3cb

Showing 10 changed files with 84 additions and 3 deletions (+84 −3):
  doc/extending/cop.txt                                 +14  −0
  doc/library/compile/function.txt                       +2  −0
  doc/library/index.txt                                  +5  −0
  doc/library/tensor/basic.txt                           +1  −0
  doc/library/tensor/nnet/conv.txt                       +1  −1
  theano/sandbox/cuda/tests/test_basic_ops.py           +16  −0
  theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py   +14  −1
  theano/tensor/elemwise.py                              +7  −0
  theano/tensor/opt.py                                  +13  −1
  theano/tensor/tests/test_opt.py                       +11  −0
doc/extending/cop.txt
@@ -184,6 +184,20 @@ There are less methods to define for an Op than for a Type:

   Overrides :meth:`c_code_cache_version` if defined, but
   otherwise has the same contract.

.. method:: python_constant_folding(node)

   Optional. If present, this method is called before constant folding
   a node, with that node as a parameter. If it returns True, we will
   not generate C code when constant folding this node. This is useful
   when compiling the C code takes longer than the computation in
   Python (e.g. an Elemwise of scalars).

   In addition, this allows lowering the number of compiled modules
   and disk accesses, which is particularly useful when the file
   system load is high or when the Theano compilation directory is
   shared by many processes (as on a network file server on a cluster).

The ``name`` argument is currently given an invalid value, so steer
away from it. As was the case with Type, ``sub['fail']`` provides
failure code that you *must* use if you want to raise an exception,
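For illustration, here is a minimal sketch (not part of this commit) of a custom Op implementing the new hook; the ScalarDouble Op is hypothetical:

    import theano
    from theano.gof import Op, Apply

    class ScalarDouble(Op):
        # Hypothetical example Op: doubles its input in Python.
        def make_node(self, x):
            x = theano.tensor.as_tensor_variable(x)
            return Apply(self, [x], [x.type()])

        def perform(self, node, inputs, output_storage):
            output_storage[0][0] = 2 * inputs[0]

        def python_constant_folding(self, node):
            # Fold scalar nodes in Python: running perform() is cheaper
            # than compiling a C module for a single 0-d value.
            return node.outputs[0].ndim == 0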
doc/library/compile/function.txt
@@ -187,3 +187,5 @@ Reference

    Replacements specified with givens are different from optimizations in
    that Var2 is not expected to be equivalent to Var1.

.. autofunction:: theano.compile.function.function_dump
doc/library/index.txt
@@ -36,6 +36,11 @@ There are also some top-level imports that you might find more convenient:

   Alias for :func:`function.function`

.. function:: function_dump(...)

   Alias for :func:`theano.compile.function.function_dump`

.. function:: shared(...)

   Alias for :func:`shared.shared`
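A usage sketch for the newly documented alias, under the assumption that function_dump accepts a dump filename followed by the usual theano.function arguments (the filename "func.dump" is arbitrary):

    import theano
    import theano.tensor as T

    x = T.vector('x')
    # Records the arguments that would be passed to theano.function into
    # "func.dump", so a compilation problem can be reproduced elsewhere.
    theano.compile.function.function_dump("func.dump", [x], 2 * x)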
doc/library/tensor/basic.txt
@@ -474,6 +474,7 @@ TensorVariable

       * (2, 0, 1) -> AxBxC to CxAxB
       * (0, 'x', 1) -> AxB to Ax1xB
       * (1, 'x', 0) -> AxB to Bx1xA
       * (1,) -> This removes dimension 0. It must be a broadcastable
         dimension (1xA to A).

   .. method:: flatten(ndim=1)
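A short sketch of the dimshuffle patterns listed above, including the newly documented one-element pattern:

    import theano.tensor as T

    m = T.matrix('m')        # AxB
    m.dimshuffle(0, 'x', 1)  # AxB -> Ax1xB
    m.dimshuffle(1, 'x', 0)  # AxB -> Bx1xA

    r = T.row('r')           # 1xA, with dimension 0 broadcastable
    r.dimshuffle(1)          # 1xA -> A: drops the broadcastable dimension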
doc/library/tensor/nnet/conv.txt
@@ -25,7 +25,7 @@

 .. note::

     As of October 21st, 2014, the default GPU image convolution
-    changed: By default, if :ref:`cuDNN <_libdoc_cuda_dnn>`
+    changed: By default, if :ref:`cuDNN <libdoc_cuda_dnn>`
     is available, we will use it, otherwise we will fall back to using the
     gemm version (slower than cuDNN in most cases, uses more memory, but
     faster than the legacy version we used before).
theano/sandbox/cuda/tests/test_basic_ops.py
@@ -129,6 +129,22 @@ def test_careduce():

            ((4100, 4, 3, 2), [1]), ((4, 4100, 3, 2), [1]),
            ((4, 3, 4100, 2), [1]), ((4, 3, 2, 4100), [1]),  # 0100
            ((4100, 4, 3, 2), [2]), ((4, 4100, 3, 2), [2]),
            ((4, 3, 4100, 2), [2]), ((4, 3, 2, 4100), [2]),  # 0010
            ((4100, 4, 3, 2), [3]), ((4, 4100, 3, 2), [3]),
            ((4, 3, 4100, 2), [3]), ((4, 3, 2, 4100), [3]),  # 0001
            # reduce over 2d
            ((4100, 4, 3, 2), [1, 2]), ((4, 4100, 3, 2), [1, 2]),
            ((4, 3, 4100, 2), [1, 2]), ((4, 3, 2, 4100), [1, 2]),  # 0110
            # ((4100,4,3,2),[0,3]),((4,4100,3,2),[0,3]),((4,3,4100,2),[0,3]),((4,3,2,4100),[0,3]),#1001 need 101
            # ((4100,4,3,2),[0,2]),((4,4100,3,2),[0,2]),((4,3,4100,2),[0,2]),((4,3,2,4100),[0,2]),#1010 not implemented
            ((4100, 4, 3, 2), [0, 1]), ((4, 4100, 3, 2), [0, 1]),
            ((4, 3, 4100, 2), [0, 1]), ((4, 3, 2, 4100), [0, 1]),  # 1100
            # reduce over 3d
            # 3d not tested: 1101, 1110, 1111
            # ((4100,4,3,2),[0,1,3]),((4,4100,3,2),[0,1,3]),((4,3,4100,2),[0,1,3]),((4,3,2,4100),[0,1,3]),#1101 need 101
            ((4100, 4, 3, 2), [0, 1, 2]), ((4, 4100, 3, 2), [0, 1, 2]),
            ((4, 3, 4100, 2), [0, 1, 2]), ((4, 3, 2, 4100), [0, 1, 2]),  # 1110
            # reduce over 4d
            ((4100, 4, 3, 2), [0, 1, 2, 3]), ((4, 4100, 3, 2), [0, 1, 2, 3]),
            ((4, 3, 4100, 2), [0, 1, 2, 3]), ((4, 3, 2, 4100), [0, 1, 2, 3]),  # 1111
            # reduce over 5d
            ((1100, 2, 3, 4, 5), [0, 1, 2, 3, 4]), ((2, 1100, 3, 4, 5), [0, 1, 2, 3, 4]),
            ((2, 3, 1100, 4, 5), [0, 1, 2, 3, 4]), ((2, 3, 4, 1100, 5), [0, 1, 2, 3, 4]),
            ((2, 3, 4, 5, 1100), [0, 1, 2, 3, 4]),  # 11111
        ]:
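The trailing comments encode which axes are reduced, one bit per dimension. A small numpy sketch (not part of the test) of that convention, using a hypothetical mask helper:

    import numpy

    def mask(ndim, axes):
        # One bit per dimension, 1 where the axis is reduced,
        # e.g. ndim=4, axes=[1, 2] -> '0110'.
        return ''.join('1' if d in axes else '0' for d in range(ndim))

    assert mask(4, [1, 2]) == '0110'
    a = numpy.random.rand(4, 3, 2, 5)
    # Reducing over axes [1, 2] leaves shape (4, 5).
    assert a.sum(axis=(1, 2)).shape == (4, 5)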
theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
@@ -907,24 +907,37 @@ def gemm_directly(bs, ch, nf, rImg1, rImg2, rFlt1, rFlt2, subsx, subsy,

    npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32')
    npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32')

    if direction == 'fprop':
        i = cuda.CudaNdarrayType(
            broadcastable=[sh == 1 for sh in npy_img.shape])()
        k = cuda.CudaNdarrayType(
            broadcastable=[sh == 1 for sh in npy_kern.shape])()

        cpuval = py_conv(npy_img, npy_kern, 'valid', subsample)
        op = theano.sandbox.cuda.blas.GpuCorrMM(border_mode='valid',
                                                subsample=subsample)(i, k)
        f = theano.function([i, k], op, mode=theano_mode)
        gpuval = f(npy_img, npy_kern[:, :, ::-1, ::-1])
    elif direction == 'bprop img':
        i = cuda.CudaNdarrayType(
            broadcastable=[sh == 1 for sh in
                           npy_kern.transpose(1, 0, 2, 3).shape])()
        k = cuda.CudaNdarrayType(
            broadcastable=[sh == 1 for sh in npy_img.shape])()

        cpuval = py_conv(npy_img, npy_kern, 'full', subsample)
        op = theano.sandbox.cuda.blas.GpuCorrMM_gradInputs(
            border_mode='valid', subsample=subsample)(i, k)
        f = theano.function([i, k], op, mode=theano_mode)
        gpuval = f(npy_kern.transpose(1, 0, 2, 3), npy_img)
    elif direction == 'bprop kern':
        i = cuda.CudaNdarrayType(
            broadcastable=[sh == 1 for sh in
                           npy_img.transpose(1, 0, 2, 3).shape])()
        k = cuda.CudaNdarrayType(
            broadcastable=[sh == 1 for sh in
                           npy_kern.transpose(1, 0, 2, 3).shape])()

        cpuval = py_conv(npy_img, npy_kern, 'valid', subsample)
        op = theano.sandbox.cuda.blas.GpuCorrMM_gradWeights(
            border_mode='valid', subsample=subsample)(i, k)
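The npy_kern[:, :, ::-1, ::-1] flip above relies on the identity that correlation with a spatially flipped kernel equals convolution. A numpy/SciPy sketch of that identity (assuming SciPy is available; not part of the test):

    import numpy
    from scipy.signal import convolve2d, correlate2d

    img = numpy.random.rand(5, 5)
    kern = numpy.random.rand(3, 3)
    # GpuCorrMM computes a correlation, so feeding it a kernel flipped
    # along both spatial axes yields the convolution result.
    assert numpy.allclose(convolve2d(img, kern, mode='valid'),
                          correlate2d(img, kern[::-1, ::-1], mode='valid'))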
theano/tensor/elemwise.py
@@ -1193,6 +1193,13 @@ class Elemwise(OpenMPOp):

        else:
            return ()

    def python_constant_folding(self, node):
        """
        Return True if we do not want to compile c code
        when doing constant folding of this node.
        """
        return node.outputs[0].ndim == 0

# def elemwise_to_scal(fgraph):
#    TODO: why is this commented out? should it be removed?
#    it has needed maintenance despite being commented
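A sketch of the new hook in action, assuming a Theano build that includes this change:

    import theano.tensor as T

    x = T.constant(3)
    y = x * 2            # Elemwise node with a 0-d output
    node = y.owner
    # Elemwise now asks for Python-side constant folding of scalar nodes.
    assert node.op.python_constant_folding(node)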
theano/tensor/opt.py
@@ -1605,7 +1605,7 @@ compile.optdb['specialize'].register('local_remove_all_assert',

                                      local_remove_all_assert,
                                      use_db_name_as_tag=False)

-@register_specialize
+@register_specialize("local_alloc_elemwise")
 @gof.local_optimizer([T.Elemwise])
 def local_elemwise_alloc(node):
     """

@@ -4508,7 +4508,19 @@ def constant_folding(node):

    for o in node.outputs:
        storage_map[o] = [None]
        compute_map[o] = [False]
    if (hasattr(node.op, 'python_constant_folding') and
            node.op.python_constant_folding(node)):
        old_value = getattr(node.op, '_op_use_c_code', False)
        try:
            node.op._op_use_c_code = False
            thunk = node.op.make_thunk(node, storage_map, compute_map, [])
        finally:
            node.op._op_use_c_code = old_value
    else:
        thunk = node.op.make_thunk(node, storage_map, compute_map,
                                   no_recycling=[])
theano/tensor/tests/test_opt.py
@@ -3671,6 +3671,17 @@ def test_constant_folding():

    topo = f.maker.fgraph.toposort()
    assert len(topo) == 2

    # Test that we do not crash when constant folding elemwise scalar
    # as they should not generate c code.
    x = tensor.constant(3)
    assert x.ndim == 0
    mode = theano.compile.get_mode("FAST_COMPILE").excluding("fusion")
    f = theano.function([], [x * 2, x + x], mode=mode)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 2
    assert all([isinstance(n.op, DeepCopyOp) for n in topo])


def test_constant_get_stabilized():
    """