Merge pull request #1761 from nouiz/mixed

Mixed

Merge pull request #1761 from nouiz/mixed
4927f127 · abergeron · 1973d9b6 · a889f77c · 4927f127 · 4927f127
--- a/theano/gof/compilelock.py
+++ b/theano/gof/compilelock.py
@@ -165,8 +165,12 @@ def lock(tmp_dir, timeout=120, min_wait=5, max_wait=10, verbosity=1):
    my_pid = os.getpid()
    no_display = (verbosity == 0)

-    # Acquire lock.
    nb_error = 0
+    # The number of times we have slept when there was no error.
+    # Used to skip the message on the first wait so it is displayed less
+    # frequently, and so we don't get as many emails about this!
+    nb_wait = 0
+    # Acquire lock.
    while True:
        try:
            last_owner = 'no_owner'
@@ -214,7 +218,7 @@ def lock(tmp_dir, timeout=120, min_wait=5, max_wait=10, verbosity=1):
                    last_owner = read_owner
                    time_start = time.time()
                    no_display = (verbosity == 0)
-                if not no_display:
+                if not no_display and nb_wait > 0:
                    if read_owner == 'failure':
                        msg = 'unknown process'
                    else:
@@ -225,6 +229,7 @@ def lock(tmp_dir, timeout=120, min_wait=5, max_wait=10, verbosity=1):
                                 tmp_dir)
                    if verbosity <= 1:
                        no_display = True
+                nb_wait += 1
                time.sleep(random.uniform(min_wait, max_wait))

            try:

--- a/theano/sandbox/cuda/basic_ops.py
+++ b/theano/sandbox/cuda/basic_ops.py
@@ -1198,7 +1198,11 @@ class GpuCAReduce(GpuOp):
                    n_threads.z += 1;
                else
                    break;
-            }""" % locals()
+            }
+            //Maximum for Fermi GPU on that dimensions.
+            n_threads.z = std::min(n_threads.z, (unsigned)64);
+
+        """ % locals()

        if len(self.reduce_mask) == 2:
            threads_y = ''
@@ -1509,6 +1513,8 @@ class GpuCAReduce(GpuOp):
                n_threads.z += 1;
            }
            n_threads.z -= 1;
+            //Maximum for Fermi GPU on that dimensions.
+            n_threads.z = std::min(n_threads.z, (unsigned)64);

            dim3 n_blocks(1,1,1);
            %(makecall)s

--- a/theano/sandbox/cuda/tests/test_basic_ops.py
+++ b/theano/sandbox/cuda/tests/test_basic_ops.py
@@ -109,11 +109,13 @@ def test_careduce():
                               ((4100,4,3),[1,2]),((5,4100,3),[1,2]),((5,4,4100),[1,2]),#011
                               #((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented
                               ((4100,4,3),[0,1,2]),((5,4100,3),[0,1,2]),((5,4,4100),[0,1,2]),#111
+                               ((65,4,3),[0,1,2]),((5,65,3),[0,1,2]),((5,4,65),[0,1,2]),#111

                               ((4100,4,3,2),[2,3]),((4,4100,3,2),[2,3]),((4,3,4100,2),[2,3]),((4,3,2,4100),[2,3]),#0011
                               ((4100,4,3,2),[1,3]),((4,4100,3,2),[1,3]),((4,3,4100,2),[1,3]),((4,3,2,4100),[1,3]),#0101
                               ((4100,4,3,2),[0,2,3]),((4,4100,3,2),[0,2,3]),((4,3,4100,2),[0,2,3]),#((4,3,2,4100),[0,2,3]),#1011
                               ((4100,4,3,2),[1,2,3]),((4,4100,3,2),[1,2,3]),((4,3,4100,2),[1,2,3]),((4,3,2,4100),[1,2,3]),#0111
+                               ((65,4,3,2),[1,2,3]),((4,65,3,2),[1,2,3]),((4,3,65,2),[1,2,3]),((4,3,2,65),[1,2,3]),#0111
                               ((4100,2,3,4),[0,1,2,3]),((2,4100,3,4),[0,1,2,3]),((2,3,4100,4),[0,1,2,3]),((2,3,4,4100),[0,1,2,3]),((128,1,3,3), [0,1,2,3]),#1111



--- a/theano/sandbox/gpuarray/elemwise.py
+++ b/theano/sandbox/gpuarray/elemwise.py
@@ -1281,7 +1281,10 @@ class GpuCAReduceCuda(HideC, CAReduce):
                    n_threads.z += 1;
                else
                    break;
-            }""" % locals()
+            }
+            //Maximum for Fermi GPU on that dimensions.
+            n_threads.z = std::min(n_threads.z, (unsigned)64);
+        """ % locals()

        if len(self.reduce_mask) == 2:
            threads_y = ''
@@ -1601,6 +1604,8 @@ class GpuCAReduceCuda(HideC, CAReduce):
                n_threads.z += 1;
            }
            n_threads.z -= 1;
+            //Maximum for Fermi GPU on that dimensions.
+            n_threads.z = std::min(n_threads.z, (unsigned)64);

            dim3 n_blocks(1,1,1);
            %(makecall)s

--- a/theano/sandbox/gpuarray/tests/test_elemwise.py
+++ b/theano/sandbox/gpuarray/tests/test_elemwise.py
-import unittest
-
 from theano import scalar, gof
-from theano.gof import FunctionGraph
 from theano.gof.python25 import all, any
-from theano.tests.unittest_tools import SkipTest

 from theano.tensor.tests.test_elemwise import (test_Broadcast, test_DimShuffle,
                                               test_CAReduce)
@@ -126,11 +122,13 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
             ((4100,4,3),[1,2]),((5,4100,3),[1,2]),((5,4,4100),[1,2]),#011
             #((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented
             ((4100,4,3),[0,1,2]),((5,4100,3),[0,1,2]),((5,4,4100),[0,1,2]),#111
+             ((65,4,3),[0,1,2]),((5,65,3),[0,1,2]),((5,4,65),[0,1,2]),#111

             ((4100,4,3,2),[2,3]),((4,4100,3,2),[2,3]),((4,3,4100,2),[2,3]),((4,3,2,4100),[2,3]),#0011
             ((4100,4,3,2),[1,3]),((4,4100,3,2),[1,3]),((4,3,4100,2),[1,3]),((4,3,2,4100),[1,3]),#0101
             ((4100,4,3,2),[0,2,3]),((4,4100,3,2),[0,2,3]),((4,3,4100,2),[0,2,3]),#((4,3,2,4100),[0,2,3]),#1011
             ((4100,4,3,2),[1,2,3]),((4,4100,3,2),[1,2,3]),((4,3,4100,2),[1,2,3]),((4,3,2,4100),[1,2,3]),#0111
+             ((65,4,3,2),[1,2,3]),((4,65,3,2),[1,2,3]),((4,3,65,2),[1,2,3]),((4,3,2,65),[1,2,3]),#0111
             ((4100,2,3,4),[0,1,2,3]),((2,4100,3,4),[0,1,2,3]),((2,3,4100,4),[0,1,2,3]),((2,3,4,4100),[0,1,2,3]),((128,1,3,3), [0,1,2,3]),#1111

             #test pattern implemented by reshape

--- a/theano/sandbox/rng_mrg.py
+++ b/theano/sandbox/rng_mrg.py
@@ -26,7 +26,9 @@ if cuda_available:
    from theano.sandbox.cuda import (CudaNdarrayType,
                                     float32_shared_constructor)

+
 def matVecModM(A, s, m):
+    assert A.dtype == 'int64'
    return numpy.int32(numpy.sum((A*s) % m, 1) % m)


@@ -53,24 +55,30 @@ MASK2 = numpy.int32(65535)      #2^16 - 1
 MULT2 = numpy.int32(21069)
 NORM = 4.656612873077392578125e-10; #1./2^31

-A1p0 = numpy.asarray([[0, 4194304, 129], [1, 0, 0], [0, 1, 0]])
-A2p0 = numpy.asarray([[32768, 0, 32769], [1, 0, 0], [0, 1, 0]])
+#A1p0 = numpy.asarray([[0, 4194304, 129], [1, 0, 0], [0, 1, 0]],
+#                      dtype='int64')
+#A2p0 = numpy.asarray([[32768, 0, 32769], [1, 0, 0], [0, 1, 0]],
+#                      dtype='int64')

 A1p72 = numpy.asarray([[1516919229, 758510237, 499121365],
                       [1884998244, 1516919229, 335398200],
-                       [601897748, 1884998244, 358115744]])
+                       [601897748, 1884998244, 358115744]],
+                      dtype='int64')
 A2p72 = numpy.asarray([[1228857673, 1496414766, 954677935],
                       [1133297478, 1407477216, 1496414766],
-                       [2002613992, 1639496704, 1407477216]])
+                       [2002613992, 1639496704, 1407477216]],
+                      dtype='int64')

 A1p134 = numpy.asarray(
    [[1702500920, 1849582496, 1656874625],
     [828554832, 1702500920, 1512419905],
-     [1143731069, 828554832, 102237247]])
+     [1143731069, 828554832, 102237247]],
+    dtype='int64')
 A2p134 = numpy.asarray(
    [[796789021, 1464208080, 607337906],
     [1241679051, 1431130166, 1464208080],
-     [1401213391, 1178684362, 1431130166]])
+     [1401213391, 1178684362, 1431130166]],
+    dtype='int64')
 np_int32_vals = [numpy.int32(i) for i in (0, 7, 9, 15, 16, 22, 24)]



--- a/theano/tensor/type.py
+++ b/theano/tensor/type.py
@@ -164,7 +164,8 @@ class TensorType(Type):
                            " Theano C code does not support that.",
                            msg,
                            "object shape", data.shape,
-                            "object strides", data.strides)
+                            "object strides", data.strides,
+                            "object dtype", data.dtype)

        i = 0
        for b in self.broadcastable: