Commit b3ebee2a authored by Arnaud Bergeron

Merge.

......@@ -77,3 +77,7 @@ AddConfigVar('traceback.limit',
AddConfigVar('warn.argmax_pushdown_bug',
"Warn if in past version of Theano we generated a bug with the optimisation theano.tensor.nnet.nnet.local_argmax_pushdown optimization. Was fixed 27 may 2010",
BoolParam(True))
AddConfigVar('warn.gpusum_01_011_0111_bug',
"Warn if we are in a case where old version of Theano had a silent bug with GpuSum pattern 01,011 and 0111 when the first dimensions was bigger then 4096. Was fixed 31 may 2010",
BoolParam(True))
......@@ -824,6 +824,16 @@ class GpuSum(Op):
threads_z = ''
if len(self.reduce_mask)==3:
threads_z = ''
if config.warn.gpusum_01_011_0111_bug:
pattern = '0'+N_pattern
warn = '''
static bool warn_gpusum_01_011_0111_bug = true;
if(warn_gpusum_01_011_0111_bug && CudaNdarray_HOST_DIMS(%(x)s)[%(N)s]>4096){
printf("WARNING: old version of Theano had a silent bug with GpuSum pattern %(pattern)s when the first dimensions was bigger then 4096. Was fixed 31 may 2010. To disable this warning set the Theano flags warn.gpusum_01_011_0111_bug to False. Won't repeat the warning before we exit.\\n");
warn_gpusum_01_011_0111_bug = false;
}
'''%locals()
else: warn = ""
print >> sio, """
{
int verbose = 0;
......@@ -833,6 +843,7 @@ class GpuSum(Op):
%(threads_y)s
%(threads_z)s
dim3 n_blocks(std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],NUM_VECTOR_OP_BLOCKS));
%(warn)s
%(makecall)s
}
""" %locals()
......@@ -1037,6 +1048,7 @@ class GpuSum(Op):
""" % locals()
def c_code_reduce_1011(self, sio, node, name, x, z, fail):
makecall = self._makecall(node, name, x, z, fail)
print >> sio, """
{
int verbose = 0;
......@@ -1044,13 +1056,11 @@ class GpuSum(Op):
std::min(CudaNdarray_HOST_DIMS(%(x)s)[3],
NUM_VECTOR_OP_THREADS_PER_BLOCK));
while (n_threads.y * n_threads.x < NUM_VECTOR_OP_THREADS_PER_BLOCK) ++n_threads.y;
n_threads.y -= 1;
while (n_threads.x * (n_threads.y+1) <= NUM_VECTOR_OP_THREADS_PER_BLOCK) ++n_threads.y;
if (n_threads.y > CudaNdarray_HOST_DIMS(%(x)s)[2])
n_threads.y = CudaNdarray_HOST_DIMS(%(x)s)[2];
while (n_threads.x * n_threads.y * n_threads.z < NUM_VECTOR_OP_THREADS_PER_BLOCK) ++n_threads.z;
n_threads.z -= 1;
while (n_threads.x * n_threads.y * (n_threads.z+1) <= NUM_VECTOR_OP_THREADS_PER_BLOCK) ++n_threads.z;
if (n_threads.z > 64)
n_threads.z = 64;
if (n_threads.z > CudaNdarray_HOST_DIMS(%(x)s)[0])
......@@ -1058,41 +1068,12 @@ class GpuSum(Op):
dim3 n_blocks(CudaNdarray_HOST_DIMS(%(x)s)[1]);
if (verbose) printf("running kernel_reduce_sum_1011_%(name)s\\n");
if (verbose) fprint_CudaNdarray(stdout, %(x)s);
if (verbose) fprint_CudaNdarray(stdout, %(z)s);
int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z;
kernel_reduce_sum_1011_%(name)s<<<n_blocks, n_threads, n_shared>>>(
CudaNdarray_HOST_DIMS(%(x)s)[0],
CudaNdarray_HOST_DIMS(%(x)s)[1],
CudaNdarray_HOST_DIMS(%(x)s)[2],
CudaNdarray_HOST_DIMS(%(x)s)[3],
CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_HOST_STRIDES(%(x)s)[0],
CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_HOST_STRIDES(%(x)s)[2],
CudaNdarray_HOST_STRIDES(%(x)s)[3],
CudaNdarray_DEV_DATA(%(z)s),
CudaNdarray_HOST_STRIDES(%(z)s)[0]);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_sum_1011_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z);
%(fail)s;
}
%(makecall)s
}
""" %locals()
def c_code_cache_version(self):
return (14,)
return (17,)
def c_support_code_apply(self, node, nodename):
......@@ -1204,7 +1185,7 @@ class GpuSum(Op):
for_i2 = "for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)"
for_i3 = "for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)"
reducebuf = self._k_reduce_buf('Z[blockIdx.x * sZ0]')
reducebuf = self._k_reduce_buf('Z[i0 * sZ0]')
param_dim = ",".join(["const int d%(i)s"%locals() for i in range(nd_in)])
param_strides = ",".join(["const int sA%(i)s"%locals() for i in range(nd_in)])
decl = self._k_decl(node,nodename)
......@@ -1212,16 +1193,19 @@ class GpuSum(Op):
print >> sio, """
%(decl)s{
%(init)s
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
mysum = 0;
%(for_i1)s{
%(for_i2)s{
%(for_i3)s{
float Ai = A[i3 * sA3 + i2 * sA2 + i1 * sA1 + blockIdx.x * sA0];
float Ai = A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0];
mysum += Ai;
}
}
}
%(reducebuf)s
}
}
""" %locals()
if self.reduce_mask == (1,0):
# this kernel uses one block for each column,
......
......@@ -311,6 +311,7 @@ PyObject* CudaNdarray_Zeros(PyObject* dummy, PyObject* shape)
}
int shp_el = PyInt_AsLong(shp_el_obj);
Py_DECREF(shp_el_obj);
if (shp_el <= 0)
{
......@@ -320,7 +321,6 @@ PyObject* CudaNdarray_Zeros(PyObject* dummy, PyObject* shape)
}
newdims[i] = shp_el;
total_elements *= newdims[i];
}
......@@ -1395,7 +1395,10 @@ CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *v)
assert (CudaNdarray_EqualAndIgnore(viewCopyForComparison, rval, 1, 1));
assert (rval->base == baseSavedForComparison);
assert (rval->dev_structure_fresh);
// Clean up locally-created references
Py_DECREF((PyObject*)viewCopyForComparison);
Py_DECREF(rval);
return 0;
}
......
......@@ -38,7 +38,27 @@ def test_sum():
((0,0),[0,1]),((1,0),[0,1]),((5,4),[0,1]),((33,31),[0,1]),((5,4),[1]),((5,4),[0]),#need something bigger then 32 for some opt test.
((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[1,2]),((5,4,3),[0,1,2]),
((0,0,0,0),[0,1,2,3]),
((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]),((5,4,3,2),[1,2,3])]:
((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]),((5,4,3,2),[1,2,3]),
#test shapes bigger than 4096 on each dimension to make sure that we work correctly when we don't have enough threads/blocks in each dimension
((4100,3),[0]),((3,4101),[0]),#10
((4100,3),[1]),((3,4101),[1]),#01
((4100,3),[0,1]),((3,4101),[0,1]),#11
((4100,4,3),[0]),((5,4100,3),[0]),((5,4,4100),[0]),#100
#((4100,4,3),[1]),((5,4100,3),[1]),((5,4,4100),[1]),#010 ##not implemented
((4100,4,3),[2]),((5,4100,3),[2]),((5,4,4100),[2]),#001
((4100,4,3),[0,1]),((5,4100,3),[0,1]),((5,4,4100),[0,1]),#110
((4100,4,3),[1,2]),((5,4100,3),[1,2]),((5,4,4100),[1,2]),#011
#((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented
((4100,4,3),[0,1,2]),((5,4100,3),[0,1,2]),((5,4,4100),[0,1,2]),#111
((4100,4,3,2),[2,3]),((4,4100,3,2),[2,3]),((4,3,4100,2),[2,3]),((4,3,2,4100),[2,3]),#0011
((4100,4,3,2),[0,2,3]),((4,4100,3,2),[0,2,3]),((4,3,4100,2),[0,2,3]),#((4,3,2,4100),[0,2,3]),#1011
((4100,4,3,2),[1,2,3]),((4,4100,3,2),[1,2,3]),((4,3,4100,2),[1,2,3]),((4,3,2,4100),[1,2,3]),#0111
((4100,2,3,4),[0,1,2,3]),((2,4100,3,4),[0,1,2,3]),((2,3,4100,4),[0,1,2,3]),((2,3,4,4100),[0,1,2,3]),#1111
]:
a = tensor.TensorType('float32',(False,)*len(shape))()
b = T.Sum(pattern)(a)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
......
......@@ -149,6 +149,10 @@ class mrg_uniform_base(Op):
return Apply(self,
[rstate, size],
[rstate.type(), self.output_type()])
def grad(self, inputs, ograd):
    """This Op is not differentiable: return None for every input."""
    return [None] * len(inputs)
def c_code_cache_version(self):
    # Version tag for Theano's C-code compilation cache; bump it whenever
    # the generated C code changes so stale compiled modules are rebuilt.
    return (1,)
......@@ -622,7 +626,12 @@ class MRG_RandomStreams(object):
If the size argument is ambiguous on the number of dimensions,
ndim may be a plain integer to supplement the missing
information.
Currently size cannot be None; otherwise it fails later, hence the assert below.
"""
assert isinstance(size, tuple), "size must be a tuple"
assert all([isinstance(i,int) for i in size])
if nstreams is None:
nstreams = self.n_streams(size)
if self.use_cuda and dtype=='float32':
......@@ -664,6 +673,8 @@ class MRG_RandomStreams(object):
# second half our U2's. See Wikipedia page:
# http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform
assert isinstance(size, tuple), "size must be a tuple"
assert all([isinstance(i,int) for i in size])
n_samples = numpy.prod(size)
evened = False
......@@ -710,5 +721,3 @@ def mrg_random_make_inplace(node):
return new_op.make_node(*node.inputs).outputs
return False
optdb.register('random_make_inplace_mrg', opt.in2out(mrg_random_make_inplace, ignore_newtrees=True), 99, 'fast_run', 'inplace')
......@@ -264,7 +264,7 @@ def test_consistency_GPU_parallel():
# We need the sample back in the main memory
cpu_sample = tensor.as_tensor_variable(sample)
f = theano.function([], cpu_sample)
f = theano.function([], cpu_sample, mode=mode)
for k in range(n_samples):
s = f()
......@@ -351,6 +351,10 @@ def test_rng0():
def test_normal0():
if config.mode == 'FAST_COMPILE':
mode = 'FAST_RUN'
else:
mode = config.mode
def basictest(f, steps, target_avg, target_std, prefix=""):
dt = 0.0
avg_std = 0.0
......@@ -414,7 +418,7 @@ def test_normal0():
RR = theano.tensor.shared_randomstreams.RandomStreams(234)
nn = RR.normal(size=sample_size, avg=-5.0, std=2.0)
ff = theano.function([], nn, mode=mode)
ff = theano.function([], nn)
basictest(ff, 50, -5.0, 2.0, prefix='numpy ')
......
......@@ -87,7 +87,7 @@ class T_sigmoid_opts(unittest.TestCase):
class T_softplus_opts(unittest.TestCase):
def setUp(self):
if theano.config.mode == 'FAST_COMPILE':
m = theano.compile.mode.get_mode('FAST_RUN')
m = theano.compile.mode.get_mode('FAST_RUN').excluding('local_elemwise_fusion')
else:
m = theano.compile.mode.get_default_mode().excluding('local_elemwise_fusion')
self.m = m
......
Markdown is supported
0%
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Register or sign in to comment