Merge branch 'master' into take-op-c-code-clean

3dc94e90 · abalkin · fa5e7d12 · 1dcc6180 · 3dc94e90 · 3dc94e90
--- a/bin/theano-nose
+++ b/bin/theano-nose
@@ -87,6 +87,7 @@ def main():
    if time_prof_args or batch_args:
        from theano.tests import run_tests_in_batch
        return run_tests_in_batch.main(
+                theano_nose=os.path.realpath(__file__),
                batch_size=batch_size,
                time_profile=bool(time_prof_args),
                display_batch_output=display_batch_output)

--- a/theano/compile/debugmode.py
+++ b/theano/compile/debugmode.py
@@ -1609,7 +1609,11 @@ class _Linker(gof.link.LocalLinker):
        active_order = self.schedule(fgraph) # an ordering of just the active nodes
        active_order_set = set(active_order)
-        no_recycling = self.no_recycling
+        # Disable no_recycling, in order to be able to use
+        # check_preallocated_output even on the output of the function.
+        # no_recycling in individual thunks does not really matter, since
+        # the function's outputs will always be freshly allocated.
+        no_recycling = []
        input_storage, output_storage, storage_map = link.map_storage(
            fgraph, order, input_storage_, output_storage_)
@@ -1704,11 +1708,14 @@ class _Linker(gof.link.LocalLinker):
                    _logger.warn("We won't check the perform function of node '%s' but we will check its make_thunk function" % node)
                    thunks_py[-1] = thunk
-        if no_recycling is True:
+        # Use self.no_recycling (that was passed in accept()) to always
-            no_recycling = storage_map.values()
+        # use new memory storage when it is needed, in particular for the
-            no_recycling = utils.difference(no_recycling, input_storage)
+        # function's outputs. no_recycling_map will be used in f() below.
+        if self.no_recycling is True:
+            no_recycling_map = storage_map.values()
+            no_recycling_map = utils.difference(no_recycling_map, input_storage)
        else:
-            no_recycling = [storage_map[r] for r in no_recycling
+            no_recycling_map = [storage_map[r] for r in self.no_recycling
                            if r not in fgraph.inputs]
        # Precompute some things for storage pre-allocation
@@ -1729,7 +1736,7 @@ class _Linker(gof.link.LocalLinker):
            _logger.debug("starting a DebugMode call")
            _logger.debug("self.maker.mode.check_preallocated_output: %s",
                    self.maker.mode.check_preallocated_output)
-            for x in no_recycling:
+            for x in no_recycling_map:
                x[0] = None
            # nest all this in try-finally to put storage *back* into

--- a/theano/compile/tests/test_debugmode.py
+++ b/theano/compile/tests/test_debugmode.py
@@ -709,7 +709,7 @@ class Test_preallocated_output(unittest.TestCase):
        a = theano.tensor.fmatrix('a')
        b = theano.tensor.fmatrix('b')
        z = BrokenCImplementationAdd()(a, b)
-        # Needed so that z is not the output of the graph
+        # In this test, we do not want z to be an output of the graph.
        out = theano.tensor.dot(z, numpy.eye(7))
        a_val = self.rng.randn(7, 7).astype('float32')
@@ -730,7 +730,39 @@ class Test_preallocated_output(unittest.TestCase):
                check_preallocated_output=['f_contiguous'])
        f = theano.function([a, b], out, mode=mode)
+        if theano.config.cxx:
+            self.assertRaises(debugmode.BadThunkOutput, f, a_val, b_val)
+        else:
+            # The python code of this op is good.
+            f(a_val, b_val)
+    def test_f_contiguous_out(self):
+        # Same test as test_f_contiguous, but check that it works
+        # even if z _is_ the output of the graph
+        a = theano.tensor.fmatrix('a')
+        b = theano.tensor.fmatrix('b')
+        out = BrokenCImplementationAdd()(a, b)
+        a_val = self.rng.randn(7, 7).astype('float32')
+        b_val = self.rng.randn(7, 7).astype('float32')
+        # Should work
+        mode = debugmode.DebugMode(
+                check_preallocated_output=['c_contiguous'])
+        f = theano.function([a, b], out, mode=mode)
+        out_val = f(a_val, b_val)
+        #print 'out_val =', out_val
+        #print out_val.strides
+        # Should raise an Exception, since the output buffer is
+        # used incorrectly.
+        mode = debugmode.DebugMode(
+                check_preallocated_output=['f_contiguous'])
+        f = theano.function([a, b], out, mode=mode)
        if theano.config.cxx:
            self.assertRaises(debugmode.BadThunkOutput, f, a_val, b_val)
        else:

--- a/theano/configdefaults.py
+++ b/theano/configdefaults.py
@@ -298,7 +298,7 @@ AddConfigVar('warn.ignore_bug_before',
              "bugs found after that version. "
              "Warning for specific bugs can be configured with specific "
              "[warn] flags."),
-             EnumStr('None', 'all', '0.3', '0.4', '0.4.1', '0.5', '0.6',
+             EnumStr('0.5', 'None', 'all', '0.3', '0.4', '0.4.1', '0.6',
                     allow_override=False),
             in_c_key=False)

--- a/theano/gof/cmodule.py
+++ b/theano/gof/cmodule.py
@@ -891,10 +891,11 @@ class ModuleCache(object):
            hash_key = hash(key)
            key_data = None
            # We have never seen this key before.
-            # Acquire lock before creating things in the compile cache,
-            # to avoid that other processes remove the compile dir while it
+            # We acquire the lock later only if we were able to
-            # is still empty.
+            # generate C code. Otherwise, we would take the lock for ops
-            compilelock.get_lock()
+            # that have only a perform().
+            lock_taken = False
            # This try/finally block ensures that the lock is released once we
            # are done writing in the cache file or after raising an exception.
            try:
@@ -918,6 +919,13 @@ class ModuleCache(object):
                    # The first compilation step is to yield the source code.
                    src_code = compile_steps.next()
                    module_hash = get_module_hash(src_code, key)
+                    # The op has c_code, so take the lock.
+                    compilelock.get_lock()
+                    lock_taken = True
+                    assert os.path.exists(location), (
+                        "The directory just created shouldn't be deleted!")
                    if module_hash in self.module_hash_to_key_data:
                        _logger.debug("Duplicated module! Will re-use the "
                                "previous one")
@@ -1039,7 +1047,7 @@ class ModuleCache(object):
            finally:
                # Release lock if needed.
-                if not keep_lock:
+                if not keep_lock and lock_taken:
                    compilelock.release_lock()
            # Update map from key to module name for all keys associated to

--- a/theano/sandbox/cuda/basic_ops.py
+++ b/theano/sandbox/cuda/basic_ops.py
@@ -2737,7 +2737,7 @@ class GpuAlloc(GpuOp):
                %(fail)s;
            }
        }
-        if (%(memset_0)s)
+        if (%(memset_0)s && CudaNdarray_is_c_contiguous(%(out)s))
        {
            if (cudaSuccess != cudaMemset(%(out)s->devdata, 0,
                                          CudaNdarray_SIZE(%(out)s) * 4))
@@ -2769,7 +2769,7 @@ class GpuAlloc(GpuOp):
        return [None for i in inputs]
    def c_code_cache_version(self):
-        return (5,)
+        return (7,)
    def do_constant_folding(self, node):
        for client in node.outputs[0].clients:
@@ -2803,6 +2803,13 @@ class GpuContiguous(GpuOp):
    def __hash__(self):
        return hash(type(self))
+    def grad(self, inputs, dout):
+        x, = inputs
+        dout, = dout
+        return [dout]
    def __str__(self):
        return self.__class__.__name__
@@ -2824,7 +2831,8 @@ class GpuContiguous(GpuOp):
            } else if ((NULL == %(z)s)""" % locals()
        for i in xrange(len(node.inputs[0].type.broadcastable)):
            str += "\n|| (CudaNdarray_HOST_DIMS(%(input)s)[%(i)s] != CudaNdarray_HOST_DIMS(%(z)s)[%(i)s])" % locals()
-        str += """)
+        str += """
+                || !CudaNdarray_is_c_contiguous(%(z)s))
            {
                Py_XDECREF(%(z)s);
                %(z)s = (CudaNdarray*)CudaNdarray_Copy(%(input)s);
@@ -2840,7 +2848,7 @@ class GpuContiguous(GpuOp):
        return str
    def c_code_cache_version(self):
-        return (1,)
+        return (2,)
 gpu_contiguous = GpuContiguous()

--- a/theano/sandbox/cuda/blas.py
+++ b/theano/sandbox/cuda/blas.py
@@ -748,7 +748,7 @@ class GpuDownsampleFactorMax(GpuOp):
    #def perform(self, node, input_storage, output_storage):
        #raise NotImplementedError('only C is implemented')
    def c_code_cache_version(self):
-        return (5)
+        return (6)
    def c_code(self, node, nodename, inp, out, sub):
        x, = inp
@@ -849,6 +849,9 @@ class GpuDownsampleFactorMax(GpuOp):
           float *z, int zS0, int zS1, int zS2, int zS3)
        {
            float cur_max, cur_x;
+            // Cast threadIdx.x into a signed int, to avoid problems with
+            // indexing with negative offsets.
+            int tx = threadIdx.x;
            for(int block_x_idx = blockIdx.x;
                block_x_idx < D0 * D1;
                block_x_idx += gridDim.x){
@@ -865,7 +868,7 @@ class GpuDownsampleFactorMax(GpuOp):
                {
                    __syncthreads();
                    // load the current row of the image into shared memory
-                    for (int j = threadIdx.x; j < xD3; j += blockDim.x)
+                    for (int j = tx; j < xD3; j += blockDim.x)
                    {
                        xbuf[j] = x[i0*xS0 + i1*xS1 + (i2*pf2+r2)*xS2 + j*xS3];
                    }
@@ -873,7 +876,7 @@ class GpuDownsampleFactorMax(GpuOp):
                    // initialize our max if this is the
                    // first row we're loading
-                    cur_max = (r2 == 0) ? xbuf[threadIdx.x*pf3] : cur_max;
+                    cur_max = (r2 == 0) ? xbuf[tx*pf3] : cur_max;
                    // do a mini-reduction over the pf3 relevant elements
                    // in the current row
@@ -882,7 +885,7 @@ class GpuDownsampleFactorMax(GpuOp):
                    {
                        for (int k = 0; k < pf3; ++k)
                        {
-                            cur_x = xbuf[threadIdx.x*pf3+k];
+                            cur_x = xbuf[tx*pf3+k];
                            cur_max = (cur_x > cur_max) ? cur_x : cur_max;
                        }
                    }
@@ -890,17 +893,16 @@ class GpuDownsampleFactorMax(GpuOp):
                    {
                        for (int k = 0; k < pf3; ++k)
                        {
-                            if (threadIdx.x*pf3 + k < xD3)
+                            if (tx*pf3 + k < xD3)
                            {
-                                cur_x = xbuf[threadIdx.x*pf3+k];
+                                cur_x = xbuf[tx*pf3+k];
                                cur_max = (cur_x > cur_max) ? cur_x : cur_max;
                            }
                        }
                    }
                }
-                //store the result to global memory
+                z[i0*zS0 + i1*zS1 + i2*zS2 + tx*zS3] = cur_max;
-                z[i0*zS0 + i1*zS1 + i2*zS2 + threadIdx.x*zS3] = cur_max;
            }
        }
        """ % locals()
@@ -931,7 +933,7 @@ class GpuDownsampleFactorMaxGrad(GpuOp):
        return Apply(self, [x, z, gz], [x.type()])
    def c_code_cache_version(self):
-        return (6,)
+        return (7,)
    def c_code(self, node, nodename, inp, out, sub):
        x, z, gz = inp
@@ -999,7 +1001,11 @@ class GpuDownsampleFactorMaxGrad(GpuOp):
                CudaNdarray_HOST_STRIDES(%(gz)s)[1],
                CudaNdarray_HOST_STRIDES(%(gz)s)[2],
                CudaNdarray_HOST_STRIDES(%(gz)s)[3],
-                CudaNdarray_DEV_DATA(%(gx)s));
+                CudaNdarray_DEV_DATA(%(gx)s),
+                CudaNdarray_HOST_STRIDES(%(gx)s)[0],
+                CudaNdarray_HOST_STRIDES(%(gx)s)[1],
+                CudaNdarray_HOST_STRIDES(%(gx)s)[2],
+                CudaNdarray_HOST_STRIDES(%(gx)s)[3]);
            CNDA_THREAD_SYNC;
            cudaError_t err = cudaGetLastError();
            if( cudaSuccess != err)
@@ -1037,7 +1043,7 @@ class GpuDownsampleFactorMaxGrad(GpuOp):
           const float * x, int xS0, int xS1, int xS2, int xS3,
           const float * z, int zS0, int zS1, int zS2, int zS3,
           const float * gz, int gzS0, int gzS1, int gzS2, int gzS3,
-           float *gx)
+           float *gx, int gxS0, int gxS1, int gxS2, int gxS3)
        {
            //  D0: number of image rows
            //  D1: number of image cols
@@ -1048,6 +1054,10 @@ class GpuDownsampleFactorMaxGrad(GpuOp):
            // various .S. variables are strides
            float cur_max, cur_x, my_z, my_gz;
+            // Cast threadIdx.x into a signed int, to avoid problems with
+            // indexing with negative offsets.
+            int tx = threadIdx.x;
            for(int i0 = blockIdx.x;
                i0 < D0;
                i0 += gridDim.x){
@@ -1056,7 +1066,7 @@ class GpuDownsampleFactorMaxGrad(GpuOp):
                // row wrt z and/or gz, ranges from 0 to D2 - 1 OR D2
                // (as needed to cover all x rows)
                int i2 = blockIdx.y;
-                int x_col = threadIdx.x;   // col wrt x, ranges from 0 to xD3 - 1
+                int x_col = tx;            // col wrt x, ranges from 0 to xD3 - 1
                int z_col = x_col/ds1;     // z_col corresponding to this x_col
@@ -1073,7 +1083,7 @@ class GpuDownsampleFactorMaxGrad(GpuOp):
                        if(blockDim.x != xD3)
                        {
-                            x_col = threadIdx.x + col_iter * blockDim.x;
+                            x_col = tx + col_iter * blockDim.x;
                            z_col = x_col/ds1;
                        }
@@ -1108,13 +1118,10 @@ class GpuDownsampleFactorMaxGrad(GpuOp):
                                // gx[image_row][image_col][x_row][x_col]
                                //   = (my_z == x[image_row][image_col][
                                //                x_row][x_col]) ? my_gz : 0.0f;
-                                gx[i0 * D1*xD2*xD3 + i1*xD2*xD3 +
+                                gx[i0*gxS0 + i1*gxS1 + x_row*gxS2 + x_col*gxS3]
-                                   x_row*xD3 + x_col]
                                   = (my_z == x[i0*xS0 + i1*xS1 + x_row*xS2 +
                                                x_col*xS3]) ? my_gz : 0.0f;
                            }
-                        //gx[i0 * D1*xD2*xD3 + i1*xD2*xD3 +
-                        //   x_row*xD3 + x_col] = -999;
                        }
                    }

--- a/theano/sandbox/cuda/cuda_ndarray.cu
+++ b/theano/sandbox/cuda/cuda_ndarray.cu
@@ -3241,14 +3241,19 @@ static __global__ void k_copy_4d(const int N1,
    // These must be made int instead of unsigned int due to a bug in nvcc
    int bx = blockIdx.x;
    int by = blockIdx.y;
-    // N1 and N2 are kept in case a future implementation needs to
-    // loop on the first two dimensions if there are not enough blocks
+    for (int i = bx; i < N1; i += gridDim.x)
-    for (int j = threadIdx.y; j < (int) N4; j += (int) blockDim.y)
    {
-        for (int i = threadIdx.x; i <  N3; i += (int) blockDim.x)
+        for (int j = by; j < N2; j += gridDim.y)
        {
-            y[bx * sy1 + by * sy2 + i * sy3 + j * sy4] =  
+            for (int k = threadIdx.x; k < N3; k += (int) blockDim.x)
-                x[bx * sx1 + by * sx2 + i * sx3 + j * sx4];
+            {
+                for (int l = threadIdx.y; l < N4; l += (int) blockDim.y)
+                {
+                    y[i * sy1 + j * sy2 + k * sy3 + l * sy4] =
+                        x[i * sx1 + j * sx2 + k * sx3 + l * sx4];
+                }
+            }
        }
    }
 }
@@ -3380,8 +3385,10 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self,
                // The blocks implement the looping over the first two axes so
                // this needs to be (N1, N2)
-                dim3 n_blocks( (unsigned int) CudaNdarray_HOST_DIMS(self)[0],
+                dim3 n_blocks( std::min(CudaNdarray_HOST_DIMS(self)[0],
-                               (unsigned int) CudaNdarray_HOST_DIMS(self)[1]);
+                                        NUM_VECTOR_OP_BLOCKS),
+                               std::min(CudaNdarray_HOST_DIMS(self)[1],
+                                        NUM_VECTOR_OP_BLOCKS));
                // For the threads, just make as many as possible
                dim3 n_threads( std::min( (unsigned int) CudaNdarray_HOST_DIMS(self)[2],
                                 (unsigned int) NUM_VECTOR_OP_THREADS_PER_BLOCK),

--- a/theano/sandbox/cuda/cuda_ndarray.cuh
+++ b/theano/sandbox/cuda/cuda_ndarray.cuh
@@ -12,8 +12,10 @@
 #else
 #define DllExport   __declspec( dllimport )
 #endif
-#else
+#define ALWAYS_INLINE
+#else //else _WIN32
 #define DllExport
+#define ALWAYS_INLINE __attribute__((always_inline))
 #endif
 typedef float real;
@@ -134,7 +136,7 @@ CudaNdarray_HOST_STRIDES(const CudaNdarray * self);
 DllExport const int *
 CudaNdarray_HOST_LOG2DIMS(const CudaNdarray * self);
-DllExport inline void __attribute__((always_inline))
+DllExport inline void ALWAYS_INLINE
 cnda_mark_dev_structure_dirty(CudaNdarray * self)
 {
    self->dev_structure_fresh = 0;
@@ -155,7 +157,7 @@ CudaNdarray_Equal(CudaNdarray *cnda1, CudaNdarray *cnda2);
 *
 *  Does not sync structure to device.
 */
-DllExport inline void __attribute__((always_inline))
+DllExport inline void ALWAYS_INLINE
 CudaNdarray_set_dim(CudaNdarray * self, int idx, int d) 
 {
    if ((idx >= self->nd) || (idx < 0) || (d < 0))
@@ -173,7 +175,7 @@ CudaNdarray_set_dim(CudaNdarray * self, int idx, int d)
 }
-DllExport inline void __attribute__((always_inline))
+DllExport inline void ALWAYS_INLINE
 CudaNdarray_set_stride(CudaNdarray * self, int idx, int s)
 {
    if ((idx >= self->nd) || (idx < 0))
@@ -232,7 +234,7 @@ DllExport PyObject * CudaNdarray_new_nd(const int nd);
 * Note: This does not allocate storage for data, or free
 *       pre-existing storage.
 */
-DllExport inline int __attribute__((always_inline))
+DllExport inline int ALWAYS_INLINE
 CudaNdarray_set_nd(CudaNdarray * self, const int nd)
 {
    if (nd != self->nd)
@@ -434,7 +436,7 @@ CudaNdarray_ZEROS(int n, int * dims);
 /**
 * True iff the strides look like [dim[nd-2], dim[nd-3], ... , dim[0], 1]
 */
-DllExport inline bool  __attribute__((always_inline))
+DllExport inline bool ALWAYS_INLINE
 CudaNdarray_is_c_contiguous(const CudaNdarray * self)
 {
    bool c_contiguous = true;

--- a/theano/sandbox/cuda/type.py
+++ b/theano/sandbox/cuda/type.py
@@ -445,14 +445,14 @@ theano.compile.register_deep_copy_op_c_code(
                %(fail)s;
            }
        } else {
-            if(!CudaNdarray_CopyFromCudaNdarray(%(oname)s, %(iname)s)) {
+            if(CudaNdarray_CopyFromCudaNdarray(%(oname)s, %(iname)s)) {
                PyErr_SetString(PyExc_ValueError,
            "DeepCopyOp: the copy failed into already allocated space!");
                %(fail)s;
            }
        }
        """,
-        version=2)
+        version=3)
 # THIS WORKS But CudaNdarray instances don't compare equal to one

--- a/theano/sandbox/multinomial.py
+++ b/theano/sandbox/multinomial.py
@@ -128,7 +128,7 @@ class MultinomialFromUniform(Op):
        if unis.shape[0] != pvals.shape[0]:
            raise ValueError("unis.shape[0] != pvals.shape[0]",
                             unis.shape[0], pvals.shape[0])
-        if not z[0] or z[0].shape != pvals.shape:
+        if z[0] is None or z[0].shape != pvals.shape:
            z[0] = numpy.zeros(pvals.shape, dtype=node.outputs[0].dtype)
        nb_multi = pvals.shape[0]

--- a/theano/sandbox/test_rng_mrg.py
+++ b/theano/sandbox/test_rng_mrg.py
@@ -696,7 +696,7 @@ def test_random_state_transfer():
    """
    Test that random state can be transferred from one theano graph to another.
    """
-    class Graph():
+    class Graph:
        def __init__(self, seed=123):
            self.rng = MRG_RandomStreams(seed)
            self.y = self.rng.uniform(size=(1,))

--- a/theano/scan_module/tests/test_scan.py
+++ b/theano/scan_module/tests/test_scan.py
@@ -3331,8 +3331,8 @@ class T_Scan(unittest.TestCase):
                           outputs_info=[tensor.zeros_like(A)])
        f = theano.function([A,B], S.owner.inputs[0][-1])
        rng = numpy.random.RandomState(utt.fetch_seed())
-        vA = rng.uniform(size=(5,5))
+        vA = rng.uniform(size=(5, 5)).astype(theano.config.floatX)
-        vB = rng.uniform(size=(5,5))
+        vB = rng.uniform(size=(5, 5)).astype(theano.config.floatX)
        assert numpy.allclose(f(vA, vB), numpy.dot(vA.T, vB))

--- a/theano/tensor/blas.py
+++ b/theano/tensor/blas.py
@@ -1615,7 +1615,7 @@ def local_gemm_to_ger(node):
            yv = y.dimshuffle(1)
            try:
                bval = T.get_scalar_constant_value(b)
-            except TypeError:
+            except T.NotScalarConstantError:
                # b isn't a constant, GEMM is doing useful pre-scaling
                return

--- a/theano/tensor/nnet/conv.py
+++ b/theano/tensor/nnet/conv.py
@@ -965,7 +965,7 @@ class ConvOp(OpenMPOp):
        return ['<numpy/noprefix.h>', '<iostream>', '<sstream>']
    def c_code_cache_version(self):
-        return (9, self.openmp)
+        return (10, self.openmp)
    def c_support_code(self):
        return """
@@ -1343,14 +1343,24 @@ if (typenum != typenum_f) {
  %(fail)s;
 }
-if (!img2d) %(fail)s;
+if (!img2d)
-if (!filtersflipped) %(fail)s;
+{
+    PyErr_SetString(PyExc_AssertionError, "!img2d");
+    %(fail)s;
+}
+if (!filtersflipped)
+{
+    PyErr_SetString(PyExc_AssertionError, "!filtersflipped");
+    %(fail)s;
+}
 if ((!%(z)s)
  || *PyArray_DIMS(%(z)s)!=4
  ||(PyArray_DIMS(%(z)s)[0] != %(self_bsize)s)
  ||(PyArray_DIMS(%(z)s)[1] != %(self_nkern)s)
  ||(PyArray_DIMS(%(z)s)[2] != dim_zz[0])
-  || (PyArray_DIMS(%(z)s)[3] != dim_zz[1])
+  ||(PyArray_DIMS(%(z)s)[3] != dim_zz[1])
+  ||!PyArray_ISCONTIGUOUS(%(z)s)
  )
 {
  {Py_XDECREF(%(z)s);}
@@ -1370,19 +1380,11 @@ Os[0]=%(self_outshp0)s;
 Os[1]=%(self_outshp1)s;
 //assertions
-if (PyArray_STRIDES(%(z)s)[0] != PyArray_DIMS(%(z)s)[1] *
+if (!PyArray_ISCONTIGUOUS(%(z)s))
-                         PyArray_DIMS(%(z)s)[2] *
+{
-                         PyArray_DIMS(%(z)s)[3] *
+    PyErr_SetString(PyExc_AssertionError, "Output (%(z)s) not contiguous");
-                         (npy_intp)sizeof(%(type)s))
-    %(fail)s;
-if (PyArray_STRIDES(%(z)s)[1] != PyArray_DIMS(%(z)s)[2] *
-                         PyArray_DIMS(%(z)s)[3] *
-                         (npy_intp)sizeof(%(type)s))
-    %(fail)s;
-if (PyArray_STRIDES(%(z)s)[2] != PyArray_DIMS(%(z)s)[3] * (npy_intp)sizeof(%(type)s))
-    %(fail)s;
-if (PyArray_STRIDES(%(z)s)[3] != (npy_intp)sizeof(%(type)s))
    %(fail)s;
+}
 for(int b=0;b< %(self_bsize)s;b++){
  for(int n_kern=0;n_kern<%(self_nkern)s;n_kern++){
@@ -1862,14 +1864,24 @@ typenum_f = PyArray_ObjectType((PyObject*)%(filtersflipped)s, 0);
 if (typenum < 0) {PyErr_SetString(PyExc_ValueError, "Invalid type"); %(fail)s;}
 if (typenum != typenum_f) {PyErr_SetString(PyExc_ValueError, "Input types must match"); %(fail)s;}
-if (!img2d) %(fail)s;
+if (!img2d)
-if (!filtersflipped) %(fail)s;
+{
+    PyErr_SetString(PyExc_AssertionError, "!img2d");
+    %(fail)s;
+}
+if (!filtersflipped)
+{
+    PyErr_SetString(PyExc_AssertionError, "!filtersflipped");
+    %(fail)s;
+}
 if ((!%(z)s)
  || *PyArray_DIMS(%(z)s)!=4
  ||(PyArray_DIMS(%(z)s)[0] != %(self_bsize)s)
  ||(PyArray_DIMS(%(z)s)[1] != %(self_nkern)s)
  ||(PyArray_DIMS(%(z)s)[2] != dim_zz[0])
-  || (PyArray_DIMS(%(z)s)[3] != dim_zz[1])
+  ||(PyArray_DIMS(%(z)s)[3] != dim_zz[1])
+  ||!PyArray_ISCONTIGUOUS(%(z)s)
  )
 {
  {Py_XDECREF(%(z)s);}
@@ -1889,10 +1901,11 @@ Os[0]=%(self_outshp0)s;
 Os[1]=%(self_outshp1)s;
 //assertions
-if (PyArray_STRIDES(%(z)s)[0] != PyArray_DIMS(%(z)s)[1] *PyArray_DIMS(%(z)s)[2] *PyArray_DIMS(%(z)s)[3] * (npy_intp)sizeof(%(type)s)) %(fail)s;
+if (!PyArray_ISCONTIGUOUS(%(z)s))
-if (PyArray_STRIDES(%(z)s)[1] != PyArray_DIMS(%(z)s)[2] * PyArray_DIMS(%(z)s)[3] * (npy_intp)sizeof(%(type)s)) %(fail)s;
+{
-if (PyArray_STRIDES(%(z)s)[2] != PyArray_DIMS(%(z)s)[3] * (npy_intp)sizeof(%(type)s)) %(fail)s;
+    PyErr_SetString(PyExc_AssertionError, "Output (%(z)s) not contiguous");
-if (PyArray_STRIDES(%(z)s)[3] != (npy_intp)sizeof(%(type)s)) %(fail)s;
+    %(fail)s;
+}
 for(int b=0;b< %(self_bsize)s ;b+=%(unroll_bsize)s){
  for(int n_kern=0;n_kern<%(self_nkern)s;n_kern+=%(unroll_ksize)s){

--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
@@ -1185,6 +1185,9 @@ def local_subtensor_make_vector(node):
                # if it is a constant we can do something with it
                try:
                    v = get_scalar_constant_value(idx)
+                    if isinstance(v, numpy.integer):
+                        # Python 2.4 wants to index only with Python integers
+                        v = int(v)
                    return [x.owner.inputs[v]]
                except NotScalarConstantError:
                    pass

--- a/theano/tensor/tests/test_basic.py
+++ b/theano/tensor/tests/test_basic.py
@@ -417,8 +417,8 @@ def makeTester(name, op, expected, checks=None, good=None, bad_build=None,
 def rand(*shape):
-    r = numpy.asarray(numpy.random.rand(*shape), dtype=config.floatX)
+    r = numpy.random.rand(*shape) * 2 - 1
-    return r * 2 - 1
+    return numpy.asarray(r, dtype=config.floatX)
 def rand_nonzero(shape, eps=3e-4):
@@ -4270,8 +4270,9 @@ class t_dot(unittest.TestCase):
            return type(x), x.dtype, x.shape
        nz = numpy.dot(x, y)
        tz = eval_outputs([dot(as_tensor_variable(x), as_tensor_variable(y))])
-        self.assertTrue(tz.dtype == nz.dtype)
+        self.assertTrue(tz.dtype == nz.dtype,
-        self.assertTrue(tz.shape == nz.shape)
+                (tz.dtype, tz.dtype.num, nz.dtype, nz.dtype.num))
+        self.assertTrue(tz.shape == nz.shape, (tz.shape, nz.shape))
        self.assertTrue(_approx_eq(nz, tz))
    def test_Op_dims(self):
@@ -4300,19 +4301,19 @@ class t_dot(unittest.TestCase):
        self.assertRaises(TypeError, _dot, d3, d3)
    def test_dot_0d_0d(self):
-        self.cmp_dot(1.1, 2.2)
+        self.cmp_dot(rand(), rand())
    def test_dot_0d_1d(self):
-        self.cmp_dot(1.1, rand(5))
+        self.cmp_dot(rand(), rand(5))
    def test_dot_0d_2d(self):
-        self.cmp_dot(3.0, rand(6,7))
+        self.cmp_dot(rand(), rand(6,7))
    def test_dot_0d_3d(self):
-        self.cmp_dot(3.0, rand(8,6,7))
+        self.cmp_dot(rand(), rand(8,6,7))
    def test_dot_1d_0d(self):
-        self.cmp_dot(rand(5), 1.1 )
+        self.cmp_dot(rand(5), rand())
    def test_dot_1d_1d(self):
        self.cmp_dot(rand(5), rand(5))
@@ -4344,7 +4345,7 @@ class t_dot(unittest.TestCase):
        self.cmp_dot(rand(6), rand(8,6,7))
    def test_dot_2d_0d(self):
-        self.cmp_dot(rand(5,6), 1.0)
+        self.cmp_dot(rand(5,6), rand())
    def test_dot_2d_1d(self):
        self.cmp_dot(rand(5, 6), rand(6))
@@ -4380,7 +4381,7 @@ class t_dot(unittest.TestCase):
        self.cmp_dot(rand(5,6), rand(8,6,7))
    def test_dot_3d_0d(self):
-        self.cmp_dot(rand(4,5,6), 1.0)
+        self.cmp_dot(rand(4,5,6), rand())
    def test_dot_3d_1d(self):
        self.cmp_dot(rand(4,5,6), rand(6))

--- a/theano/tensor/tests/test_blas.py
+++ b/theano/tensor/tests/test_blas.py
@@ -5,6 +5,7 @@ import sys
 import theano.tensor as T
 from theano import tensor
 from theano.gof.python25 import product as itertools_product
+from theano.gof.python25 import any
 from theano.printing import pp
 import numpy
@@ -857,7 +858,6 @@ def test_dot22():
                assert _dot22 in [x.op for x in topo], (dtype1, dtype2)
            else:
                check = [isinstance(x.op, T.Dot) for x in topo]
-                from theano.gof.python25 import any
                assert any(check), (dtype1, dtype2)
            rng = numpy.random.RandomState(unittest_tools.fetch_seed())
@@ -1603,6 +1603,13 @@ class TestGer(TestCase, unittest_tools.TestOptimizationMixin):
                    self.A, self.a, self.x.dimshuffle(0, 'x'),
                    self.y.dimshuffle('x', 0), self.b(1.5)).owner)
+    def test_b_nonconst_does_not_triggers_ger(self):
+        """ test local_gemm_to_ger opt"""
+        assert not T.blas.local_gemm_to_ger.transform(
+                gemm_no_inplace(
+                    self.A, self.a, self.x.dimshuffle(0, 'x'),
+                    self.y.dimshuffle('x', 0), self.a).owner)
    def test_outer(self):
        f = self.function([self.x, self.y], T.outer(self.x, self.y))
        self.assertFunctionContains(f, self.ger_destructive)

--- a/theano/tests/run_tests_in_batch.py
+++ b/theano/tests/run_tests_in_batch.py
@@ -101,7 +101,7 @@ def main(stdout=None, stderr=None, argv=None, theano_nose=None,
                theano_nose = path
                break
    if theano_nose is None:
-        raise Exception("Not able to find theano_nose")
+        raise Exception("Not able to find theano-nose")
    if batch_size is None:
        batch_size = 100
    stdout_backup = sys.stdout