Commit 5aac104a authored by David Warde-Farley

Merge.

......@@ -75,6 +75,8 @@ Community
* Register and post to `theano-buildbot`_ if you want to receive our daily buildbot email.
* Ask/view questions/answers at `metaoptimize/qa/tags/theano/`_ (it's like stack overflow for machine learning)
* We try to stay organized with `Theano's Trac <http://trac-hg.assembla.com/theano/report/1>`__
* Come visit us in Montreal! Most of the developers are students in the LISA_ group at the `University of Montreal`_.
......@@ -104,6 +106,8 @@ Community
.. _theano-buildbot: http://groups.google.com/group/theano-buildbot
.. _tickets: http://pylearn.org/theano/trac/query?status=accepted&status=assigned&status=new&status=reopened&group=milestone&max=200&col=id&col=summary&col=status&col=owner&col=type&col=priority&col=component&col=time&report=9&order=priority
.. _metaoptimize/qa/tags/theano: http://metaoptimize.com/qa/tags/theano/
.. _LISA: http://www.iro.umontreal.ca/~lisa
.. _University of Montreal: http://www.umontreal.ca
......@@ -344,6 +344,41 @@ def MergeOptMerge(opt):
return SeqOptimizer([merger, opt, merger])
def pre_constant_merge(vars):
    """
    Merge constants in the subgraph used to compute nodes in `vars`.

    `vars` is a list of variables, and we want to merge together the
    constant inputs used to compute nodes in that list.

    :param vars: list of variables whose ancestor constants get merged.
    :returns: a list with the (possibly replaced) variables of `vars`.

    :note: This function will ignore nodes that are in an env.
           It is used to pre-merge nodes generated inside an optimization,
           before it is inserted in the env.
           It is useful if there are many such replacements to make,
           so that DebugMode will not check each of them.
    """
    seen_var = set()
    # signature -> first constant seen with that signature
    const_sig_inv = {}

    def recursive_merge(var):
        if var in seen_var:
            return var
        if var.owner and hasattr(var.owner, "env"):
            # Node already belongs to an env: leave it alone.
            return var
        seen_var.add(var)
        if isinstance(var, graph.Constant):
            sig = var.signature()
            if sig in const_sig_inv:
                # Reuse the first constant seen with this signature.
                return const_sig_inv[sig]
            const_sig_inv[sig] = var
            return var
        if var.owner:
            for idx, inp in enumerate(var.owner.inputs):
                # Replace each input in place by its merged version.
                var.owner.inputs[idx] = recursive_merge(inp)
        return var

    # A list comprehension (instead of `map`) guarantees a list is
    # returned under both Python 2 and 3.  The unused `const_sig`
    # dict of the original was removed.
    return [recursive_merge(v) for v in vars]
########################
### Local Optimizers ###
......@@ -1111,6 +1146,66 @@ def check_chain(r, *chain):
return _check_chain(r, reduce(list.__iadd__, ([x, 0] for x in chain)))
def pre_greedy_local_optimizer(list_optimizations, out):
    '''
    Apply local optimizations, greedily, to the graph that computes `out`.

    This function traverses the computation graph of all ``node``s in the
    graph before the variable `out` that are not in an env, and applies
    each of the local optimizations on the traversed graph.

    Its main use is to apply locally constant folding when generating
    the graph of the indices of a subtensor.

    :param list_optimizations: list of local optimizers; each must expose
        a ``transform(node)`` method returning False/None (no change) or a
        list of replacement variables, one per output of ``node``.
    :param out: the output variable whose ancestor graph is optimized.
    :returns: the (possibly replaced) variable computing `out`.

    We should not apply optimizations on nodes that are in an env,
    so we do not optimize nodes that have an attribute ``env``.

    :note: This does not reach an equilibrium.  So if there is an
        optimization like local_upcast_elemwise_constant_inputs in the
        list, that adds additional nodes to the inputs of the node, it
        can be needed to call this function multiple times.
    '''
    def local_recursive_function(list_opt, out, optimized_vars, depth):
        # Returns (replacement outputs of out.owner, updated cache).
        if not out.owner:
            return [out], optimized_vars
        node = out.owner
        if hasattr(node, 'env'):
            # Node already in an env: do not touch it.
            return node.outputs, optimized_vars
        # First, recursively optimize the inputs of this node.
        for idx, inp in enumerate(node.inputs):
            if inp in optimized_vars:
                nw_in = optimized_vars[inp]
            else:
                if inp.owner:
                    outs, optimized_vars = local_recursive_function(
                        list_opt, inp, optimized_vars, depth + 1)
                    for k, v in zip(inp.owner.outputs, outs):
                        optimized_vars[k] = v
                    nw_in = outs[inp.owner.outputs.index(inp)]
                else:
                    nw_in = inp
                    optimized_vars[inp] = inp
            node.inputs[idx] = nw_in
        # Then apply each optimization, in order, to the node itself.
        results = node.outputs
        for opt in list_opt:
            ret = opt.transform(node)
            if ret is not False and ret is not None:
                assert len(ret) == len(node.outputs)
                for k, v in zip(node.outputs, ret):
                    optimized_vars[k] = v
                results = ret
                if ret[0].owner:
                    # Keep optimizing the node that now produces the
                    # result.  (Bug fix: this used to be ``out.owner``,
                    # which re-used the stale, already-replaced node.)
                    node = ret[0].owner
                else:
                    # The replacement has no owner (e.g. a constant):
                    # nothing left to optimize.
                    break
        return results, optimized_vars

    final_outs, optimized_nodes = local_recursive_function(
        list_optimizations, out, {}, 0)
    return final_outs[0]
......
......@@ -69,3 +69,27 @@ else:
partial = functools.partial
defaultdict = collections.defaultdict
__all__ = ['all', 'any']
if sys.version_info[:2] < (2,6):
    # itertools.combinations was added in Python 2.6; on older versions,
    # fall back to the pure-Python equivalent from the itertools docs.
    # Borrowed from Python docs
    def combinations(iterable, r):
        # combinations('ABCD', 2) --> AB AC AD BC BD CD
        # combinations(range(4), 3) --> 012 013 023 123
        pool = tuple(iterable)
        n = len(pool)
        if r > n:
            # Not enough elements: no combinations at all.
            return
        # NOTE: `indices` must be a mutable list; this relies on Python 2's
        # range() returning a list (guarded by the version check above).
        indices = range(r)
        yield tuple(pool[i] for i in indices)
        while True:
            # Find the rightmost index that can still be advanced.
            for i in reversed(range(r)):
                if indices[i] != i + n - r:
                    break
            else:
                # Every index is at its maximum position: we are done.
                return
            indices[i] += 1
            # Reset all indices to the right of i to consecutive values.
            for j in range(i+1, r):
                indices[j] = indices[j-1] + 1
            yield tuple(pool[i] for i in indices)
else:
    from itertools import combinations
# Script (Python 2): scan one or more Theano compile directories and report
# how many key.pkl / mod.{cpp,cu} cache entries are duplicated.
# Usage: python <script> [compiledir ...]
# Without arguments, theano.config.compiledir is scanned.
import cPickle
import os, sys
import theano

# Set to True to print the unpickled key of every duplicated key.pkl.
DISPLAY_DUPLICATE_KEYS = False
# Set to True to print the keys of the most-duplicated mod.{cpp,cu}.
DISPLAY_MOST_FREQUENT_DUPLICATE_CCODE = False

# Collect the per-entry cache directories to inspect.
dirs = []
if len(sys.argv)>1:
    for compiledir in sys.argv[1:]:
        dirs.extend([os.path.join(compiledir,d) for d in os.listdir(compiledir)])
else:
    dirs = os.listdir(theano.config.compiledir)
    dirs = [os.path.join(theano.config.compiledir,d) for d in dirs]
keys = {} # key -> nb seen
mods = {}
for dir in dirs:
    key = None
    # Read the pickled key as raw bytes (only used for equality counting).
    try:
        f = open(os.path.join(dir, "key.pkl"))
        key = f.read()
        f.close()
        keys.setdefault(key, 0)
        keys[key]+=1
        del f
    except IOError:
        #print dir, "don't have a key.pkl file"
        pass
    # Read the generated C/CUDA module source and record which keys map to it.
    try:
        path = os.path.join(dir, "mod.cpp")
        if not os.path.exists(path):
            path = os.path.join(dir, "mod.cu")
        f = open(path)
        mod = f.read()
        f.close()
        mods.setdefault(mod, ())
        mods[mod]+=(key,)
        del mod
        del f
        del path
    except IOError:
        print dir, "don't have a mod.{cpp,cu} file"
        pass
if DISPLAY_DUPLICATE_KEYS:
    for k, v in keys.iteritems():
        if v > 1:
            print "Duplicate key (%i copies): %s" % (v, cPickle.loads(k))
# Histogram: how many keys were seen exactly N times.
nbs_keys = {} # nb seen -> how many keys
for val in keys.values():
    nbs_keys.setdefault(val, 0)
    nbs_keys[val]+=1
# Histogram for module sources, and the keys of duplicated modules.
nbs_mod = {} # nb seen -> how many keys
nbs_mod_to_key = {} #nb seen -> keys
more_then_one = 0
for mod,kk in mods.iteritems():
    val = len(kk)
    nbs_mod.setdefault(val, 0)
    nbs_mod[val]+=1
    if val>1:
        more_then_one += 1
    nbs_mod_to_key[val] = kk
if DISPLAY_MOST_FREQUENT_DUPLICATE_CCODE:
    m = max(nbs_mod.keys())
    print "The keys associated to the mod.{cpp,cu} with the most number of copy:"
    for kk in nbs_mod_to_key[m]:
        kk = cPickle.loads(kk)
        print kk
print "key.pkl histograph"
l = nbs_keys.items()
l.sort()
print l
print "mod.{cpp,cu} histogram"
l = nbs_mod.items()
l.sort()
print l
# Summary: total/unique module files, and how many are redundant copies.
total = sum([len(k) for k in mods.values()])
uniq = len(mods)
useless = total - uniq
print "mod.{cpp,cu} total:", total
print "mod.{cpp,cu} uniq:", uniq
print "mod.{cpp,cu} with more then 1 copy:", more_then_one
print "mod.{cpp,cu} useless:", useless, float(useless)/total*100,"%"
print "nb directory", len(dirs)
......@@ -17,19 +17,20 @@ This don't work with broadcast and non-contiguous memory as pycuda don't support
import numpy
import theano
import theano.tensor as T
from theano.gof import Op, Apply, local_optimizer, EquilibriumDB
from theano.sandbox.cuda import GpuElemwise, CudaNdarrayType, CudaNdarray
from theano.sandbox.cuda.basic_ops import as_cuda_ndarray_variable, gpu_contiguous, host_from_gpu
from theano.sandbox.cuda import GpuElemwise, CudaNdarrayType
from theano.sandbox.cuda.basic_ops import as_cuda_ndarray_variable, gpu_contiguous
from theano.sandbox.cuda.opt import gpu_seqopt
import pycuda_init
if not pycuda_init.pycuda_available:
raise Exception("No pycuda available. You can't load pycuda_example.py")
import pycuda
from pycuda.elementwise import ElementwiseKernel
from pycuda.compiler import SourceModule
from pycuda.gpuarray import splay
from pycuda.tools import VectorArg
import pycuda.autoinit
def theano_parse_c_arg(c_arg):
c_arg = c_arg.replace('npy_float32','float')
c_arg = c_arg.replace('npy_float64','double')
......
import os
import theano
import theano.sandbox.cuda as cuda
def select_gpu_from_theano():
    # Transfer the theano gpu binding to pycuda, for consistency
    device_map = {"cpu": "0",
                  "gpu0": "0",
                  "gpu1": "1",
                  "gpu2": "2",
                  "gpu3": "3"}
    if theano.config.device == 'gpu':
        # Plain 'gpu' means "whichever device Theano actually picked":
        # ask the cuda_ndarray module for the active device number.
        dev = str(cuda.cuda_ndarray.cuda_ndarray.active_device_number())
    else:
        # Otherwise map the explicit device name; default to "0".
        dev = device_map.get(theano.config.device, "0")
    os.environ["CUDA_DEVICE"] = dev
# Bind pycuda to the same device Theano uses.  This must run before the
# `import pycuda.autoinit` below, which reads the CUDA_DEVICE variable.
select_gpu_from_theano()

# Flag checked by modules that optionally depend on pycuda.
pycuda_available = False
try:
    import pycuda
    import pycuda.autoinit
    pycuda_available = True
except ImportError:
    # presumably, the user wanted to use pycuda, else they wouldn't have
    # imported this module, so issue a warning that the import failed.
    import warnings
    warnings.warn("PyCUDA import failed in theano.misc.pycuda_init")
import numpy
try:
import pycuda
except ImportError:
import theano
import theano.misc.pycuda_init
if not theano.misc.pycuda_init.pycuda_available:
from nose.plugins.skip import SkipTest
raise SkipTest("Pycuda not installed. Skip test of theano op with pycuda code.")
......@@ -14,10 +15,6 @@ if cuda_ndarray.cuda_available == False:
import theano
import theano.tensor as T
from theano.misc.pycuda_example import PycudaElemwiseSourceModuleOp, PycudaElemwiseKernelOp
from theano.sandbox.cuda import GpuContiguous
import theano.misc.pycuda_example
import theano.sandbox.cuda as cuda_ndarray
if theano.config.mode=='FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
......@@ -37,8 +34,8 @@ def test_pycuda_elemwise_source_module():
assert any([ isinstance(node.op, theano.sandbox.cuda.GpuElemwise) for node in f.maker.env.toposort()])
assert any([ isinstance(node.op, PycudaElemwiseSourceModuleOp) for node in f2.maker.env.toposort()])
val1 = numpy.random.rand(5,5)
val2 = numpy.random.rand(5,5)
val1 = numpy.asarray(numpy.random.rand(5,5), dtype='float32')
val2 = numpy.asarray(numpy.random.rand(5,5), dtype='float32')
#val1 = numpy.ones((5,5))
#val2 = numpy.arange(25).reshape(5,5)
assert (f(val1,val2) == f2(val1,val2)).all()
......
"""
This file is an example of view the memory allocated by pycuda in a GpuArray
in a CudaNdarray to be able to use it in Theano.
This also serve as a test for the function: cuda_ndarray.from_gpu_pointer
"""
import sys
import numpy
import theano
import theano.sandbox.cuda as cuda_ndarray
import theano.misc.pycuda_init
if not theano.misc.pycuda_init.pycuda_available:
from nose.plugins.skip import SkipTest
raise SkipTest("Pycuda not installed. Skip test of theano op with pycuda code.")
if cuda_ndarray.cuda_available == False:
from nose.plugins.skip import SkipTest
raise SkipTest('Optional package cuda disabled')
import pycuda
import pycuda.driver as drv
import pycuda.gpuarray
def test_pycuda_simple():
    """Sanity-check that pycuda works: elementwise multiply on the GPU."""
    # NOTE(review): this call looks unused, but it presumably forces
    # Theano's CUDA context to initialize before pycuda is used — kept.
    x = cuda_ndarray.CudaNdarray.zeros((5,5))
    from pycuda.compiler import SourceModule
    mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
  const int i = threadIdx.x;
  dest[i] = a[i] * b[i];
}
""")
    multiply_them = mod.get_function("multiply_them")

    n = 100
    a = numpy.random.randn(n).astype(numpy.float32)
    b = numpy.random.randn(n).astype(numpy.float32)
    dest = numpy.zeros_like(a)
    # The kernel indexes only by threadIdx.x, so the block size must equal
    # the array length.  (Bug fix: the original launched block=(400,1,1)
    # over 100-element arrays, making threads 100-399 read and write out
    # of bounds.)
    multiply_them(
            drv.Out(dest), drv.In(a), drv.In(b),
            block=(n,1,1), grid=(1,1))
    assert (dest==a*b).all()
def test_pycuda_memory_to_theano():
    #Test that we can use the GpuArray memory space in pycuda in a CudaNdarray
    y = pycuda.gpuarray.zeros((3,4,5), 'float32')
    print numpy.asarray(y)
    print "gpuarray ref count before creating a CudaNdarray", sys.getrefcount(y)
    # Only `y` and getrefcount's own argument reference the gpuarray so far.
    assert sys.getrefcount(y)==2
    rand = numpy.random.randn(*y.shape).astype(numpy.float32)
    cuda_rand = cuda_ndarray.CudaNdarray(rand)
    # Compute C-contiguous strides (in elements) for y's shape.
    strides = [1]
    for i in y.shape[::-1][:-1]:
        strides.append(strides[-1]*i)
    strides = tuple(strides[::-1])
    print 'strides', strides
    assert cuda_rand._strides == strides, (cuda_rand._strides, strides)
    # Wrap the pycuda-allocated memory in a CudaNdarray view; `y` is passed
    # as the base object so the memory is kept alive by the view.
    y_ptr = int(y.gpudata) # in pycuda trunk, y.ptr also works, which is a little cleaner
    z = cuda_ndarray.from_gpu_pointer(y_ptr, y.shape, strides, y)
    print "gpuarray ref count after creating a CudaNdarray", sys.getrefcount(y)
    # The CudaNdarray view must hold exactly one extra reference to `y`.
    assert sys.getrefcount(y)==3
    assert (numpy.asarray(z) == 0).all()
    # In-place updates through the view must be visible in the shared memory.
    cuda_ones = cuda_ndarray.CudaNdarray(numpy.asarray([[[1]]],dtype='float32'))
    z += cuda_ones
    assert (numpy.asarray(z) == numpy.ones(y.shape)).all()
    assert (numpy.asarray(z) == 1).all()
    assert cuda_rand.shape == z.shape
    assert cuda_rand._strides == z._strides, (cuda_rand._strides, z._strides)
    assert (numpy.asarray(cuda_rand) == rand).all()
    z += cuda_rand
    assert (numpy.asarray(z)==(rand+1)).all()
    # Check that the ref count to the gpuarray is right.
    del z
    print "gpuarray ref count after deleting the CudaNdarray", sys.getrefcount(y)
    assert sys.getrefcount(y)==2
......@@ -57,6 +57,9 @@ def debugprint(obj, depth=-1, print_type=False, file=None):
order = obj.maker.env.toposort()
elif isinstance(obj, (list, tuple)):
results_to_print.extend(obj)
elif isinstance(obj, gof.Env):
results_to_print.extend(obj.outputs)
order = obj.toposort()
else:
raise TypeError("debugprint cannot print an object of this type", obj)
for r in results_to_print:
......@@ -611,11 +614,18 @@ def pydotprint(fct, outfile=None,
def pydotprint_variables(vars,
outfile=os.path.join(config.compiledir,'theano.pydotprint.png'),
outfile=None,
format='png',
depth = -1,
high_contrast = True):
high_contrast = True, colorCodes = None):
''' Identical to pydotprint just that it starts from a variable instead
of a compiled function. Could be useful ? '''
if colorCodes is None:
colorCodes = default_colorCodes
if outfile is None:
outfile = os.path.join(config.compiledir,'theano.pydotprint.' +
config.device + '.' + format)
try:
import pydot as pd
except:
......
......@@ -156,10 +156,12 @@ def use(device, force=False, default_to_move_computation_to_gpu = True,
raise EnvironmentError("You forced use of device %s, but CUDA initialization failed "
"with error:\n%s" % (device, cuda_initialization_error_message))
if not cuda_available:
if cuda_initialization_error_message:
error_addendum = " (error: %s)" % cuda_initialization_error_message
else:
error_addendum = ""
error_addendum = ""
try:
if cuda_initialization_error_message:
error_addendum = " (error: %s)" % cuda_initialization_error_message
except NameError: # cuda_initialization_error_message is not available b/c compilation failed
pass
warning('CUDA is installed, but device %s is not available%s' % (device, error_addendum))
return
......
......@@ -1767,15 +1767,9 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1):
return Apply(self, [x_, y_, ilist_], [x_.type()])
def perform_(self, node, inp, out_):
# This don't work as CudaNdarray_Subscript() don't support it.
#super(GpuAdvancedSubtensor1, self).perform(node, inp, out_)
x, idx = inp
out, = out_
o = cuda_ndarray.cuda_ndarray.CudaNdarray.zeros((len(idx),)+x.shape[1:])
for (j,i) in enumerate(idx):
o[j] = x[i]
out[0] = o
#def perform(self, node, inp, out_):
# CudaNdarray_Subscript() don't support Advanced slicing.
# so we use the parent version that loop on each indices.
class GpuIncSubtensor(tensor.IncSubtensor):
def make_node(self, x, y, *inputs):
......
......@@ -630,7 +630,7 @@ PyObject * CudaNdarray_Reshape(CudaNdarray * self, PyObject * shape)
// calculate new size, assert same as old size
if (rval_size != CudaNdarray_SIZE(self))
{
PyErr_SetString(PyExc_ValueError, "size must remain unchanged");
PyErr_Format(PyExc_ValueError, "size must remain unchanged, changed from %i to %i", CudaNdarray_SIZE(self), rval_size);
free(rval_dims);
return NULL;
}
......@@ -2010,6 +2010,100 @@ CudaNdarray_gpu_shutdown(PyObject* _unused, PyObject* _unused_args) {
return Py_None;
}
/*
 * Create a CudaNdarray that is a view of GPU memory allocated by another
 * library (e.g. pycuda).  `base` is stored in the returned object so the
 * underlying memory stays alive as long as the view exists.
 *
 * This function is tested in theano/misc/test_pycuda_theano_simple.py
 */
PyObject *
CudaNdarray_from_gpu_pointer(PyObject* _unused, PyObject* args)
{
    PyObject *gpu_ptr = NULL;
    PyObject *shapes = NULL;
    PyObject *strides = NULL;
    PyObject *base = NULL;
    PyObject *rval = NULL;

    //args should consist of 4 python objects
    //The first is the gpu ptr
    //The second is the shape
    //The third is the strides
    //The fourth is the base object that owns the memory
    if (! PyArg_ParseTuple(args, "OOOO", &gpu_ptr, &shapes, &strides, &base))
        return NULL;

    if (!PyLong_Check(gpu_ptr))
    {
        PyErr_Format(PyExc_Exception, "CudaNdarray_from_gpu_pointer: The gpu pointer is not a long");
        return NULL;
    }

    Py_ssize_t nd = PyObject_Length(shapes);
    if (nd < 0)
    {
        PyErr_SetString(PyExc_TypeError, "CudaNdarray_from_gpu_pointer: Couldn't get length of second argument");
        return NULL;
    }
    Py_ssize_t nd_stride = PyObject_Length(strides);
    if (nd_stride < 0)
    {
        PyErr_SetString(PyExc_TypeError, "CudaNdarray_from_gpu_pointer: Couldn't get length of third argument");
        return NULL;
    }
    if (nd != nd_stride)
    {
        PyErr_SetString(PyExc_TypeError, "CudaNdarray_from_gpu_pointer: We need the same number of shapes and strides");
        return NULL;
    }

    rval = CudaNdarray_new_null();
    if (!rval)
        // CudaNdarray_new_null set the error msg
        return NULL;
    if (CudaNdarray_set_nd((CudaNdarray *)rval, nd))
    {
        //CudaNdarray_set_nd set the error msg
        Py_DECREF(rval);
        return NULL;
    }

    // Set gpu pointer; `base` is INCREF'd by set_device_data and keeps
    // the memory alive.
    assert(((CudaNdarray *)rval)->data_allocated == 0);
    if (CudaNdarray_set_device_data((CudaNdarray *)rval, (float *)PyInt_AsLong(gpu_ptr), base))
    {
        PyErr_SetString(PyExc_TypeError, "CudaNdarray_from_gpu_pointer: Error while setting the gpu pointer");
        Py_DECREF(rval);
        return NULL;
    }

    // Set dims and strides.  All temporaries are released on every path
    // (the original leaked idx/dim_/strd_/rval on error).
    for (int i = nd-1; i >= 0; --i)
    {
        PyObject * idx = PyLong_FromLong(i);
        if (idx == NULL)
        {
            PyErr_SetString(PyExc_Exception, "CudaNdarray_from_gpu_pointer: Couldn't make long object to loop over list/tuple");
            Py_DECREF(rval);
            return NULL;
        }
        PyObject* dim_ = PyObject_GetItem(shapes, idx);
        PyObject* strd_ = PyObject_GetItem(strides, idx);
        Py_DECREF(idx);
        if (dim_ == NULL || !PyInt_Check(dim_))
        {
            PyErr_Format(PyExc_Exception, "CudaNdarray_from_gpu_pointer: shapes[%d] is not an int", i);
            Py_XDECREF(dim_);
            Py_XDECREF(strd_);
            Py_DECREF(rval);
            return NULL;
        }
        if (strd_ == NULL || !PyInt_Check(strd_))
        {
            PyErr_Format(PyExc_Exception, "CudaNdarray_from_gpu_pointer: strides[%d] is not an int", i);
            Py_DECREF(dim_);
            Py_XDECREF(strd_);
            Py_DECREF(rval);
            return NULL;
        }
        int dim = PyInt_AsLong(dim_);
        int strd = PyInt_AsLong(strd_);
        CudaNdarray_set_stride((CudaNdarray *)rval, i, strd);
        CudaNdarray_set_dim((CudaNdarray *)rval, i, dim);
        Py_DECREF(dim_);
        Py_DECREF(strd_);
    }
    return rval;
}
PyObject *
CudaNdarray_Dot(PyObject* _unused, PyObject* args)
{
......@@ -2175,6 +2269,7 @@ static PyMethodDef module_methods[] = {
{"ptr_int_size", CudaNdarray_ptr_int_size, METH_VARARGS, "Return a tuple with the size of gpu pointer, cpu pointer and int in bytes."},
{"filter", filter, METH_VARARGS, "filter(obj, broadcastable, strict, storage) returns a CudaNdarray initialized to obj if it matches the constraints of broadcastable. strict=True prevents any numeric casting. If storage is a CudaNdarray it may be overwritten and used as the return value."},
{"outstanding_mallocs", outstanding_mallocs, METH_VARARGS, "how many more mallocs have been called than free's"},
{"from_gpu_pointer", CudaNdarray_from_gpu_pointer, METH_VARARGS, "Used to create a CudaNdarray from already allocated memory on the gpu.(example by pycuda)"},
{NULL, NULL, NULL, NULL} /* Sentinel */
};
......@@ -2367,7 +2462,7 @@ CudaNdarray_new_nd(int nd)
return (PyObject *) rval;
}
int CudaNdarray_set_device_data(CudaNdarray * self, float * data, CudaNdarray * base)
int CudaNdarray_set_device_data(CudaNdarray * self, float * data, PyObject * base)
{
if (self->data_allocated)
{
......@@ -2380,10 +2475,10 @@ int CudaNdarray_set_device_data(CudaNdarray * self, float * data, CudaNdarray *
}
}
//N.B. XDECREF and XINCREF are no-ops for NULL pointers
if (self->base != (PyObject*)base)
if (self->base != base)
{
Py_XDECREF(self->base);
self->base = (PyObject*)base;
self->base = base;
Py_XINCREF(self->base);
}
self->data_allocated = 0;
......@@ -2982,18 +3077,20 @@ CudaNdarray_dimshuffle(CudaNdarray * self, unsigned int len, const int * pattern
}
else if(dims_taken[pattern[i]])
{
PyErr_SetString(PyExc_ValueError, "Cudandarray_dimshuffle: The same input dimension may not appear twice in the list of output dimensions");
PyErr_Format(PyExc_ValueError, "Cudandarray_dimshuffle: invalid pattern for Cudandarray_dimshuffle. You used the dimensions %d multiple time",
pattern[i]);
free(newdims);
return -1;
}
else
{
if ((dims_taken[pattern[i]]) || (pattern[i]>= self->nd))
{
PyErr_SetString(PyExc_ValueError, "Cudandarray_dimshuffle: invalid pattern for Cudandarray_dimshuffle");
free(newdims);
return -1;
}
else if (pattern[i]>= self->nd)
{
PyErr_Format(PyExc_ValueError, "Cudandarray_dimshuffle: invalid pattern for Cudandarray_dimshuffle. You asked for a dimensions that don't exist %d for a %d dims CudaNdarray",
pattern[i], self->nd);
free(newdims);
return -1;
}
else
{
newdims[i] = CudaNdarray_HOST_DIMS(self)[pattern[i]];
newstrides[i] = CudaNdarray_HOST_STRIDES(self)[pattern[i]];
dims_taken[pattern[i]] = 1;
......
......@@ -438,7 +438,11 @@ CudaNdarray_NewDims(int nd, const inttype * dims)
*
* Set self to be a view of given `data`, owned by existing CudaNdarray `base`.
*/
int CudaNdarray_set_device_data(CudaNdarray * self, float * data, CudaNdarray * base);
int CudaNdarray_set_device_data(CudaNdarray * self, float * data, PyObject * base);
int CudaNdarray_set_device_data(CudaNdarray * self, float * data, CudaNdarray * base)
{
return CudaNdarray_set_device_data(self, data, (PyObject *) base);
}
/**
* Return an independent copy of self
......
......@@ -778,8 +778,8 @@ switch = Switch()
class UnaryBitOp(UnaryScalarOp):
    """Base class for elementwise unary bitwise operations on signed ints."""

    def output_types(self, *input_types):
        for i in input_types[0]:
            # Accept every signed integer scalar type, including int16.
            # (A leftover duplicate of this check without int16 used to run
            # first and wrongly reject int16 inputs; it was removed.)
            if i not in (int8, int16, int32, int64):
                raise TypeError('input to a BitOp must have type int8, int16, int32 or int64... not %s' % i)
        return upcast_out(*input_types[0])

    def grad(self, inputs, output_gradients):
        # Bitwise operations are not differentiable.
        return [None]
......@@ -788,8 +788,8 @@ class BinaryBitOp(BinaryScalarOp):
def output_types(self, *input_types):
    """Validate the two input scalar types and return the upcast output type."""
    # The unpacking also asserts there are exactly two input types.
    t0, t1 = input_types[0]
    for i in input_types[0]:
        # Accept every signed integer scalar type, including int16.
        # (A leftover duplicate of this check without int16 used to run
        # first and wrongly reject int16 inputs; it was removed.)
        if i not in (int8, int16, int32, int64):
            raise TypeError('input to a BitOp must have type int8, int16, int32 or int64... not %s' % i)
    return upcast_out(*input_types[0])

def grad(self, inputs, output_gradients):
    # Bitwise operations are not differentiable.
    return [None, None]
......@@ -800,6 +800,8 @@ class OR(BinaryBitOp):
associative = False
def impl(self, x, y):
return x | y
def c_code(self, node, name, (x, y), (z, ), sub):
return "%(z)s = (%(x)s | %(y)s);" % locals()
or_ = OR()
class XOR(BinaryBitOp):
......@@ -808,6 +810,8 @@ class XOR(BinaryBitOp):
associative = False
def impl(self, x, y):
return x ^ y
def c_code(self, node, name, (x, y), (z, ), sub):
return "%(z)s = (%(x)s ^ %(y)s);" % locals()
xor = XOR()
class AND(BinaryBitOp):
......@@ -816,12 +820,16 @@ class AND(BinaryBitOp):
associative = False
def impl(self, x, y):
return x & y
def c_code(self, node, name, (x, y), (z, ), sub):
return "%(z)s = (%(x)s & %(y)s);" % locals()
and_ = AND()
class Invert(UnaryBitOp):
    # Elementwise bitwise NOT (~x).
    identity = False
    def impl(self, x):
        # Python-side implementation.
        return ~x
    def c_code(self, node, name, (x,), (z, ), sub):
        # C implementation.  Uses Python 2 tuple-parameter syntax,
        # matching the other BitOp c_code methods in this file.
        return "%(z)s = (~%(x)s);" % locals()
invert = Invert()
......
......@@ -650,7 +650,9 @@ class AddSD(gof.op.Op):
broadcastable = y.type.broadcastable).make_variable()])
def perform(self, node, (x, y), (out, )):
    # Add a sparse matrix (x) and a dense array (y); result is dense.
    assert _is_sparse(x) and _is_dense(y)
    # The asarray is needed as in some cases this returns a
    # numpy.matrixlib.defmatrix.matrix object and not an ndarray.
    out[0] = theano._asarray(x + y, dtype=node.outputs[0].type.dtype)
def grad(self, (x, y), (gz,)):
assert _is_sparse_variable(x) and _is_dense_variable(y)
assert _is_dense_variable(gz)
......
......@@ -1103,14 +1103,6 @@ def local_argmax_pushdown(node):
def _check_rows_is_arange_len_labels(rows, labels):
'''Check that 'rows' is the same node as T.arange(labels.shape[0])'''
# this is admittedly a pretty random thing to have here... but it's not wrong (I think)
# and it has the effect of making the advanced_indexing -> crossentropy optimization work
# in the case where the labels are float32s casted to integers. "Why would anyone do that?"
# you ask... it is a handy trick for storing labels on a pre-FERMI GPU device so that
# logistic regression goes faster.
if labels.owner and labels.owner.op == tensor._convert_to_int32:
labels = labels.owner.inputs[0]
if rows.owner and isinstance(rows.owner.op, tensor.ARange):
start, stop, step = rows.owner.inputs
if getattr(start, 'data', None) != 0: #constants will have data
......@@ -1119,11 +1111,12 @@ def _check_rows_is_arange_len_labels(rows, labels):
return False
if not stop.owner:
return False
# Not sure if that case happens any more after the introduction
# of ShapeOptimizer
# Not sure if that case happens any more after the introduction of
# ShapeOptimizer, but we keep it if ShapeOptimizer is not present
if isinstance(stop.owner.op, tensor.Subtensor):
shape_subtensor = stop.owner
if shape_subtensor.op.idx_list == [0]:
if list(shape_subtensor.op.idx_list) == [0]:
shape_var, = shape_subtensor.inputs
if shape_var.owner and shape_var.owner.op == tensor._shape:
return shape_var.owner.inputs[0] is labels
......
......@@ -25,7 +25,7 @@ import basic as T
from theano import compile #to register the optimizer built by this file
from theano.gof.python25 import any, all
from theano.gof.opt import Optimizer
from theano.gof.opt import Optimizer, pre_constant_merge, pre_greedy_local_optimizer
from theano.gof import toolbox, DestroyHandler
from basic import get_constant_value
......@@ -602,13 +602,66 @@ class ShapeFeature(object):
s_i, type(s_i), getattr(s_i, 'type', None))
def set_shape(self, r, s):
    '''Register the shape of variable `r` in the shape_of dictionary.

    `s` is either None (shape unknown) or an iterable of symbolic shape
    entries; each entry is normalized through `self.unpack`.
    `r` must not already have a registered shape.
    '''
    # The merge artifact that duplicated this assertion was removed;
    # the version carrying an error message is kept.
    assert r not in self.shape_of, 'r already in shape_of'
    if s is None:
        self.shape_of[r] = s
    else:
        self.shape_of[r] = tuple([self.unpack(s_i) for s_i in s])
def init_r(self,r):
def update_shape(self, r, other_r):
    '''Replace shape of r by shape of other_r.

    If, on some dimensions, the shape of other_r is not informative,
    keep the shape of r on those dimensions.
    '''
    # other_r should already have a shape
    assert other_r in self.shape_of, ('other_r not in shape_of', other_r)
    other_shape = self.shape_of[other_r]

    if r not in self.shape_of:
        # If no info is known on r's shape, use other_shape
        self.shape_of[r] = other_shape
        return
    r_shape = self.shape_of[r]

    if other_shape is None:
        # If other_shape has no information, use r_shape
        self.shape_of[r] = r_shape
        return

    def informative(ps, i):
        # A shape entry is uninformative when it is just Shape_i(i)
        # applied to r or other_r itself.
        return not (ps.owner and
                    isinstance(getattr(ps.owner, 'op', None), Shape_i) and
                    ps.owner.op.i == i and
                    ps.owner.inputs[0] in (r, other_r))

    # Merge other_shape with r_shape, giving the priority to other_shape
    self.shape_of[r] = tuple(
        ps if informative(ps, i) else r_shape[i]
        for i, ps in enumerate(other_shape))
def set_shape_i(self, r, i, s_i):
    '''Replace element i of shape_of[r] by s_i'''
    assert r in self.shape_of
    prev_shape = self.shape_of[r]
    # The stored shape is a tuple (immutable), so rebuild it with the
    # i-th entry replaced by the unpacked new value.
    self.shape_of[r] = tuple(
        self.unpack(s_i) if j == i else s_j
        for j, s_j in enumerate(prev_shape))
def init_r(self, r):
'''Register r's shape in the shape_of dictionary.'''
if r not in self.shape_of:
try:
self.set_shape(r, self.shape_tuple(r))
......@@ -619,7 +672,7 @@ class ShapeFeature(object):
return make_vector(*self.shape_of[r])
#
#
# Feature inteface
# Feature interface
#
#
def on_attach(self, env):
......@@ -669,10 +722,10 @@ class ShapeFeature(object):
self.set_shape(r, s)
def on_change_input(self, env, node, i, r, new_r):
# TODO:
# This tells us that r and new_r must have the same shape
# if we didn't know that the shapes are related, now we do.
self.init_r(new_r)
self.update_shape(new_r, r)
# change_input happens in two cases:
# 1) we are trying to get rid of r, or
# 2) we are putting things back after a failed transaction.
......@@ -690,6 +743,15 @@ class ShapeFeature(object):
if v == r:
del self.scheduled[k]
# In either case, r could be in shape_of.values(), that is, r itself
# is the shape of something. In that case, we want to update
# the value in shape_of, to keep it up-to-date.
for k,v in self.shape_of.iteritems():
if v is not None:
for ii, vi in enumerate(v):
if vi == r:
self.set_shape_i(k, ii, new_r)
class ShapeOptimizer(Optimizer):
"""Optimizer that serves to add ShapeFeature as an env feature.
"""
......@@ -1125,8 +1187,6 @@ def local_useless_subtensor(node):
node_input_idx += sum([isinstance(idx.start, theano.scalar.Scalar),
isinstance(idx.stop, theano.scalar.Scalar),
isinstance(idx.step, theano.scalar.Scalar)])
if isinstance(idx, theano.scalar.Scalar):
node_input_idx += 1
return [node.inputs[0]]
......@@ -1171,6 +1231,7 @@ def local_subtensor_lift(node):
new_inputs.append(i.dimshuffle(['x']*node.outputs[0].ndim))
return [u.owner.op(*new_inputs)]
def merge_two_slices(slice1, len1, slice2, len2):
'''
This function merges two slices into a single slice. The code works on
......@@ -1186,18 +1247,7 @@ def merge_two_slices(slice1, len1, slice2, len2):
``len1`` is the length of the tensor **before** applying the first slice,
while ``len2`` is the length **after** applying the first slice.
'''
def const_fold(n):
while True:
ret = constant_folding.transform(n)
if ret is not False and ret is not None:
#print n,ret
assert len(ret)==len(n.outputs)
assert len(ret)==1
n = ret[0].owner
else: break
return n.outputs
list_opt = [ local_abs_merge, local_mul_switch_sink, local_upcast_elemwise_constant_inputs, local_remove_switch_const_cond, constant_folding ]
if type(slice1) is not slice:
......@@ -1250,38 +1300,65 @@ def merge_two_slices(slice1, len1, slice2, len2):
# according to the two steps we have 4 different combinations of
# positive/negative. I will denote the case I'm looking at by
# suffixes to the variables (nn,np,pn,pp):
pp_start = sl1.start + sl2.start * sl1.step
pp_stop = sl1.start + sl2.stop * sl1.step
pp_step = sl1.step * sl2.step
flen = sl2.stop - sl2.start
p_step = sl1.step * sl2.step
n_step = sl1.step * sl2.step * -1
pp_start = T.minimum(sl1.start + sl2.start * sl1.step, sl1.stop)
pp_stop = T.minimum(sl1.start + sl2.stop * sl1.step, sl1.stop)
pn_stop = sl1.start + (sl2.start -1) * sl1.step
pn_stop = T.switch(T.and_(T.lt(pn_stop,0)
, T.gt(flen,0))
, -len1 -1
, T.minimum(pn_stop, sl1.stop))
pn_start = sl1.start + (sl2.stop -1) * sl1.step
pn_start = T.minimum( pn_start, sl1.stop )
pn_start = T.maximum( pn_start, 0 )
pn_stop = sl1.start + sl2.start * sl1.step
pn_start = sl1.start + sl2.stop * sl1.step
pn_step = sl1.step * sl2.step * -1
pn_stop = T.switch(T.eq(pn_stop,-1), -len1 -1, pn_stop)
np_stop = sl1.stop - sl2.stop * sl1.step -1
np_start = sl1.stop - sl2.start * sl1.step -1
np_step = sl1.step * sl2.step * -1
np_stop = T.switch(T.eq(np_stop,-1), -len1 -1, np_stop)
np_stop = T.switch(T.and_(T.lt(np_stop,0)
, T.gt(flen,0))
,-len1-1
, T.maximum(sl1.start-1, np_stop))
np_start = T.maximum(sl1.start,sl1.stop - sl2.start * sl1.step -1)
nn_start = T.maximum(sl1.start,(sl1.stop -1)- (sl2.stop-1) * sl1.step)
nn_stop = T.maximum(sl1.start,sl1.stop - sl2.start * sl1.step)
nn_start = sl1.stop - sl2.start * sl1.step
nn_stop = sl1.stop - sl2.stop * sl1.step
nn_step = sl1.step * sl2.step
start = const_fold(T.switch(T.lt(reverse2*reverse1,0),
start = T.switch(T.lt(reverse2*reverse1,0),
T.switch(T.lt(reverse1,0), np_start, pn_start),
T.switch(T.lt(reverse1,0), nn_start,
pp_start)).owner)[0]
pp_start))
stop = const_fold(T.switch(T.lt(reverse2*reverse1,0),
stop = T.switch(T.lt(reverse2*reverse1,0),
T.switch(T.lt(reverse1,0), np_stop , pn_stop ),
T.switch(T.lt(reverse1,0), nn_stop , pp_stop
)).owner)[0]
))
step = T.switch( T.lt(reverse2*reverse1,0),n_step, p_step)
start = T.switch(T.le(flen,0), 0, start)
stop = T.switch(T.le(flen,0), 0, stop)
# The canonical form of the slice is pretty complicated
# and is not simplified. We simplify it in advance here
# as otherwise this create too many useless optimization that
# DebugMode must check.
start = pre_greedy_local_optimizer( list_opt, start)
stop = pre_greedy_local_optimizer( list_opt, stop)
step = pre_greedy_local_optimizer( list_opt, step)
start = pre_greedy_local_optimizer( list_opt, start)
stop = pre_greedy_local_optimizer( list_opt, stop)
step = pre_greedy_local_optimizer( list_opt, step)
#Pre merge constant for the same reason.
start, stop, step = pre_constant_merge([start, stop, step])
step = const_fold( T.switch(T.lt(reverse2*reverse1,0),
T.switch(T.lt(reverse1,0), np_step , pn_step ),
T.switch(T.lt(reverse1,0), nn_step , pp_step
)).owner)[0]
return slice(start, stop, step)
@register_canonicalize
......
......@@ -302,10 +302,11 @@ def test_mlp():
x:train_set_x[index*batch_size:(index+1)*batch_size],
y:train_set_y[index*batch_size:(index+1)*batch_size]},
mode=mode)
for i in train_model.maker.env.toposort(): print i
#theano.printing.pydotprint(train_model)
print 'MODEL 1'
theano.printing.debugprint(train_model, print_type=True)
assert any([isinstance(i.op,T.nnet.CrossentropySoftmax1HotWithBiasDx) for i in train_model.maker.env.toposort()])
assert any( [isinstance(i.op,T.nnet.CrossentropySoftmax1HotWithBiasDx) for i in train_model.maker.env.toposort()])
# Now, this case works, too!
train_model =theano.function( inputs = [index],
updates = updates2,
mode=mode.excluding('local_track_shape_i'),
......@@ -313,9 +314,21 @@ def test_mlp():
x:train_set_x[index*batch_size:(index+1)*batch_size],
y:train_set_y[index*batch_size:(index+1)*batch_size]})
print
for i in train_model.maker.env.toposort(): print i
print 'MODEL 2'
theano.printing.debugprint(train_model, print_type=True)
assert any([isinstance(i.op,T.nnet.CrossentropySoftmax1HotWithBiasDx) for i in train_model.maker.env.toposort()])
assert not any( [isinstance(i.op,T.nnet.CrossentropySoftmax1HotWithBiasDx) for i in train_model.maker.env.toposort()])
# Even without FeatureShape
train_model =theano.function( inputs = [index],
updates = updates2,
mode=mode.excluding('local_shape_to_shape_i'),
givens={
x:train_set_x[index*batch_size:(index+1)*batch_size],
y:train_set_y[index*batch_size:(index+1)*batch_size]})
print
print 'MODEL 3'
theano.printing.debugprint(train_model, print_type=True)
assert any([isinstance(i.op,T.nnet.CrossentropySoftmax1HotWithBiasDx) for i in train_model.maker.env.toposort()])
if __name__ == '__main__':
test_mlp()
......@@ -765,17 +765,26 @@ class TestGemv(TestCase):
def test_gemv_dimensions(self):
    """Gemv graph: matching shapes run; mismatched vector lengths raise.

    Builds ``z = beta * y + alpha * dot(A, x)`` and checks that the
    compiled function accepts a consistent (5, 3) matrix / length-3 /
    length-5 input triple and raises ValueError for every inconsistent
    vector-length combination.
    """
    A = T.matrix('A')
    x, y = T.vectors('x', 'y')
    # Shared scalars must be floatX: a plain Python 1.0 would be
    # float64 and upcast the whole graph when floatX is float32.
    alpha = theano.shared(theano._asarray(1.0, dtype=config.floatX),
                          name='alpha')
    beta = theano.shared(theano._asarray(1.0, dtype=config.floatX),
                         name='beta')
    z = beta * y + alpha * T.dot(A, x)
    f = theano.function([A, x, y], z)

    # Matrix value
    A_val = numpy.ones((5, 3), dtype=config.floatX)
    # Vector values: only the (3, 5) pair is consistent with A_val.
    ones_3 = numpy.ones(3, dtype=config.floatX)
    ones_4 = numpy.ones(4, dtype=config.floatX)
    ones_5 = numpy.ones(5, dtype=config.floatX)
    ones_6 = numpy.ones(6, dtype=config.floatX)
    f(A_val, ones_3, ones_5)
    # Different vector length
    self.assertRaises(ValueError, f, A_val, ones_4, ones_5)
    self.assertRaises(ValueError, f, A_val, ones_3, ones_6)
    self.assertRaises(ValueError, f, A_val, ones_4, ones_6)
# The following gemv tests were added in March 2011 by Ian Goodfellow
......
......@@ -1526,6 +1526,41 @@ class test_local_subtensor_merge(unittest.TestCase):
def test_scalar5(self):
    """Chained slicing with every bound/step symbolic: x[b1:e1:s1][b2:e2:s2].

    Verifies the two Subtensor ops are merged into a single one by the
    optimizer, then exercises the compiled function over a sample of
    positive and negative bounds and (nonzero) steps.
    """
    # var[int1:][:int2]
    x = TT.matrix('x')
    begin1 = TT.iscalar('b1')
    end1 = TT.iscalar('e1')
    step1 = TT.iscalar('s1')
    begin2 = TT.iscalar('b2')
    end2 = TT.iscalar('e2')
    step2 = TT.iscalar('s2')
    f = function([x, begin1, end1, step1, begin2, end2, step2],
                 x[begin1:end1:step1][begin2:end2:step2],
                 mode=mode_opt)
    #theano.printing.debugprint(f, print_type=True)
    topo = f.maker.env.toposort()
    # The two symbolic slices must have been fused into one Subtensor.
    subtensor_nodes = [node for node in topo
                       if isinstance(node.op, TT.Subtensor)]
    assert len(subtensor_nodes) == 1
    assert isinstance(topo[-1].op,
                      theano.compile.function_module.DeepCopyOp)

    # Two random values per slice parameter; steps exclude zero.
    begins1 = self.rng.permutation(range(-8, 8))[:2]
    ends1 = self.rng.permutation(range(-8, 8))[:2]
    begins2 = self.rng.permutation(range(-8, 8))[:2]
    ends2 = self.rng.permutation(range(-8, 8))[:2]
    steps1 = self.rng.permutation(
        [-7, -6, -5, -4, -3, -2, -1, 1, 2, 3, 4, 5, 6, 7])[:2]
    steps2 = self.rng.permutation(
        [-7, -6, -5, -4, -3, -2, -1, 1, 2, 3, 4, 5, 6, 7])[:2]
    for x_shape in self.x_shapes:
        x_val = self.rng.uniform(size=x_shape).astype(config.floatX)
        for vb1 in begins1:
            for ve1 in ends1:
                for vs1 in steps1:
                    for vb2 in begins2:
                        for ve2 in ends2:
                            for vs2 in steps2:
                                f(x_val, vb1, ve1, vs1,
                                  vb2, ve2, vs2)
def test_local_fill_useless():
......@@ -1635,7 +1670,7 @@ class test_shapeoptimizer(unittest.TestCase):
register_specialize(local_identity_noshape_to_identity_shape)
# With the optimization
# The identity_shape op is should not be needed anymore to compute
# The identity_shape op should not be needed anymore to compute
# the shape
g = theano.function([x], ins_x.shape, mode=mode)
xval = rng.randn(6,1,2).astype(config.floatX)
......@@ -1995,7 +2030,7 @@ class T_useless_elemwise(unittest.TestCase):
# tensor_copy, and view
x = T.matrix()
f = theano.function([x], T.tensor_copy(x), mode=self.mode)
vx = numpy.random.rand(5,4)
vx = numpy.random.rand(5,4).astype(config.floatX)
f(vx)
topo = f.maker.env.toposort()
assert len(topo) == 1
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论