Merged

6736be29 · Olivier Delalleau · 0832b113 · e4bb7837 · 6736be29 · 6736be29
--- a/theano/compile/mode.py
+++ b/theano/compile/mode.py
@@ -131,7 +131,7 @@ optdb.register('merge1', gof.MergeOptimizer(),
        0, 'fast_run', 'fast_compile')
 optdb.register('canonicalize', gof.EquilibriumDB(),         # rearranges elemwise expressions
        1, 'fast_run')
-optdb.register('merge1.2', gof.MergeOptimizer(skip_const_merge=True),
+optdb.register('merge1.2', gof.MergeOptimizer(skip_const_merge=False),
        1.2, 'fast_run', 'fast_compile')
 optdb.register('stabilize', gof.EquilibriumDB(),            # replace unstable subgraphs
        1.5, 'fast_run')          

--- a/theano/sandbox/cuda/cuda_ndarray.cu
+++ b/theano/sandbox/cuda/cuda_ndarray.cu
@@ -956,21 +956,26 @@ CudaNdarray_Subscript(PyObject * py_self, PyObject * key)
    CudaNdarray * self = (CudaNdarray*) py_self;
    PyObject * py_rval = NULL;
    CudaNdarray * rval = NULL;
+    PyObject * intobj = NULL;
+    //PyObject_Print(key, stderr, 0);
    if (key == Py_Ellipsis)
    {
        Py_INCREF(py_self);
        return py_self;
    }
-    else if (PyInt_Check(key)) //INDEXING BY INTEGER
+    if ((intobj=PyNumber_Int(key))) //INDEXING BY INTEGER
+    //else if (PyInt_Check(key)) //INDEXING BY INTEGER
    {
+        int d_idx = PyInt_AsLong(intobj);
+        Py_DECREF(intobj); intobj=NULL;
+        //int d_idx = PyInt_AsLong(key);
        if (self->nd == 0)
        {
            PyErr_SetString(PyExc_NotImplementedError, "index into 0-d array");
            return NULL;
        }
-        int d_idx = PyInt_AsLong(key);
        int d_dim = CudaNdarray_HOST_DIMS(self)[0];
        int offset = 0;
@@ -1009,7 +1014,11 @@ CudaNdarray_Subscript(PyObject * py_self, PyObject * key)
            CudaNdarray_set_dim(rval, d-1, CudaNdarray_HOST_DIMS(self)[d]);
        }
    }
-    else if (PySlice_Check(key)) //INDEXING BY SLICE
+    else
+    {
+        PyErr_Clear();
+    }
+    if (PySlice_Check(key)) //INDEXING BY SLICE
    {
        if (self->nd == 0)
        {
@@ -1057,7 +1066,7 @@ CudaNdarray_Subscript(PyObject * py_self, PyObject * key)
            CudaNdarray_set_dim(rval, d, CudaNdarray_HOST_DIMS(self)[d]);
        }
    }
-    else if (PyTuple_Check(key)) //INDEXING BY TUPLE
+    if (PyTuple_Check(key)) //INDEXING BY TUPLE
    {
        //elements of the tuple can be either integers or slices
        //the dimensionality of the view we will return is diminished for each slice in the tuple
@@ -1127,9 +1136,11 @@ CudaNdarray_Subscript(PyObject * py_self, PyObject * key)
                    }
                    ++rval_d;
                }
-                else if (PyInt_Check(key_d))
+                else if ((intobj=PyNumber_Int(key_d)))
                {
-                    int d_idx = PyInt_AsLong(key_d);
+                    int d_idx = PyInt_AsLong(intobj);
+                    Py_DECREF(intobj);
+                    intobj = NULL;
                    int d_dim = CudaNdarray_HOST_DIMS(self)[d];
                    if ((d_idx >= 0) && (d_idx < d_dim))
@@ -1151,6 +1162,7 @@ CudaNdarray_Subscript(PyObject * py_self, PyObject * key)
                }
                else
                {
+                    PyErr_Clear(); // clear the error set by PyNumber_Int
                    PyErr_SetString(PyExc_IndexError, "index must be either int or slice");
                    Py_DECREF(rval);
                    return NULL;
@@ -1158,16 +1170,16 @@ CudaNdarray_Subscript(PyObject * py_self, PyObject * key)
            }
        }
    }
-    else
-    {
-        PyErr_SetString(PyExc_NotImplementedError, "Unknown key type");
-        return NULL;
-    }
    if (py_rval)
    {
        if (verbose) fprint_CudaNdarray(stderr, self);
        if (verbose) fprint_CudaNdarray(stderr, rval);
    }
+    else
+    {
+        PyErr_SetString(PyExc_NotImplementedError, "Unknown key type");
+        return NULL;
+    }
    return py_rval;
 }
@@ -1776,6 +1788,10 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other)
        }
        size *= (unsigned int) CudaNdarray_HOST_DIMS(self)[i];
    }
+    if (0 == size)
+    {
+        return 0; //nothing to copy, we're done.
+    }
    if (CudaNdarray_is_c_contiguous(self) && CudaNdarray_is_c_contiguous(other))
    {
        cublasScopy(size, CudaNdarray_DEV_DATA(other), 1, CudaNdarray_DEV_DATA(self), 1);

--- a/theano/sandbox/rng_mrg.py
+++ b/theano/sandbox/rng_mrg.py
@@ -9,7 +9,12 @@ import sys
 import numpy
 from theano import Op, Apply, shared, config
-from theano.tensor import raw_random, TensorType, as_tensor_variable, get_vector_length, cast
+from theano.tensor import raw_random, TensorType, as_tensor_variable, get_vector_length, cast, opt
+from theano.compile import optdb
+from theano.gof import local_optimizer
+from theano.sandbox.cuda.opt import register_opt as gpu_register_opt
+from theano.sandbox.cuda import cuda_enabled, CudaNdarrayType #, gpu_from_host, host_from_gpu, CudaNdarrayType
 def mulmod(a, b, c, m):
    r = numpy.int32(numpy.int64(a*b + c) % m)
@@ -114,8 +119,9 @@ def mrg_next_value(rstate, new_rstate):
    else:
        return (x11 - x21) * NORM
-class mrg_uniform(Op):
+class mrg_uniform_base(Op):
    def __init__(self, output_type, inplace=False):
+        Op.__init__(self)
        self.output_type = output_type
        self.inplace=inplace
        if inplace:
@@ -129,6 +135,18 @@ class mrg_uniform(Op):
    def __hash__(self):
        return hash(type(self)) ^ hash(self.output_type) ^ hash(self.inplace)
+    def make_node(self, rstate, size):
+        """Build the Apply node for this op.
+        The node takes `rstate` and `size` as inputs, in that order, and
+        produces two outputs: a fresh variable of `rstate`'s type followed by
+        a fresh variable of `self.output_type`.  Nothing is validated here;
+        this op is meant to be reached through MRG_RandomStreams rather than
+        called directly.
+        """
+        node_inputs = [rstate, size]
+        node_outputs = [rstate.type(), self.output_type()]
+        return Apply(self, node_inputs, node_outputs)
+class mrg_uniform(mrg_uniform_base):
+    #CPU VERSION
    @classmethod
    def new(cls, rstate, ndim, dtype, size):
        v_size = as_tensor_variable(size)
@@ -137,12 +155,10 @@ class mrg_uniform(Op):
        op = cls(TensorType(dtype, (False,)*ndim))
        return op(rstate, cast(v_size, 'int32'))
-    def make_node(self, rstate, size):
-        return Apply(self, 
-                [rstate, size], 
-                [rstate.type(), self.output_type()])
    def perform(self, node, (rstate, size), (o_rstate, o_sample)):
        n_elements = 1
+        rstate = numpy.asarray(rstate) # bring state from GPU if necessary
        if not self.inplace:
            rstate = rstate.copy()
@@ -157,8 +173,8 @@ class mrg_uniform(Op):
            sample = mrg_next_value(rstate[i%n_streams], rstate[i%n_streams])
            rval[i] = sample
-        o_rstate[0] = rstate.copy()
+        o_rstate[0] = node.outputs[0].type.filter(rstate) # send to GPU if necessary
-        o_sample[0] = rval.reshape(size)
+        o_sample[0] = node.outputs[1].type.filter(rval.reshape(size))# send to GPU if necessary
    def c_code_cache_version(self):
        return ()
@@ -317,10 +333,223 @@ class mrg_uniform(Op):
        //////// </ code generated by mrg_uniform>
        """ %locals()
+class GPU_mrg_uniform(mrg_uniform_base):
+    """GPU version of the MRG uniform op: fills a CudaNdarray with samples.
+    Every CUDA thread that owns a stream advances six int32 state words and
+    writes the samples assigned to it.  The state vector arrives as a
+    CudaNdarray whose storage is reinterpreted as npy_int32 by the generated
+    code.
+    """
+    @classmethod
+    def new(cls, rstate, ndim, dtype, size):
+        """Build the op for `size` and apply it to `rstate`.
+        `ndim` is taken from the length of `size` when it is None.  `dtype`
+        is accepted so the signature matches mrg_uniform.new, but it is not
+        consulted here.  Returns the result of applying the op to `rstate`
+        and `size` cast to int32.
+        """
+        v_size = as_tensor_variable(size)
+        if ndim is None:
+            ndim = get_vector_length(v_size)
+        op = cls(CudaNdarrayType((False,)*ndim))
+        return op(rstate, cast(v_size, 'int32'))
+    def c_support_code_apply(self, node, nodename):
+        """Return the CUDA source of the sampling kernel for this node.
+        The kernel is named after `nodename`.  The element type and the
+        normalisation constant depend on `self.output_type.dtype`.
+        """
+        if self.output_type.dtype == 'float32':
+            otype = 'float'
+            NORM = '4.6566126e-10f' #numpy.float32(1.0/(2**31+65))
+            # this was determined by finding the biggest number such that
+            # numpy.float32(number * M1) < 1.0
+        else:
+            otype = 'double'
+            NORM = '4.656612873077392578125e-10'
+        return """
+        static __global__ void %(nodename)s_mrg_uniform(
+                %(otype)s*sample_data,
+                npy_int32*state_data,
+                const int Nsamples,
+                const unsigned int Nstreams_used)
+        {
+            const npy_int32 i7 = 7;
+            const npy_int32 i9 = 9;
+            const npy_int32 i15 = 15;
+            const npy_int32 i16 = 16;
+            const npy_int32 i22 = 22;
+            const npy_int32 i24 = 24;
+            const npy_int32 M1 = 2147483647;      //2^31 - 1
+            const npy_int32 M2 = 2147462579;      //2^31 - 21069
+            const npy_int32 MASK12 = 511;       //2^9 - 1
+            const npy_int32 MASK13 = 16777215;  //2^24 - 1
+            const npy_int32 MASK2 = 65535;      //2^16 - 1
+            const npy_int32 MULT2 = 21069;
+            const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
+            npy_int32 y1, y2, x11, x12, x13, x21, x22, x23;
+            // A launch may contain more threads than there are streams.
+            // Such threads own no state words, so they leave before
+            // state_data is read or written out of bounds.
+            if (idx >= Nstreams_used)
+                return;
+            x11 = state_data[idx*6+0];
+            x12 = state_data[idx*6+1];
+            x13 = state_data[idx*6+2];
+            x21 = state_data[idx*6+3];
+            x22 = state_data[idx*6+4];
+            x23 = state_data[idx*6+5];
+            // Stride by the number of live streams so that every sample
+            // index is produced by exactly one of them.
+            for (int i = idx; i < Nsamples; i += Nstreams_used)
+            {
+                y1 = ((x12 & MASK12) << i22) + (x12 >> i9) + ((x13 & MASK13) << i7) + (x13 >> i24);
+                if ((y1 < 0 || y1 >= M1))     //must also check overflow
+                    y1 -= M1;
+                y1 += x13;
+                if ((y1 < 0 || y1 >= M1))
+                    y1 -= M1;
+                x13 = x12;
+                x12 = x11;
+                x11 = y1;
+                y1 = ((x21 & MASK2) << i15) + (MULT2 * (x21 >> i16));
+                if (y1 < 0 || y1 >= M2)
+                    y1 -= M2;
+                y2 = ((x23 & MASK2) << i15) + (MULT2 * (x23 >> i16));
+                if (y2 < 0 || y2 >= M2)
+                    y2 -= M2;
+                y2 += x23;
+                if (y2 < 0 || y2 >= M2)
+                    y2 -= M2;
+                y2 += y1;
+                if (y2 < 0 || y2 >= M2)
+                    y2 -= M2;
+                x23 = x22;
+                x22 = x21;
+                x21 = y2;
+                if (x11 <= x21) {
+                    sample_data[i] = (x11 - x21 + M1) * %(NORM)s;
+                }
+                else
+                {
+                    sample_data[i] = (x11 - x21) * %(NORM)s;
+                }
+            }
+            state_data[idx*6+0]= x11;
+            state_data[idx*6+1]= x12;
+            state_data[idx*6+2]= x13;
+            state_data[idx*6+3]= x21;
+            state_data[idx*6+4]= x22;
+            state_data[idx*6+5]= x23;
+        }
+        """ %locals()
+    def c_code_cache_version(self):
+        """Return an empty version tuple for the generated code."""
+        return ()
+    def c_code(self, node, nodename, (rstate, size), (o_rstate, o_sample), sub):
+        """Return the host-side C code that validates inputs and launches the kernel.
+        The generated code checks `size` (int32 vector of length ndim),
+        (re)allocates the sample output when its rank or shape does not
+        match, takes `rstate` inplace or copies it depending on
+        `self.inplace`, launches the kernel and reports any CUDA error as a
+        RuntimeError.
+        """
+        inplace = int(self.inplace)
+        ndim = self.output_type.ndim
+        fail = sub['fail']
+        SYNC = "CNDA_THREAD_SYNC"
+        return """
+        //////// <code generated by mrg_uniform>
+        int odims[%(ndim)s];
+        int n_elements = 1;
+        unsigned int n_streams;
+        int must_alloc_sample = ((NULL == %(o_sample)s)
+                || !CudaNdarray_Check(py_%(o_sample)s)
+                || (%(o_sample)s->nd != %(ndim)s));
+        if (%(size)s->nd != 1)
+        {
+            PyErr_SetString(PyExc_ValueError, "size must be vector");
+            %(fail)s
+        }
+        if (%(size)s->dimensions[0] != %(ndim)s)
+        {
+            PyErr_Format(PyExc_ValueError, "size must have length %%i", %(ndim)s);
+            %(fail)s
+        }
+        if (%(size)s->descr->type_num != PyArray_INT32)
+        {
+            PyErr_SetString(PyExc_ValueError, "size must be int32");
+            %(fail)s
+        }
+        for (int i = 0; i < %(ndim)s; ++i)
+        {
+            odims[i] = ((npy_int32*)(%(size)s->data + %(size)s->strides[0] * i))[0];
+            n_elements *= odims[i];
+            must_alloc_sample = (must_alloc_sample
+                    || CudaNdarray_HOST_DIMS(%(o_sample)s)[i] != odims[i]);
+        }
+        if (must_alloc_sample)
+        {
+            Py_XDECREF(%(o_sample)s);
+            %(o_sample)s = (CudaNdarray*)CudaNdarray_NewDims(%(ndim)s, odims);
+            if(!%(o_sample)s)
+            {
+                %(fail)s;
+            }
+        }
+        if (!CudaNdarray_Check(py_%(rstate)s))
+        {
+            PyErr_Format(PyExc_ValueError, "rstate must be cudandarray");
+            %(fail)s;
+        }
+        Py_XDECREF(%(o_rstate)s);
+        if (%(inplace)s)
+        {
+            Py_INCREF(%(rstate)s);
+            %(o_rstate)s = %(rstate)s;
+        }
+        else
+        {
+            %(o_rstate)s = (CudaNdarray*)CudaNdarray_Copy(%(rstate)s);
+            // Bail out on a failed copy instead of dereferencing NULL below.
+            if (!%(o_rstate)s)
+            {
+                %(fail)s;
+            }
+        }
+        if (%(o_rstate)s->nd != 1)
+        {
+            PyErr_SetString(PyExc_ValueError, "rstate must be vector");
+            %(fail)s;
+        }
+        if (CudaNdarray_HOST_DIMS(%(o_rstate)s)[0] %% 6)
+        {
+            PyErr_Format(PyExc_ValueError, "rstate len must be multiple of 6");
+            %(fail)s;
+        }
+        n_streams = std::min(CudaNdarray_HOST_DIMS(%(o_rstate)s)[0]/6, n_elements);
+        // With no stream or no element there is nothing to launch, and the
+        // block-count computation below would divide by zero.
+        if (n_streams > 0)
+        {
+            unsigned int threads_per_block = std::min(n_streams, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
+            unsigned int n_blocks = std::min(ceil_intdiv(n_streams, threads_per_block), (unsigned int)NUM_VECTOR_OP_BLOCKS);
+            // The launch may hold fewer or more threads than streams; only
+            // this many streams are actually advanced by the kernel.
+            unsigned int n_streams_used = std::min(n_streams, threads_per_block * n_blocks);
+            if (threads_per_block * n_blocks < n_streams)
+            {
+                fprintf(stderr, "WARNING: unused streams above %%i (Tune GPU_mrg get_n_streams)\\n", threads_per_block * n_blocks );
+            }
+            %(nodename)s_mrg_uniform<<<n_blocks,threads_per_block>>>(
+                CudaNdarray_DEV_DATA(%(o_sample)s),
+                (npy_int32*)CudaNdarray_DEV_DATA(%(o_rstate)s),
+                n_elements,
+                n_streams_used);
+        }
+        %(SYNC)s;
+        {
+            cudaError_t err = cudaGetLastError();
+            if( cudaSuccess != err)
+            {
+                PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n", "mrg_uniform", cudaGetErrorString(err));
+                %(fail)s;
+            }
+        }
+        //////// </ code generated by mrg_uniform>
+        """ %locals()
 class MRG_RandomStreams(object):
    """Module component with similar interface to numpy.random (numpy.random.RandomState)"""
-    def __init__(self, seed=None):
+    def __init__(self, seed=12345, use_cuda=None):
        """
        :type seed: None or int
@@ -328,7 +557,16 @@ class MRG_RandomStreams(object):
        `RandomStreamsInstance.__init__` for more details.
        """
        super(MRG_RandomStreams, self).__init__()
-        self.rstate = numpy.asarray([12345]*6, dtype='int32')
+        if isinstance(seed, int):
+            self.rstate = numpy.asarray([seed]*6, dtype='int32')
+        elif len(seed)==6:
+            self.rstate = numpy.asarray(seed, dtype='int32')
+        else:
+            raise TypeError("seed should be 1 integer or 6 integers")
+        if use_cuda is None:
+            self.use_cuda = cuda_enabled
+        else:
+            self.use_cuda = use_cuda
    def inc_rstate(self):
        """Update self.rstate to be skipped 2^134 steps forward to the next stream start"""
@@ -350,10 +588,19 @@ class MRG_RandomStreams(object):
        return rval
    def n_streams(self, size):
-        r = 1
+        if isinstance(size, (tuple, list)):
-        for s in size:
+            r = 1
-            r *= s
+            for s in size:
-        return r
+                r *= s
+            return r
+        try:
+            rval =  int(size)
+            assert rval > 0
+            return rval
+        except:
+            pass
+        print >> sys.stderr, "MRG_RandomStreams Can't determine #streams from size (%s), guessing 30*256"%str(size)
+        return 30*256
    def pretty_return(self, node_rstate, new_rstate, sample):
        sample.rstate = node_rstate
@@ -361,7 +608,6 @@ class MRG_RandomStreams(object):
        node_rstate.default_update = new_rstate
        return sample
    def uniform(self, size=None, low=0.0, high=1.0, ndim=None, dtype=config.floatX):
        """
        Sample a tensor of given size whose element from a uniform
@@ -371,15 +617,50 @@ class MRG_RandomStreams(object):
        ndim may be a plain integer to supplement the missing
        information.
        """
-        node_rstate = shared(self.get_substream_rstates(self.n_streams(size)))
+        if self.use_cuda and dtype=='float32':
-        u = self.pretty_return(node_rstate, 
+            rstates = self.get_substream_rstates(self.n_streams(size))
-                *mrg_uniform.new(node_rstate, ndim, dtype, size))
+            rstates = rstates.flatten()
+            # HACK - we use fact that int32 and float32 have same size to 
+            # sneak ints into the CudaNdarray type.
+            # these *SHOULD NEVER BE USED AS FLOATS*
+            tmp_float_buf = numpy.frombuffer(rstates.data, dtype='float32')
+            assert tmp_float_buf.shape == rstates.shape
+            assert tmp_float_buf.data[:24] == rstates.data[:24]
+            node_rstate = shared(tmp_float_buf) # transfer to device
+            assert isinstance(node_rstate.type, CudaNdarrayType)
+            # we can't use the normal mrg_uniform constructor + later optimization
+            # because of the tmp_float_buf hack above.  There is
+            # currently no Theano node that will do a frombuffer reinterpretation.
+            u = self.pretty_return(node_rstate, 
+                    *GPU_mrg_uniform.new(node_rstate, ndim, dtype, size))
+        else:
+            node_rstate = shared(self.get_substream_rstates(self.n_streams(size)))
+            u = self.pretty_return(node_rstate, 
+                    *mrg_uniform.new(node_rstate, ndim, dtype, size))
        r = u * (high-low) + low
        if u.type.broadcastable != r.type.broadcastable:
            raise NotImplementedError( 'Increase the size to match the broadcasting pattern of `low` and `high` arguments')
        return  r
+    def binomial(self, size=None, n=1, prob=0.5, ndim=None, dtype='int64'):
+        """Sample a single-trial binomial: 1 where a uniform draw is below `prob`.
+        `size` and `ndim` are forwarded to `uniform`, so `ndim` can supply
+        the rank when it cannot be read off `size`.  The comparison result is
+        cast to `dtype`.  Raises NotImplementedError for any `n` other than 1.
+        """
+        if n == 1:
+            # Forward ndim: it was accepted but dropped before, so callers
+            # could not supplement a `size` of unknown length.
+            return cast(self.uniform(size=size, ndim=ndim) < prob, dtype)
+        else:
+            raise NotImplementedError("MRG_RandomStreams.binomial with n > 1")
+@local_optimizer([None])
+def mrg_random_make_inplace(node):
+    """Replace a non-inplace MRG uniform op by its inplace counterpart.
+    Returns the outputs of a new node built from the same op class with
+    inplace=True and the same inputs, or False when `node` is not a
+    non-inplace MRG uniform op.
+    """
+    op = node.op
+    # Test against the shared base class: GPU_mrg_uniform derives from
+    # mrg_uniform_base, not from mrg_uniform, so checking for mrg_uniform
+    # alone would leave the GPU op permanently non-inplace.
+    if isinstance(op, mrg_uniform_base) and not op.inplace:
+        new_op = op.__class__(op.output_type, inplace=True)
+        return new_op.make_node(*node.inputs).outputs
+    return False
+optdb.register('random_make_inplace_mrg', opt.in2out(mrg_random_make_inplace, ignore_newtrees=True), 99, 'fast_run', 'inplace')
 #
 #
 #
@@ -391,37 +672,61 @@ import theano
 def test_rng0():
    def basictest(f, steps, prefix=""):
-        t0 = time.time()
+        dt = 0.0
-        l = [f() for i in xrange(steps)]
+        for i in xrange(steps):
-        tt = time.time()
+            t0 = time.time()
+            ival = f()
-        mean, std, min, max = numpy.mean(l), numpy.std(l), numpy.min(l), numpy.max(l)
+            dt += time.time() - t0
+            ival = numpy.asarray(ival)
-        print prefix, 'mean', mean
+            if i == 0:
-        print prefix, 'std', std
+                mean = numpy.array(ival, copy=True)
-        print prefix, 'min', repr(min)
+            else:
-        print prefix, 'max', repr(max)
+                alpha = 1.0 / (1+i)
-        print prefix, 'samples/sec', steps*sample_size[0]*sample_size[1] / (tt-t0)
+                mean = alpha * ival + (1-alpha)*mean
-        assert max < 1.0
+        print prefix, 'mean', numpy.mean(mean)
-        assert min >= 0.0
+        assert abs(numpy.mean(mean) - 0.5) < .01, 'bad mean?'
-        assert abs(mean - 0.5) < .01, 'bad mean?'
+        print prefix, 'time', dt
+        print prefix, 'elements', steps*sample_size[0]*sample_size[1]
+        print prefix, 'samples/sec', steps*sample_size[0]*sample_size[1] / dt
-    R = MRG_RandomStreams(234)
+        if 0:
+            mean, std, min, max = numpy.mean(l), numpy.std(l), numpy.min(l), numpy.max(l)
-    sample_size = (200,20)
+            print prefix, 'mean', mean
+            print prefix, 'std', std
+            print prefix, 'min', repr(min)
+            print prefix, 'max', repr(max)
+            assert max < 1.0
+            assert min >= 0.0
+            assert abs(mean - 0.5) < .01, 'bad mean?'
+    sample_size = (1000,100)
+    print ''
+    print 'ON CPU:'
+    R = MRG_RandomStreams(234, use_cuda=False)
    u = R.uniform(size=sample_size)
-    print "U dtype", u.dtype
    f = theano.function([], u)
+    theano.printing.debugprint(f)
+    print 'random?[:10]\n', f()[0,0:10]
+    basictest(f, 1000, prefix='mrg  ')
-    print 'random?', f()[0]
+    print ''
-    print 'random?', f()[0]
+    print 'ON GPU:'
+    R = MRG_RandomStreams(234, use_cuda=True)
+    u = R.uniform(size=sample_size)
+    assert u.dtype == 'float32' #well, it's really that this test w GPU doesn't make sense otw
+    f = theano.function([], theano.Out(
+        theano.sandbox.cuda.basic_ops.gpu_from_host(u),
+        borrow=True))
+    theano.printing.debugprint(f)
+    print 'random?[:10]\n', numpy.asarray(f())[0,0:10]
    basictest(f, 1000, prefix='mrg  ')
+    print ''
+    print 'ON CPU w NUMPY:'
    RR = theano.tensor.shared_randomstreams.RandomStreams(234)
    uu = RR.uniform(size=sample_size)

--- a/theano/tensor/nnet/nnet.py
+++ b/theano/tensor/nnet/nnet.py
@@ -1257,7 +1257,7 @@ class Prepend_scalar_constant_to_each_row(gof.Op):
    def __eq__(self, other):
        return (type(self) == type(other)) and (self.val == other.val)
    def __hash__(self):
-        return tensor.hashtype(self) ^ hash(self.val.value)
+        return tensor.hashtype(self) ^ hash(self.val.data)
    def __str__(self):
        return '%s{%s}'%(self.__class__.__name__,self.val)

--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
@@ -610,6 +610,43 @@ def local_alloc_unary(node):
            return [T.alloc(T.cast(v, node.outputs[0].dtype), *shp)]
+############################
+# Constant Canonicalization
+############################
+@register_canonicalize
+@gof.local_optimizer([])
+def local_upcast_elemwise_constant_inputs(node):
+    """This explicitly upcasts constant inputs to elemwise Ops, when those Ops do implicit upcasting anyway.
+    Rationale: it helps merge things like (1-x) and (1.0 - x).
+    Returns a one-element list holding the output of the op re-applied to
+    the upcast inputs when at least one input was replaced; otherwise falls
+    through and returns None.
+    """
+    if isinstance(node.op, T.Elemwise):
+        scalar_op = node.op.scalar_op
+        if scalar_op.output_types_preference in (T.scal.upgrade_to_float, T.scal.upcast_out):
+            # this is the kind of op that we can screw with the input dtypes by upcasting
+            # explicitly
+            output_dtype = node.outputs[0].type.dtype
+            new_inputs = []
+            for i in node.inputs:
+                if i.type.dtype == output_dtype:
+                    new_inputs.append(i)
+                else:
+                    try:
+                        cval_i = get_constant_value(i)    # works only for scalars I think
+                        new_inputs.append(T.cast(cval_i, output_dtype))
+                    except Exception:
+                        # Not a scalar constant.  Catch Exception rather than
+                        # everything so that KeyboardInterrupt and SystemExit
+                        # still propagate.
+                        if isinstance(i, T.TensorConstant): #for the case of a non-scalar
+                            new_inputs.append(T.cast(i, output_dtype))
+                        else:
+                            new_inputs.append(i)
+            if new_inputs != node.inputs:
+                return [node.op(*new_inputs)]
 ##################
 # Subtensor opts #
 ##################
@@ -1717,6 +1754,7 @@ def local_greedy_distributor(node):
    return [rval]
 register_canonicalize(local_greedy_distributor)
+register_stabilize(local_greedy_distributor)
@@ -1748,6 +1786,7 @@ def constant_folding(node):
    return msg
 register_canonicalize(constant_folding)
+register_stabilize(constant_folding) # because 
 register_specialize(constant_folding)