merged after an hg pull

47d5b5a4 · Ian Goodfellow · 6bc8b1ee · 499b3da6 · 47d5b5a4 · 47d5b5a4
--- a/theano/compile/debugmode.py
+++ b/theano/compile/debugmode.py
@@ -1303,7 +1303,7 @@ class _Maker(FunctionMaker): #inheritance buys a few helper functions
                            print >> infolog, 'trailing event in optimization', i, ':', j
                            print >> infolog, '   ', str(li[j])
                        elif li[j] != l0[j]:
-                            print 'non-equal optimization events', i, ':', j
+                            print >>infolog, 'non-equal optimization events', i, ':', j
                            print >>infolog, '   ', str(l0[j])
                            print >>infolog, '   ', str(li[j])
                            #print >> infolog, "* ", j,

--- a/theano/gof/opt.py
+++ b/theano/gof/opt.py
@@ -963,9 +963,9 @@ class EquilibriumOptimizer(NavigatorOptimizer):
                            lopt_change = self.process_node(env, node, lopt)
                            if lopt_change:
                                process_count[lopt] += 1
-                            else:
-                                process_count[lopt] += 0
-                            changed |= lopt_change
+                                changed = True
+                                if node not in env.nodes:
+                                    break# go to next node
            finally:
                self.detach_updater(env, u)
            self.detach_updater(env, u) #TODO: erase this line, it's redundant at best

--- a/theano/misc/pycuda_example.py
+++ b/theano/misc/pycuda_example.py
@@ -153,7 +153,7 @@ class PycudaElemwiseKernel(Op):
            z[0] = theano.sandbox.cuda.CudaNdarray.zeros(inputs[0].shape)
        i = inputs + z
        sp = splay(i[0].mem_size)
-        self.pycuda_fct(*i, grid=sp[0], block=sp[1])
+        self.pycuda_fct(*i)#, grid=sp[0], block=sp[1])

 pycuda_optimizer = EquilibriumDB()
 gpu_seqopt.register("pycuda_optimizer", pycuda_optimizer, 1.5, "fast_run")

--- a/theano/sandbox/cuda/__init__.py
+++ b/theano/sandbox/cuda/__init__.py
@@ -113,6 +113,7 @@ if cuda_available:
            GpuSubtensor, GpuIncSubtensor, GpuFlatten, GpuShape, GpuAlloc,
            GpuJoin,fscalar, fscalar, fvector, fmatrix, frow, fcol, ftensor3, ftensor4
                           , scalar, vector, matrix, row, col, tensor3, tensor4)
+    from basic_ops import host_from_gpu, gpu_from_host
    import opt
    import cuda_ndarray


--- a/theano/sandbox/cuda/elemwise.py
+++ b/theano/sandbox/cuda/elemwise.py
@@ -37,7 +37,7 @@ def get_str_list_logical_scalar(node, value_str='ii_i%i_value', data_str='ii_i%i
 class NaiveAlgo(object):
    verbose = 0 # 1, 2 or 3 for more verbose output.
    cache_version = ()
-    cache_version = ('debug', 10, verbose)
+    cache_version = ('debug', 11, verbose)

    def __init__(self, scalar_op, sync=True):
        """ 
@@ -834,7 +834,14 @@ nd_collapse_[i]=0;
        """ %locals()

        #check that all inputs have valid dimensions
-        for iname in inputs:
+        for id,iname in enumerate(inputs):
+            broadcasts = ', '.join(map(str,map(int,node.inputs[id].broadcastable)))
+            nd = node.inputs[id].ndim
+            print >> sio, """
+        int broadcasts_%(iname)s[%(nd)s] = {%(broadcasts)s};
+""" %locals()
+        #check that all inputs have valid dimensions
+        for id,iname in enumerate(inputs):
            print >> sio, """
        //std::cerr << "C_CODE %(opname)s checking input %(iname)s\\n";
        if (%(nd)s != %(iname)s->nd)
@@ -845,7 +852,7 @@ nd_collapse_[i]=0;
        for (int i = 0; i< %(nd)s; ++i)
        {
            dims[i] = (dims[i] == 1) ? CudaNdarray_HOST_DIMS(%(iname)s)[i] : dims[i];
-            if ((CudaNdarray_HOST_DIMS(%(iname)s)[i] != 1) && (dims[i] != CudaNdarray_HOST_DIMS(%(iname)s)[i]))
+            if ((!(broadcasts_%(iname)s[i] && CudaNdarray_HOST_DIMS(%(iname)s)[i] == 1))&& (dims[i] != CudaNdarray_HOST_DIMS(%(iname)s)[i]))
            {
                //std::cerr << "C_CODE %(opname)s checking input %(iname)s failed\\n";
                PyErr_Format(PyExc_TypeError, "GpuElemwise input has incompatible dim[%%i] == %%i, where output has size %%i",

--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -54,7 +54,8 @@ class InputToGpuOptimizer(Optimizer):
                try:
                    new_input = host_from_gpu(gpu_from_host(input))

-                    env.replace_validate(input, new_input, "To allow further optimisation to move Ops to gpu")
+                    if new_input.type==input.type:
+                        env.replace_validate(input, new_input, "To allow further optimisation to move Ops to gpu")
                except Exception, e:
                    #as we currently only support float32, this can fail. 
                    #Using try except make that we won't need 
@@ -136,10 +137,7 @@ def local_gpu_dimshuffle_0(node):
            # move the add to a GpuAdd
            new_op = GpuDimShuffle(node.op.input_broadcastable, 
                    node.op.new_order)
-            if node.op.inplace:
-                return [host_from_gpu(new_op(gpu_from_host(input)))]
-            else:
-                return [host_from_gpu(new_op(gpu_from_host(tensor.tensor_copy(input))))]
+            return [host_from_gpu(new_op(gpu_from_host(input)))]
    if node.op == gpu_from_host:
        host_input = node.inputs[0]
        if host_input.owner and isinstance(host_input.owner.op, tensor.DimShuffle):

--- a/theano/sandbox/cuda/tests/test_basic_ops.py
+++ b/theano/sandbox/cuda/tests/test_basic_ops.py
@@ -27,6 +27,9 @@ else:
    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
    mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpu')

+def rand_cuda_ndarray(shape):
+    return cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),dtype='float32'))
+
 def tes_use():
    tcn.use()

@@ -206,6 +209,18 @@ def test_elemwise0():

    assert numpy.all(a0 + 1.0 == a.value)

+def test_elemwise_bad_broadcast():
+    x = cuda.fmatrix('x')
+    y = cuda.fmatrix('y')
+
+    f = theano.function([x, y], x * y)
+    try:
+        f(rand_cuda_ndarray((10, 3)), rand_cuda_ndarray((10, 1)))
+    except TypeError:
+        pass
+    else:
+        raise Exception("Theano should have raised an error")
+
 def test_elemwise1():
    """ Several kinds of elemwise expressions with no broadcasting, non power-of-two shape """


--- a/theano/sandbox/rng_mrg.py
+++ b/theano/sandbox/rng_mrg.py
@@ -10,7 +10,7 @@ import numpy

 from theano import Op, Apply, shared, config, Variable
 from theano.tensor import raw_random, TensorType, as_tensor_variable, get_vector_length, cast, opt
-from theano.tensor import zeros_like, sqrt, log, sin, cos, join
+from theano.tensor import zeros_like, sqrt, log, sin, cos, join, prod
 from theano.compile import optdb
 from theano.gof import local_optimizer

@@ -556,6 +556,13 @@ class GPU_mrg_uniform(mrg_uniform_base):
 class MRG_RandomStreams(object):
    """Module component with similar interface to numpy.random (numpy.random.RandomState)"""

+    state_updates = []
+    """A list of pairs of the form (input_r, output_r), representing the
+    update rules of all the random states generated by this RandomStreams"""
+
+    def updates(self):
+        return list(self.state_updates)
+
    def __init__(self, seed=12345, use_cuda=None):
        """
        :type seed: int or list of 6 int.
@@ -612,7 +619,7 @@ class MRG_RandomStreams(object):

    def n_streams(self, size):
        # TODO: a smart way of choosing the number of streams
-        if isinstance(size, (tuple, list)):
+        if isinstance(size, (tuple, list)) and all([isinstance(i,int) for i in size]):
            r = 1
            for s in size:
                r *= s
@@ -627,6 +634,7 @@ class MRG_RandomStreams(object):
    def pretty_return(self, node_rstate, new_rstate, sample):
        sample.rstate = node_rstate
        sample.update = (node_rstate, new_rstate)
+        self.state_updates = self.state_updates + [(node_rstate, new_rstate)]  # rebind: append() would mutate the class-level list shared by every instance
        node_rstate.default_update = new_rstate
        return sample

@@ -639,13 +647,12 @@ class MRG_RandomStreams(object):
        ndim may be a plain integer to supplement the missing
        information.
        
-        :param: size: Can be a list of integer or a Theano variable like the shape of some tensor.
-                      The number of dimensions must be computable at compile time.
+        :param: size: Can be a tuple of integers and/or Theano variables, or a 1-d Theano variable (e.g. the shape of another Theano variable).
                      TODO: can size be None?
        """
        if isinstance(size, tuple):
-            assert all([isinstance(i,int) for i in size]), "size must be a tuple of int or a Theano variable"
-        else: assert isinstance(size, Variable), "size must be a tuple of int or a Theano variable"
+            assert all([isinstance(i,int) or isinstance(i,Variable) for i in size]), "size must be a tuple of int or a Theano variable"
+        else: assert isinstance(size, Variable) and size.ndim==1, "size must be a tuple of int or a Theano variable"

        if nstreams is None:
            nstreams = self.n_streams(size)
@@ -706,24 +713,33 @@ class MRG_RandomStreams(object):
            raise NotImplementedError("MRG_RandomStreams.multinomial only implemented with n == 1 and pvals.ndim = 2")

    def normal(self, size=None, avg=0.0, std=1.0, ndim=None, dtype=config.floatX):
+        """
+        :param: size: Can be a tuple of integers or a Theano variable (e.g. the shape of another Theano variable).
+        """
        # We need an even number of ]0,1[ samples. Then we split them
        # in two halves. First half becomes our U1's for Box-Muller,
        # second half our U2's. See Wikipedia page:
        # http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform

-        assert isinstance(size, tuple), "size must be a tuple"
-        assert all([isinstance(i,int) for i in size])
-        n_samples = numpy.prod(size)
        evened = False
+        constant = False
+        if isinstance(size, tuple) and all([isinstance(i,int) for i in size]):
+            constant = True          
+            n_samples = numpy.prod(size)
           
-        if n_samples % 2 == 1:
-            n_samples += 1
-            evened = True
-
+            if n_samples % 2 == 1:
+                n_samples += 1
+                evened = True
+        else:
+            n_samples = prod(size)+(prod(size)%2)#if even, don't change, if odd, +1
        flattened = self.uniform(size=(n_samples,), dtype=dtype)

-        U1 = flattened[:n_samples/2]
-        U2 = flattened[n_samples/2:]
+        if constant:
+            U1 = flattened[:n_samples/2]
+            U2 = flattened[n_samples/2:]
+        else:
+            U1 = flattened[:prod(flattened.shape)/2]
+            U2 = flattened[prod(flattened.shape)/2:]

        #normal_samples = zeros_like(flattened)
        sqrt_ln_U1 = sqrt(-2.0*log(U1))
@@ -740,8 +756,10 @@ class MRG_RandomStreams(object):
        final_samples = None
        if evened:
            final_samples = normal_samples[:-1]
-        else:
+        elif constant:
            final_samples = normal_samples
+        else:
+            final_samples = normal_samples[:prod(size)]

        final_samples = avg + std * final_samples


--- a/theano/sandbox/test_rng_mrg.py
+++ b/theano/sandbox/test_rng_mrg.py
@@ -433,53 +433,55 @@ def test_normal0():

    steps = 50
    if mode in ['DEBUG_MODE','FAST_COMPILE']:
-        sample_size = (99,30)
+        sample_size = (25,30)
        rtol=.02
    else:
        sample_size = (999,50)
        rtol=.01
+    sample_size_odd = (sample_size[0],sample_size[1]-1)
+    x = tensor.matrix()
+    for size,const_size,var_input,input in [(sample_size,sample_size,[],[]), (x.shape,sample_size,[x],[numpy.zeros(sample_size)]),
+                                 (sample_size_odd,sample_size_odd,[],[]),#test odd value
+                                 (x.shape,sample_size_odd,[x],[numpy.zeros(sample_size_odd)]),#test odd value
+                                 ]:
+        print ''
+        print 'ON CPU:'

-    print ''
-    print 'ON CPU:'
-
-    R = MRG_RandomStreams(234, use_cuda=False)
-    n = R.normal(size=sample_size, avg=-5.0, std=2.0)
-    f = theano.function([], n, mode=mode)
-    theano.printing.debugprint(f)
-    print 'random?[:10]\n', f()[0,0:10]
-    basictest(f, steps, sample_size, target_avg=-5.0, target_std=2.0, prefix='mrg ', allow_01=True, mean_rtol=rtol)
+        R = MRG_RandomStreams(234, use_cuda=False)
+        n = R.normal(size=size, avg=-5.0, std=2.0)
+        f = theano.function(var_input, n, mode=mode)
+        theano.printing.debugprint(f)
+        print 'random?[:10]\n', f(*input)[0,0:10]
+        basictest(f, steps, const_size, target_avg=-5.0, target_std=2.0, prefix='mrg ', allow_01=True, inputs=input, mean_rtol=rtol)

-    sys.stdout.flush()
+        sys.stdout.flush()

-    # now with odd number of samples
-    sample_size = (sample_size[0],sample_size[1]-1)
+        if mode!='FAST_COMPILE' and cuda_available:
+            print ''
+            print 'ON GPU:'
+            R = MRG_RandomStreams(234, use_cuda=True)
+            n = R.normal(size=size, avg=-5.0, std=2.0, dtype='float32')
+            assert n.dtype == 'float32' #well, it's really that this test w GPU doesn't make sense otw
+            f = theano.function(var_input, theano.Out(
+                theano.sandbox.cuda.basic_ops.gpu_from_host(n),
+                borrow=True), mode=mode_with_gpu)

-    if mode!='FAST_COMPILE' and cuda_available:
-        print ''
-        print 'ON GPU:'
-        R = MRG_RandomStreams(234, use_cuda=True)
-        n = R.normal(size=sample_size, avg=-5.0, std=2.0, dtype='float32')
-        assert n.dtype == 'float32' #well, it's really that this test w GPU doesn't make sense otw
-        f = theano.function([], theano.Out(
-            theano.sandbox.cuda.basic_ops.gpu_from_host(n),
-            borrow=True), mode=mode_with_gpu)
+            theano.printing.debugprint(f)
+            sys.stdout.flush()
+            print 'random?[:10]\n', numpy.asarray(f(*input))[0,0:10]
+            print '----'
+            sys.stdout.flush()
+            basictest(f, steps, const_size, target_avg=-5.0, target_std=2.0, prefix='gpu mrg ', allow_01=True, inputs=input, mean_rtol=rtol)

-        theano.printing.debugprint(f)
-        sys.stdout.flush()
-        print 'random?[:10]\n', numpy.asarray(f())[0,0:10]
-        print '----'
-        sys.stdout.flush()
-        basictest(f, steps, sample_size, target_avg=-5.0, target_std=2.0, prefix='gpu mrg ', allow_01=True, mean_rtol=rtol)
-        

-    print ''
-    print 'ON CPU w NUMPY:'
-    RR = theano.tensor.shared_randomstreams.RandomStreams(234)
+        print ''
+        print 'ON CPU w NUMPY:'
+        RR = theano.tensor.shared_randomstreams.RandomStreams(234)

-    nn = RR.normal(size=sample_size, avg=-5.0, std=2.0)
-    ff = theano.function([], nn)
+        nn = RR.normal(size=size, avg=-5.0, std=2.0)
+        ff = theano.function(var_input, nn)

-    basictest(ff, steps, sample_size, target_avg=-5.0, target_std=2.0, prefix='numpy ', allow_01=True, mean_rtol=rtol)
+        basictest(ff, steps, const_size, target_avg=-5.0, target_std=2.0, prefix='numpy ', allow_01=True, inputs=input, mean_rtol=rtol)

 def basic_multinomialtest(f, steps, sample_size, target_pvals, prefix="", mean_rtol=0.04):


--- a/theano/tensor/basic.py
+++ b/theano/tensor/basic.py
@@ -2705,6 +2705,8 @@ class Rebroadcast(Op):
            broadcast_pattern[k] = str(int(v))
        return '%s{%s}' % (self.__class__.__name__, ','.join(broadcast_pattern))
    def make_node(self, x):
+        if x.ndim <= numpy.max(self.axis.keys()):
+            raise ValueError('Trying to rebroadcast inexistant dimension')
        t = x.type.__class__(dtype = x.type.dtype,
                       broadcastable = [self.axis.get(i, b)
                                        for i, b in enumerate(x.type.broadcastable)])

--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
@@ -2010,6 +2010,42 @@ def check_for_x_over_absX(numerators, denominators):
    return numerators, denominators
 local_mul_canonizer.add_simplifier(check_for_x_over_absX, 'X_over_absX')

+@register_canonicalize
+@gof.local_optimizer([T.abs_])
+def local_abs_lift(node):
+    """
+    Move the abs toward the inputs: abs(a*b) -> abs(a)*abs(b) and abs(a/b) -> abs(a)/abs(b). This is needed for check_for_x_over_absX to apply in more cases.
+    
+    """
+    if node.op == T.abs_ and node.inputs[0].owner:
+        assert node.nin == 1
+        if node.inputs[0].owner.op == T.mul:
+            return [T.mul(*[T.abs_(i) for i in node.inputs[0].owner.inputs])]
+        if node.inputs[0].owner.op == T.true_div:
+            i = node.inputs[0].owner.inputs
+            return [T.true_div(T.abs_(i[0]),T.abs_(i[1]))]
+    
+@register_specialize
+@gof.local_optimizer([])
+def local_abs_merge(node):
+    """
+    Merge the abs generated by local_abs_lift once the canonizer doesn't need them anymore.
+    
+    """
+    if node.op == T.mul and sum([i.owner.op == T.abs_ for i in node.inputs if i.owner])>1:
+        inputs = []
+        for i in node.inputs:
+            if i.owner and i.owner.op == T.abs_:
+                inputs.append(i.owner.inputs[0])
+            else:
+                const = get_constant_value(i)
+                if not (const>=0).all():
+                    return False
+                inputs.append(i)
+        return [T.abs_(T.mul(*inputs))]
+    if node.op == T.true_div and sum([i.owner.op == T.abs_ for i in node.inputs if i.owner])==2:
+        return [T.abs_(T.true_div(node.inputs[0].owner.inputs[0],node.inputs[1].owner.inputs[0]))]
+
 @register_stabilize
 @gof.local_optimizer([T.log])
 def local_log1p(node):
@@ -2279,7 +2315,12 @@ def local_elemwise_fusion_op(OP):
                #if the scalar_op don't have a c implementation, we skip its fusion to allow the fusion of the other ops.
                do_fusion=True
                try:
-                    s_input = [scalar.Scalar(x.dtype).make_variable() for x in i.owner.inputs]
+                    s_input = []
+                    for ii in i.owner.inputs:
+                        if ii in inputs:
+                            s_input.append(s_inputs[inputs.index(ii)])
+                        else:
+                            s_input.append(scalar.Scalar(ii.dtype).make_variable())
                    s_op=i.owner.op.scalar_op(*s_input)
                    i.owner.op.scalar_op.c_code(s_op.owner,"test_presence_of_c_code",
                                                ["x" for x in i.owner.inputs],
@@ -2298,8 +2339,11 @@ def local_elemwise_fusion_op(OP):
                s_inputs.extend(s_input)
                s_g.append(s_op)
            else:
+                if i in inputs:
+                    s=s_inputs[inputs.index(i)]
+                else:
+                    s=scalar.Scalar(i.dtype).make_variable()
                inputs.append(i)
-                s=scalar.Scalar(i.dtype).make_variable()
                s_inputs.append(s)
                s_g.append(s)

@@ -2308,6 +2352,21 @@ def local_elemwise_fusion_op(OP):
    #        print "local_elemwise_fusion: no elemwise in inputs. Nothing to fuse."
            return False

+        #remove duplicate inputs; we must keep the order.
+        inputs2=[]
+        s_inputs2=[]
+        for i,si in zip(inputs,s_inputs):
+            if i not in inputs2:
+                inputs2.append(i)
+                s_inputs2.append(si)
+            else:
+                assert si in s_inputs2
+        inputs = inputs2
+        s_inputs = s_inputs2
+        del inputs2, s_inputs2
+        assert len(s_inputs)==len(inputs)
+        
+
        otype = node.outputs[0].type
        s_new_out=node.op.scalar_op(*s_g)
        try:

--- a/theano/tensor/tests/test_basic.py
+++ b/theano/tensor/tests/test_basic.py
@@ -27,7 +27,6 @@ utt.seed_rng()
 def inplace_func(inputs, outputs, mode=get_default_mode()):
    return function(inputs, outputs, mode=mode, accept_inplace=True)

-
 def eval_outputs(outputs):
    variables = inplace_func([], outputs)()
    if len(variables) == 1:
@@ -2611,48 +2610,55 @@ def test_autocast():
    finally:
        ac.__exit__()

-def test_unbroadcast_addbroadcast():
-    """
-    test that the unbroadcast fct don't insert not needed broadcast
-    and fuse consecutive Rebroadcast op
-    """
-
-    x=matrix()
-    assert unbroadcast(x,0) is x
-    assert unbroadcast(x,1) is x
-    assert unbroadcast(x,1,0) is x
-    assert unbroadcast(x,0,1) is x
-
-    assert addbroadcast(x,0) is not x
-    assert addbroadcast(x,1) is not x
-    assert addbroadcast(x,1,0).owner.inputs[0] is x
-
-    assert unbroadcast(addbroadcast(x,0),0) is x
-    assert addbroadcast(unbroadcast(x,0),0) is not x
-    x=row()
-    assert unbroadcast(x,0) is not x
-    assert unbroadcast(x,1) is x
-    assert unbroadcast(x,1,0) is not x
-    assert unbroadcast(x,0,1) is not x
-
-    assert addbroadcast(x,0) is x
-    assert addbroadcast(x,1).owner.inputs[0] is x
-    assert addbroadcast(x,1,0).owner.inputs[0] is x
-    assert addbroadcast(x,0,1).owner.inputs[0] is x
-
-    assert unbroadcast(addbroadcast(x,1),1) is x
-    assert addbroadcast(unbroadcast(x,1),1) is not x
-
-    #the first broadcast is remove the broadcast, so the second
-    #should not make one
-    assert unbroadcast(unbroadcast(x,0),0).owner.inputs[0] is x
-
-    #test that consecutive Rebroadcast op are fused
-    x=TensorType(dtype = 'float64', broadcastable = (True,True))()
-    assert unbroadcast(unbroadcast(x,1),0).owner.inputs[0] is x
-    assert addbroadcast(unbroadcast(x,1),0).owner.inputs[0] is x
-    assert addbroadcast(unbroadcast(x,0),0) is x
-
+class test_broadcast(unittest.TestCase):
+    def test_broadcast_bigdim(self):
+        def f():
+            x = matrix()
+            addbroadcast(x,2)
+        self.failUnlessRaises(ValueError, f)
+
+    def test_unbroadcast_addbroadcast(self):
+        """
+        Test that unbroadcast/addbroadcast don't insert an unneeded
+        Rebroadcast op and that consecutive Rebroadcast ops are fused.
+        """
+        
+        x=matrix()
+        assert unbroadcast(x,0) is x
+        assert unbroadcast(x,1) is x
+        assert unbroadcast(x,1,0) is x
+        assert unbroadcast(x,0,1) is x
+        
+        assert addbroadcast(x,0) is not x
+        assert addbroadcast(x,1) is not x
+        assert addbroadcast(x,1,0).owner.inputs[0] is x
+        
+        assert unbroadcast(addbroadcast(x,0),0) is x
+        assert addbroadcast(unbroadcast(x,0),0) is not x
+        x=row()
+        assert unbroadcast(x,0) is not x
+        assert unbroadcast(x,1) is x
+        assert unbroadcast(x,1,0) is not x
+        assert unbroadcast(x,0,1) is not x
+        
+        assert addbroadcast(x,0) is x
+        assert addbroadcast(x,1).owner.inputs[0] is x
+        assert addbroadcast(x,1,0).owner.inputs[0] is x
+        assert addbroadcast(x,0,1).owner.inputs[0] is x
+        
+        assert unbroadcast(addbroadcast(x,1),1) is x
+        assert addbroadcast(unbroadcast(x,1),1) is not x
+        
+        #the first unbroadcast removes the broadcast, so the second
+        #should not insert another Rebroadcast op
+        assert unbroadcast(unbroadcast(x,0),0).owner.inputs[0] is x
+
+        #test that consecutive Rebroadcast op are fused
+        x=TensorType(dtype = 'float64', broadcastable = (True,True))()
+        assert unbroadcast(unbroadcast(x,1),0).owner.inputs[0] is x
+        assert addbroadcast(unbroadcast(x,1),0).owner.inputs[0] is x
+        assert addbroadcast(unbroadcast(x,0),0) is x
+    
 def test_mod():
    """
    We add this test as not all language and C implementation give the same 

--- a/theano/tensor/tests/test_opt.py
+++ b/theano/tensor/tests/test_opt.py
@@ -484,9 +484,67 @@ class test_canonize(unittest.TestCase):
                assert numpy.all(numpy.isfinite(out))
                assert numpy.allclose(out,numpy.sign(val_inputs[0]))
                assert(out_dtype==out.dtype)
+                assert len(f.maker.env.toposort())==1
+
+            #test (2*x) / (3*abs(x)) -> (2/3)*sign(x)
+            for id,(g, sym_inputs, val_inputs, out_dtype) in enumerate([
+                    ((2*dx)/(3*abs(dx)),[dx],[0.5-dxv],'float64'),
+                    ((2*fx)/(3*abs(fx)),[fx],[0.5-fxv],'float32'),
+                    ((2*dx)/(3*abs(dx)),[dx],[0.0*dxv],'float64'),
+                    ((2*fx)/(3*abs(fx)),[fx],[0.0*fxv],'float32'),
+                    ((2*dv)/(3*abs(dv)),[dv],[0.5-dvv],'float64'),
+                    ((2*fv)/(3*abs(fv)),[fv],[0.5-fvv],'float32'),
+                ]):
+                f = compile.function(list(sym_inputs), g,
+                                     mode=mode)
+                topo = f.maker.env.toposort()
+                out = f(*val_inputs)
+                assert numpy.all(numpy.isfinite(out))
+                assert numpy.allclose(out,numpy.sign(val_inputs[0])*2/3)
+                assert(out_dtype==out.dtype)
        finally:
            mode._optimizer = old_optimizer

+    def test_abs_mul_div(self):
+        """
+        Test that an expression like
+        4 * x / abs(2*x) gets simplified during canonicalization.
+        """
+
+        x=T.dscalar()
+        a=T.abs_(x)
+        
+        if theano.config.mode=='FAST_COMPILE':
+            mode = theano.compile.mode.get_mode('FAST_RUN').excluding("local_elemwise_fusion")
+        else:
+            mode = theano.compile.mode.get_default_mode().excluding("local_elemwise_fusion")
+
+        f=theano.function([x],[(4*x)/abs(2*x)], mode = mode)
+        print f.maker.env.toposort()
+        print
+        f(.1)
+        f(-1)
+        #some stabilization optimizations make the output finite instead of nan
+        #debug_mode will raise an error when it sees nan
+        if not isinstance(mode,theano.compile.debugmode.DebugMode):
+            assert numpy.isfinite(f(0))
+
+        assert len(f.maker.env.toposort())==2
+        assert f.maker.env.toposort()[0].op==T.sgn
+
+        f=theano.function([x],[(4*x)/abs(x/2)], mode = mode)
+        print f.maker.env.toposort()
+        print
+        f(.1)
+        f(-1)
+        #some stabilization optimizations make the output finite instead of nan
+        #debug_mode will raise an error when it sees nan
+        if not isinstance(mode,theano.compile.debugmode.DebugMode):
+            assert numpy.isfinite(f(0))
+
+        assert len(f.maker.env.toposort())==2
+        assert f.maker.env.toposort()[0].op==T.sgn
+

    def test_multiple_case_that_fail(self):
        import theano.tensor, theano.compile
@@ -553,6 +611,30 @@ class test_canonize(unittest.TestCase):
        """
        raise SkipTest("Not implemented")

+def test_local_merge_abs():
+    x,y,z = T.matrices('xyz')
+    x_val = numpy.random.rand(5,5)
+    y_val = numpy.random.rand(5,5)
+    z_val = numpy.random.rand(5,5)
+    mode = theano.config.mode
+    if mode == "FAST_COMPILE":
+        mode = "FAST_RUN"
+    mode = theano.compile.mode.get_mode(mode).excluding("local_elemwise_fusion")
+
+    f = theano.function([x,y,z],(abs(y*z*-2)), mode=mode)
+    f(x_val,y_val,z_val)
+    theano.printing.debugprint(f)
+    assert isinstance(f.maker.env.toposort()[1].op.scalar_op, scal.Abs)
+    assert len(f.maker.env.toposort())==2
+
+    f = theano.function([x,y,z],abs(x/y), mode=mode)
+    f(x_val,y_val,z_val)
+    theano.printing.debugprint(f)
+    assert isinstance(f.maker.env.toposort()[1].op.scalar_op, scal.Abs)
+    assert len(f.maker.env.toposort())==2
+
+
+
 def test_mixeddiv():
    """Test that int division is preserved"""
    i = iscalar()
@@ -692,7 +774,7 @@ class test_fusion(unittest.TestCase):
            #TODO: BIT OP only with ints, xor, or, and, invert, cast
 #            (fx-theano.tensor.or_(fy,fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fy|fz),'float32'),
 #            (fx-theano.tensor.xor(fy,fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fy^fz),'float32'),
-            (theano.tensor.pow(fx*fy+fz,fx*fy),(fx,fy,fz),(fxv,fyv,fzv),2,numpy.power(fxv*fyv+fzv,fxv*fyv),'float32'),
+            (theano.tensor.pow(fx*fy+fz,fx*fy),(fx,fy,fz),(fxv,fyv,fzv),1,numpy.power(fxv*fyv+fzv,fxv*fyv),'float32'),
            (fv+fy**fz,(fv,fy,fz),(fvv,fyv,fzv),2,fvv+fyv**fzv,'float32'),#fused with a dimshuffle
            (fv-fy+tanh(fz),(fv,fy,fz),(fvv,fyv,fzv),2,fvv-fyv+numpy.tanh(fzv),'float32'),#fused with a dimshuffle
            ]