提交 2317ffbb authored 作者: Ian Goodfellow's avatar Ian Goodfellow

merge

......@@ -3,7 +3,7 @@
LICENSE
=======
Copyright (c) 2008--2009, Theano Development Team
Copyright (c) 2008--2011, Theano Development Team
All rights reserved.
Redistribution and use in source and binary forms, with or without
......
......@@ -25,6 +25,7 @@ Edit ``setup.py`` to contain the newest version number ::
* Change the ``version`` and ``release`` variables to new version number.
* Change the upper copyright year to the current year if necessary.
* Update the year in the Theano/LICENSE.txt file.
``NEWS.txt`` usually contains the name and date of the release, change them
too.
......
......@@ -22,19 +22,85 @@ Currently shape informations are used for 2 things in Theano:
.. code-block:: python
import theano
x = theano.tensor.matrix()
x = theano.tensor.matrix('x')
f = theano.function([x], (x**2).shape)
theano.printing.debugprint(f)
# MakeVector [@26301776] '' 2
# |Shape_i{0} [@26321296] '' 1
# | |<TensorType(float64, matrix)> [@26153424]
# |Shape_i{1} [@26322512] '' 0
# | |<TensorType(float64, matrix)> [@26153424]
#MakeVector [@43860304] '' 2
# |Shape_i{0} [@43424912] '' 1
# | |x [@43423568]
# |Shape_i{1} [@43797968] '' 0
# | |x [@43423568]
The output of this compiled function does not contain any multiplication
or power. Theano has removed them in order to compute the shape of the
output directly.
Shape inference problem
=======================
Theano does shape information propagation in the graph. Sometimes this
can lead to errors. Example:
.. code-block:: python
import numpy
import theano
x = theano.tensor.matrix('x')
y = theano.tensor.matrix('y')
z = theano.tensor.join(0,x,y)
xv = numpy.random.rand(5,4)
yv = numpy.random.rand(3,3)
f = theano.function([x,y], z.shape)
theano.printing.debugprint(f)
#MakeVector [@23910032] '' 4
# |Elemwise{Add{output_types_preference=transfer_type{0}}}[(0, 0)] [@24055120] '' 3
# | |Shape_i{0} [@23154000] '' 1
# | | |x [@23151760]
# | |Shape_i{0} [@23593040] '' 2
# | | |y [@23151888]
# |Shape_i{1} [@23531152] '' 0
# | |x [@23151760]
#MakeVector [@56338064] '' 4
# |Elemwise{Add{output_types_preference=transfer_type{0}}}[(0, 0)] [@56483152] '' 3
# | |Shape_i{0} [@55586128] '' 1
# | | |<TensorType(float64, matrix)> [@55583888]
# | |Shape_i{0} [@56021072] '' 2
# | | |<TensorType(float64, matrix)> [@55584016]
# |Shape_i{1} [@55959184] '' 0
# | |<TensorType(float64, matrix)> [@55583888]
print f(xv,yv)# DOES NOT RAISE AN ERROR, THOUGH IT SHOULD.
#[8,4]
f = theano.function([x,y], z)# Don't take the shape.
theano.printing.debugprint(f)
#Join [@44540496] '' 0
# |0 [@44540432]
# |x [@44540240]
# |y [@44540304]
f(xv,yv)
# Raise a dimensions mismatch error.
As you see, when you ask for the shape of some computation (the join in
the example), we sometimes compute the shape without executing the
computation (there is no join in the first output of debugprint).
This makes the computation of the shape faster, but it can hide errors. In
the example, the computation of the shape of the join is done on the first
Theano variable in the join, not on the others.
This can probably happen with many other ops such as elemwise, dot, ...
You can detect those problems by running the code without this
optimization, using the Theano flag
`optimizer_excluding=local_shape_to_shape_i`. You can also get the
same effect by running in the mode FAST_COMPILE (it will not apply this
optimization, nor most other optimizations) or DEBUG_MODE (it will test
before and after all optimizations (much slower)).
Specifying the exact shape
==========================
......
......@@ -659,7 +659,9 @@ def _constructor_Function(maker, input_storage, inputs_data):
f = maker.create(input_storage, trustme = True)
assert len(f.input_storage) == len(inputs_data)
for container, x in zip(f.input_storage, inputs_data):
assert (container.data is x) or (container.data == x)
assert (container.data is x) or \
(isinstance(x, numpy.ndarray) and (container.data == x).all()) or \
(container.data == x)
return f
copy_reg.pickle(Function, _pickle_Function)
......
......@@ -566,6 +566,36 @@ class T_picklefunction(unittest.TestCase):
assert numpy.all(nl[6][nl[2]] == numpy.asarray([2, 3., 4]))
def test_broken_pickle_with_shared(self):
    """Pickle and unpickle a compiled Theano function whose ndarray data
    is stored out-of-band via the pickler's persistent-id mechanism.
    """
    # Side store for ndarrays: pers_save returns an index into this list,
    # pers_load resolves an index back to the stored array.
    saves = []
    def pers_save(obj):
        # Persist ndarrays externally; return None so everything else is
        # pickled inline as usual.
        if isinstance(obj, numpy.ndarray):
            saves.append(obj)
            return len(saves)-1
        else:
            return None
    def pers_load(id):
        # Inverse of pers_save: recover the array from its index.
        return saves[id]
    # NOTE(review): `a` is never used below — presumably leftover; confirm.
    a = numpy.random.rand(4, 5)
    b = numpy.random.rand(5, 4)
    x = theano.tensor.matrix()
    y = theano.shared(b)
    f = theano.function([x], theano.tensor.dot(x, y))
    # Python 2 modules (StringIO/cPickle); under Python 3 these would be
    # io.BytesIO and pickle.
    import StringIO
    fp = StringIO.StringIO()
    p = cPickle.Pickler(fp, 2)  # pickle protocol 2
    p.persistent_id = pers_save
    p.dump(f)
    # Round-trip: unpickle from the produced bytes with the matching
    # persistent_load hook; must not raise.
    fp2 = StringIO.StringIO(fp.getvalue())
    fp.close()
    p = cPickle.Unpickler(fp2)
    p.persistent_load = pers_load
    f2 = p.load()
    fp2.close()
def test_pickle_class_with_functions(self):
......
"""Driver for general gradient calculations."""
"""Driver for gradient calculations."""
__authors__ = "James Bergstra"
__copyright__ = "(c) 2011, Universite de Montreal"
__license__ = "3-clause BSD License"
__contact__ = "theano-dev <theano-dev@googlegroups.com>"
__docformat__ = "restructuredtext en"
import logging
_logger = logging.getLogger('theano.gradient')
import sys
import gof #, gof.variable
import numpy #for numeric_grad
import gof #, gof.variable
from gof.python25 import all
import gof.utils
import logging
_logger = logging.getLogger('theano.gradient')
from raise_op import Raise
def warning(*msg):
    """Join *msg* parts with spaces and log them at WARNING level."""
    text = ' '.join(msg)
    _logger.warning('WARNING theano.gradient: ' + text)
def info(*msg):
......@@ -106,4 +114,14 @@ def grad_sources_inputs(sources, graph_inputs, warn_type=True):
gmap[r] = g_r
return gmap
def unimplemented_grad(op, x_pos, x):
    """
    Return an un-computable symbolic variable of type `x.type`.

    If any function tries to compute this un-computable variable, an exception
    (NotImplementedError) will be raised indicating that the gradient on the
    `x_pos`'th input of `op` has not been implemented.
    """
    # Wrap x in a Raise op whose perform() always raises NotImplementedError.
    error_message = '%s.grad not implemented for input %i' % (op, x_pos)
    raising_op = Raise(msg=error_message)
    return raising_op(x)
import time
import numpy
import theano
y = theano.tensor.fvector()
x = theano.shared(numpy.zeros(1,dtype='float32'))
f1 = theano.function([y],updates={x:y})
f2 = theano.function([],theano.sandbox.cuda.host_from_gpu(x))
print f1.maker.env.toposort()
print f2.maker.env.toposort()
for i in [1,10,100,1000, 10000, 100000,1000000, 10000000]:
o = numpy.zeros(i, dtype='float32')
t0=time.time();f1(o);t1=time.time();
tf1=t1-t0
t0=time.time();f2();t1=time.time();
print "%8i %6.1f ns %7.1f ns"%(i, tf1*1e6,(t1-t0)*1e6)
......@@ -669,9 +669,10 @@ def pydotprint_variables(vars,
else:
#a var id is needed as otherwise var with the same type will be merged in the graph.
varstr = str(var.type)
if len(dstr) > max_label_size:
dstr = dstr[:max_label_size-1]+'...'
varstr += ' ' + str(len(var_str))
if len(varstr) > max_label_size:
varstr = varstr[:max_label_size-3]+'...'
var_str[var]=varstr
return varstr
......
"""Symbolic Op for raising an exception."""
__authors__ = "James Bergstra"
__copyright__ = "(c) 2011, Universite de Montreal"
__license__ = "3-clause BSD License"
__contact__ = "theano-dev <theano-dev@googlegroups.com>"
__docformat__ = "restructuredtext en"
import gof
class Raise(gof.Op):
    """Op whose perform() raises an exception.

    Useful as a placeholder for values that must never actually be
    computed (e.g. unimplemented gradients): building the graph works,
    but evaluating the output raises `self.exc`.
    """
    def __init__(self, msg="", exc=NotImplementedError):
        """
        msg - the argument to the exception
        exc - an exception class to raise in self.perform
        """
        self.msg = msg
        self.exc = exc

    def __eq__(self, other):
        # Note: the msg does not technically have to be in the hash and eq
        # because it doesn't affect the return value.
        return (type(self) == type(other)
                and self.msg == other.msg
                and self.exc == other.exc)

    def __hash__(self):
        # Must stay consistent with __eq__: same (type, msg, exc) triple.
        return hash((type(self), self.msg, self.exc))

    def __str__(self):
        return "Raise{%s(%s)}"%(self.exc, self.msg)

    def make_node(self, x):
        # Output has the same type as the input; only evaluating it raises.
        return gof.Apply(self, [x], [x.type()])

    def perform(self, node, inputs, out_storage):
        # Never produces a value: always raises the configured exception.
        raise self.exc(self.msg)
......@@ -32,7 +32,7 @@ gpu_seqopt = SequenceDB()
gpu_seqopt.register('gpu_local_optimizations', gpu_optimizer, 1,
'fast_run', 'inplace')
gpu_seqopt.register('gpu_cut_transfers', gpu_cut_copies, 2,
'fast_run', 'inplace')
'fast_run', 'gpu')
optdb.register('gpu_opt',
gpu_seqopt,
optdb.__position__.get('add_destroy_handler', 49.5) - 1,
......
......@@ -815,11 +815,11 @@ class MRG_RandomStreams(object):
else:
final_samples = normal_samples[:prod(size)]
final_samples = avg + std * final_samples
if size:
final_samples = final_samples.reshape(size)
final_samples = avg + std * final_samples
return final_samples
@local_optimizer([None])
......
......@@ -294,21 +294,29 @@ def basictest(f, steps, sample_size, prefix="", allow_01=False, inputs=[],
ival = numpy.asarray(ival)
if i == 0:
mean = numpy.array(ival, copy=True)
avg_std = numpy.std(ival)
#avg_std = numpy.std(ival)
avg_std = numpy.sqrt(numpy.mean((ival - target_avg)**2))
min_ = ival.min()
max_ = ival.max()
else:
alpha = 1.0 / (1+i)
mean = alpha * ival + (1-alpha)*mean
avg_std = alpha * numpy.std(ival) + (1-alpha)*avg_std
#avg_std = alpha * numpy.std(ival) + (1-alpha)*avg_std
avg_std = alpha * numpy.sqrt(numpy.mean((ival - target_avg)**2)) + (1-alpha)*avg_std
min_ = min(min_,ival.min())
max_ = max(max_,ival.max())
if not allow_01:
assert min_ > 0
assert max_ < 1
print prefix, 'mean', numpy.mean(mean)
assert abs(numpy.mean(mean) - target_avg) < mean_rtol, 'bad mean? %f %f'%(numpy.mean(mean), target_avg)
if hasattr(target_avg, 'shape'): # looks if target_avg is an array
diff = numpy.mean(abs(mean - target_avg))
print prefix, 'mean diff with mean', diff
assert diff < mean_rtol, 'bad mean? %f %f' % (mean, target_avg)
else: # if target_avg is a scalar, then we can do the mean of `mean` to get something more precise
mean = numpy.mean(mean)
print prefix, 'mean', mean
assert abs(mean - target_avg) < mean_rtol, 'bad mean? %f %f'%(numpy.mean(mean), target_avg)
print prefix, 'std', avg_std
if target_std is not None:
assert abs(avg_std - target_std) < .01, 'bad std? %f %f'%(avg_std, target_std)
......@@ -450,30 +458,32 @@ def test_binomial():
def test_normal0():
steps = 50
std = 2.
if mode in ['DEBUG_MODE','DebugMode','FAST_COMPILE']:
sample_size = (25,30)
rtol=.02
default_rtol=.02
else:
sample_size = (999,50)
rtol=.01
default_rtol=.01
sample_size_odd = (sample_size[0],sample_size[1]-1)
x = tensor.matrix()
for size, const_size, var_input, input in [
(sample_size, sample_size, [], []),
(x.shape, sample_size, [x], [numpy.zeros(sample_size, dtype=config.floatX)]),
(sample_size_odd, sample_size_odd, [], []),#test odd value
(x.shape, sample_size_odd, [x], [numpy.zeros(sample_size_odd, dtype=config.floatX)]),#test odd value
for size, const_size, var_input, input, avg, rtol in [
(sample_size, sample_size, [], [], -5., default_rtol),
(x.shape, sample_size, [x], [numpy.zeros(sample_size, dtype=config.floatX)], -5., default_rtol),
(sample_size_odd, sample_size_odd, [], [], -5., default_rtol),#test odd value
(x.shape, sample_size_odd, [x], [numpy.zeros(sample_size_odd, dtype=config.floatX)], -5., default_rtol),#test odd value
(sample_size, sample_size, [], [], numpy.arange(numpy.prod(sample_size), dtype='float32').reshape(sample_size), 10.*std/numpy.sqrt(steps)),
]:
print ''
print 'ON CPU:'
R = MRG_RandomStreams(234, use_cuda=False)
n = R.normal(size=size, avg=-5.0, std=2.0)
n = R.normal(size=size, avg=avg, std=std)
f = theano.function(var_input, n, mode=mode)
theano.printing.debugprint(f)
out = f(*input)
print 'random?[:10]\n', out[0,0:10]
basictest(f, steps, const_size, target_avg=-5.0, target_std=2.0, prefix='mrg ', allow_01=True, inputs=input, mean_rtol=rtol)
basictest(f, steps, const_size, target_avg=avg, target_std=std, prefix='mrg ', allow_01=True, inputs=input, mean_rtol=rtol)
sys.stdout.flush()
......@@ -481,7 +491,7 @@ def test_normal0():
print ''
print 'ON GPU:'
R = MRG_RandomStreams(234, use_cuda=True)
n = R.normal(size=size, avg=-5.0, std=2.0, dtype='float32')
n = R.normal(size=size, avg=avg, std=std, dtype='float32')
assert n.dtype == 'float32' #well, it's really that this test w GPU doesn't make sense otw
f = theano.function(var_input, theano.Out(
theano.sandbox.cuda.basic_ops.gpu_from_host(n),
......@@ -493,7 +503,7 @@ def test_normal0():
print 'random?[:10]\n', gpu_out[0,0:10]
print '----'
sys.stdout.flush()
basictest(f, steps, const_size, target_avg=-5.0, target_std=2.0, prefix='gpu mrg ', allow_01=True, inputs=input, mean_rtol=rtol)
basictest(f, steps, const_size, target_avg=avg, target_std=std, prefix='gpu mrg ', allow_01=True, inputs=input, mean_rtol=rtol)
# Need to allow some rounding error as there are float
# computations that are done on the gpu vs cpu
assert numpy.allclose(out, gpu_out, rtol=5e-6, atol=5e-6)
......@@ -503,10 +513,10 @@ def test_normal0():
print 'ON CPU w NUMPY:'
RR = theano.tensor.shared_randomstreams.RandomStreams(234)
nn = RR.normal(size=size, avg=-5.0, std=2.0)
nn = RR.normal(size=size, avg=avg, std=std)
ff = theano.function(var_input, nn)
basictest(ff, steps, const_size, target_avg=-5.0, target_std=2.0, prefix='numpy ', allow_01=True, inputs=input, mean_rtol=rtol)
basictest(ff, steps, const_size, target_avg=avg, target_std=std, prefix='numpy ', allow_01=True, inputs=input, mean_rtol=rtol)
def basic_multinomialtest(f, steps, sample_size, target_pvals, prefix="", mean_rtol=0.04):
......
import operator
"""
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
WARNING
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
This directory is for the internals of Theano.
You are strongly advised not to use it unless you know
what you are doing!
If you want to use a scalar variable in a Theano graph,
you probably want to use theano.tensor.[c,z,f,d,b,w,i,l,]scalar!
"""
import math
from copy import copy
......@@ -584,8 +597,8 @@ class ScalarOp(Op):
if not isinstance(variables, (list, tuple)) or any(not isinstance(x, Type) for x in variables):
raise TypeError("output_types_preference should return a list or a tuple of types", self.output_types_preference, variables)
if len(variables) != self.nout:
raise TypeError("Not the right number of outputs produced for %s(%s) by %s. Expected %s, got ?s."
% (self, ", ".join(str(input.type) for input in inputs),
raise TypeError("Not the right number of outputs types produced for %s(%s) by %s. Expected %s, got %s."
% (self, ", ".join(str(type) for type in variables),
self.output_types_preference, self.nout, len(variables)))
return variables
else:
......
......@@ -81,7 +81,8 @@ optdb.register( 'scanOp_make_inplace'
, opt.in2out(scan_make_inplace,ignore_newtrees=True)
, 75
, 'fast_run'
, 'inplace')
, 'inplace'
, 'scan')
......@@ -512,7 +513,8 @@ class ScanSaveMem(gof.Optimizer):
optdb.register( 'scanOp_save_mem'
, ScanSaveMem()
, 1.99
, 'fast_run')
, 'fast_run'
, 'scan')
'''
class ScanMerge(gof.Optimizer):
......@@ -584,7 +586,8 @@ class ScanMerge(gof.Optimizer):
optdb.register( 'scanOp_merge'
, ScanMerge()
, 2.39
, 'fast_run')
, 'fast_run'
, 'scan')
'''
......@@ -620,7 +623,7 @@ if cuda.cuda_available:
return x
@register_opt()
@register_opt('scan')
@local_optimizer([])
def gpuScanOptimization(node):
"""
......@@ -633,7 +636,19 @@ if cuda.cuda_available:
host_input = node.inputs[0]
if (host_input.owner and
isinstance(host_input.owner.op, scan_op.Scan) and
not host_input.owner.op.info['gpu']):
not host_input.owner.op.info['gpu'] and
len(host_input.owner.outputs) == 1 ):
# Note that we are not doing the right thing here !!
# This is because the local optimizer expects only one
# output that corresponds to the input of ``node``
# If we do this for each output separately we will have
# multiple scan ops in the graph ( as many as outputs )
# and I'm not sure they will get merged into one again
# So for now I will just cover a limited case when there
# is only one output and the local optimizer can be used
# TODO (fix) : either make sure the different scans get
# merged or implement this optimization as a global
# optimization
thescan = host_input.owner.op
info = thescan.info.copy()
info['gpu'] = True
......@@ -646,7 +661,7 @@ if cuda.cuda_available:
+ thescan.n_shared_outs)
nw_ins += [safe_to_gpu(x) for x in inputs[1:e] ]
b = e
e = e + thescan.n_nit_sot + thescan.n_other_ignore
e = e + thescan.n_nit_sot
nw_ins += inputs[b:e]
nw_ins += [safe_to_gpu(x) for x in inputs[e:] ]
scan_ins = [ tensor_to_cuda(x) for x in thescan.inputs]
......@@ -678,7 +693,7 @@ if cuda.cuda_available:
+ thescan.n_shared_outs)
nw_ins += [safe_to_gpu(x) for x in inputs[1:e] ]
b = e
e = e + thescan.n_nit_sot + thescan.n_other_ignore
e = e + thescan.n_nit_sot
nw_ins += inputs[b:e]
nw_ins += [safe_to_gpu(x) for x in inputs[e:] ]
......
......@@ -759,23 +759,14 @@ def scan( fn
nit_sot_rightOrder.append( i )
n_nit_sot += 1
## Step 5.5 Sequences with no taps used
n_other_ignore = 0
ignore_scan_seqs = []
ignore_inner_seqs = []
for i,seq in enumerate(seqs):
if not 'taps' in seq:
ignore_scan_seqs.append(seq['input'])
n_other_ignore += 1
## Step 5.6 all other arguments including extra inputs
## Step 5.5 all other arguments including extra inputs
other_scan_args = []
other_inner_args = []
other_scan_args += [ arg for arg in non_seqs
if not isinstance(arg, SharedVariable) ]
## Step 5.8 all shared variables with no update rules
## Step 5.6 all shared variables with no update rules
def new_variable( v ):
new_v = safe_new(v)
if v.name:
......@@ -805,7 +796,6 @@ def scan( fn
mit_sot_inner_inputs +
sit_sot_inner_inputs +
shared_inner_inputs +
ignore_inner_seqs +
other_shared_inner_args +
other_inner_args )
......@@ -850,7 +840,6 @@ def scan( fn
info['n_sit_sot'] = n_sit_sot
info['n_shared_outs'] = n_shared_outs
info['n_nit_sot'] = n_nit_sot
info['n_other_ignore'] = n_other_ignore
info['truncate_gradient'] = truncate_gradient
info['name'] = name
info['mode'] = mode
......@@ -876,7 +865,6 @@ def scan( fn
sit_sot_scan_inputs +
shared_scan_inputs +
[ actual_n_steps for x in xrange(n_nit_sot) ] +
ignore_scan_seqs +
other_shared_scan_args +
other_scan_args )
......
......@@ -754,7 +754,6 @@ def compress_outs(op, not_required, inputs):
info['n_sit_sot'] = 0
info['n_shared_outs'] = 0
info['n_nit_sot'] = 0
info['n_other_ignore'] = op.info['n_other_ignore']
info['truncate_gradient'] = op.info['truncate_gradient']
info['name'] = op.info['name']
info['inplace'] = op.info['inplace']
......
......@@ -242,7 +242,11 @@ class T_Scan(unittest.TestCase):
W_in = theano.tensor.fscalar('win')
W = theano.tensor.fscalar('w')
mode = theano.compile.mode.get_default_mode().including('gpu')
if theano.config.mode == 'FAST_COMPILE':
mode = theano.compile.mode.get_mode('FAST_RUN')
else:
mode = theano.compile.mode.get_default_mode()
mode = mode.including('gpu','scan')
# The following line is needed to have the first case being used
# Otherwise, it is the second that is tested.
mode = mode.excluding('InputToGpuOptimizer')
......@@ -314,7 +318,11 @@ class T_Scan(unittest.TestCase):
x0 = theano.tensor.fscalar('x0')
W_in = theano.tensor.fscalar('win')
W = theano.tensor.fscalar('w')
mode = theano.compile.mode.get_default_mode().including('gpu')
if theano.config.mode == 'FAST_COMPILE':
mode = theano.compile.mode.get_mode('FAST_RUN')
else:
mode = theano.compile.mode.get_default_mode()
mode = mode.including('gpu','scan')
output, updates = theano.scan(f_rnn, u,x0,[W_in,W]
, n_steps = None
, truncate_gradient = -1
......@@ -1980,7 +1988,8 @@ class T_Scan(unittest.TestCase):
self.assertTrue(nb_scan == 2)
nb_shape_i = len([n for n in topo
if isinstance(n.op, theano.tensor.opt.Shape_i)])
self.assertTrue(nb_shape_i == 1)
if theano.config.mode != 'FAST_COMPILE':
self.assertTrue(nb_shape_i == 1)
def test_bug_josh_reported(self):
import theano
......@@ -1990,6 +1999,16 @@ class T_Scan(unittest.TestCase):
conv = theano.tensor.signal.conv.conv2d(m1, m2)
def test_hash(self):
    """Scan ops built from structurally equivalent graphs must compare
    equal and hash equal."""
    x = theano.tensor.vector()
    y = theano.tensor.vector()
    out_a, updates = theano.scan(lambda _x: _x + 1, x)
    out_b, updates = theano.scan(lambda _x: _x + 1, y)
    op_a = out_a.owner.op
    op_b = out_b.owner.op
    assert op_a == op_b
    assert hash(op_a) == hash(op_b)
if __name__ == '__main__':
#'''
print ' Use nosetests to run these tests '
......
......@@ -3292,7 +3292,16 @@ class Rebroadcast(Op):
# restore the broadcasting pattern of the input
return Rebroadcast(*[(axis, x.type.broadcastable[axis]) for axis, value in self.axis.iteritems()])(gz),
def infer_shape(self, node, ishapes):
    """Output shape equals the input shape, except that every axis
    rebroadcasted to True is known to have length 1."""
    assert len(ishapes)==1  # Rebroadcast is a single-input op
    l = []
    one = constant(1)
    for ax in range(len(ishapes[0])):
        # self.axis maps axis index -> new broadcastable flag; an axis
        # made broadcastable necessarily has size 1.
        if self.axis.get(ax, False):
            l.append(one)
        else:
            l.append(ishapes[0][ax])
    return [tuple(l)]
def addbroadcast(x, *axes):
"""
......@@ -3477,6 +3486,51 @@ class Join(Op):
else:
return node.owner.tag.shape_zero
def infer_shape(self, node, ishapes):
    """Infer the output shape of a Join without executing it.

    ``ishapes[0]`` is the shape of the ``axis`` argument; ``ishapes[1:]``
    are the shapes of the tensors being joined.  The output shape equals
    the shape of the first tensor, except along the join axis where the
    per-tensor sizes are summed.
    """
    # Join op should get at least two inputs to join
    assert len(ishapes) > 1
    # Not sure this is needed anymore :( ... basically the apply_shape
    # version of the apply node (i.e. the one defined in
    # gof/apply_shape) calls infer_shape methods passing None to unknown
    # inputs. It can handle NotImplementedError, so for now I just raise
    # that whenever I get a None. Should we just remove gof/apply_shape
    # if it is deprecated ??
    if ishapes[1] is None:
        raise NotImplementedError
    n_dim = len(ishapes[1])
    for shape in ishapes[1:]:
        if shape is None:
            raise NotImplementedError
        for shape_i in shape:
            if shape_i is None:
                raise NotImplementedError
        # at this point the inputs have been broadcasted so they should
        # all have the same shape
        assert len(shape) == n_dim
    out_shapes = []
    for dim in xrange(n_dim):
        # we have to deal with 2 possible cases in here :
        # a) we are dealing with the dimension for which we join
        # (called t_side from true side of the if, where the if
        # compares current dimension with the joining dimension)
        # b) a non joining dimension ( in which maybe a symbolic
        # assertion can be used to make sure all tensors have
        # the same number of elements on this non-joined dimension
        # this is f_side
        # initialize
        t_side = ishapes[1][dim]
        f_side = ishapes[1][dim]
        # loop over tensors and sum for the joining dimension
        for shape in ishapes[2:]:
            t_side = t_side + shape[dim]
        # return the dimensions found
        # NOTE(review): f_side comes from the FIRST tensor only, so a
        # mismatch on a non-joined dimension is NOT detected here.
        out_shapes.append( switch(eq(dim, node.inputs[0]),
                                  t_side, f_side))
    return [tuple(out_shapes)]
@_redefine_asRoutine(Join())
def join(axis, *tensors):
"""
......@@ -4622,7 +4676,8 @@ outer = Outer()
# Gradient
#########################
def grad(cost, wrt, g_cost=None, consider_constant=[], warn_type=False):
def grad(cost, wrt, g_cost=None, consider_constant=[], warn_type=False,
assume_continuously_differentiable = False):
"""
:type cost: Scalar (0-dimensional) `Variable`
:type wrt: `Variable` or list of `Variable`s.
......@@ -4634,6 +4689,14 @@ def grad(cost, wrt, g_cost=None, consider_constant=[], warn_type=False):
:param warn_type: a value of True will cause warnings to be logged for any Op that emits a
gradient that does not match its input type.
:param assume_continuously_differentiable : flag that says if grad is strict about what it returns.
If set to false it will raise an exception for any argument in
``wrt`` for which there is no gradient either because some op does
not know how to compute the gradient with respect to that argument
or the argument is not part of the computational graph. If the flag
is set to true, the ``grad`` method returns zeros like the argument
( i.e. it makes the assumption that the gradient should be 0).
:rtype: `Variable` or list of `Variable`s (depending upon `wrt`)
:return: symbolic expression of gradient of `cost` with respect to `wrt`.
......@@ -4663,13 +4726,30 @@ def grad(cost, wrt, g_cost=None, consider_constant=[], warn_type=False):
list(inputs) + list(consider_constant),
warn_type=warn_type)
# Note that it is important to use `zeros_like` when there is no gradient,
# instead of returning a scalar constant equal to zero. Otherwise we lose
# the guarantee that the gradient has same shape as `wrt`.
if isinstance(wrt, (list, tuple)):
return [gmap.get(p, zeros_like(p)) for p in wrt]
# Note : If p is not in gmap there can be several reasons, among which
# is the fact that p might not be part of the computational graph. A
# simple example is that for a+b for e.g. a[0] is not part of the graph,
# so Theano does not know how to compute TT.grad(TT.sum(a+b), a[0])
# such subtle cases can be fixed by a more careful implementation of the
# gradient, but for now Theano needs to throw an exception, and make the
# user aware that it does not know how to compute that gradient
if not isinstance(wrt, (list, tuple)):
wrt = [wrt]
ret = []
for p in wrt:
if p not in gmap and not assume_continuously_differentiable:
raise ValueError(("grad method was asked to compute the graident "
"with respect to a variable that is not part of "
"the computational graph of the cost or is used "
"by a non-differentiable operator "),p)
else:
ret.append(gmap.get(p, zeros_like(p)))
if len(ret) == 1:
return ret[0]
else:
return gmap.get(wrt, zeros_like(wrt))
return ret
class numeric_grad:
"""WRITEME"""
......@@ -4938,7 +5018,8 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None, abs_tol=None, rel_tol=No
if cast_to_output_type:
g_cost = cast(g_cost, o_output.dtype)
symbolic_grad = grad(cost, tensor_pt, g_cost)
symbolic_grad = grad(cost, tensor_pt, g_cost,
assume_continuously_differentiable = True)
#if o_output.dtype in ['float32','float64']:
# assert all([x.dtype == o_output.dtype for x in symbolic_grad]),("Expected grad of type %s, got %s "%( symbolic_grad.dtype, o_output.dtyp))
......
......@@ -2742,10 +2742,10 @@ def local_add_specialize(node):
new_inputs.append(input)
if len(new_inputs) < len(node.inputs):
dtype = node.outputs[0].type.dtype
if len(new_inputs) == 0:
#we got rid of the entire expression!
ndim = node.outputs[0].type.ndim
dtype = node.outputs[0].type.dtype
return fill_chain(
T.TensorConstant(
T.TensorType(
......@@ -2754,9 +2754,14 @@ def local_add_specialize(node):
numpy.zeros((1,)*ndim, dtype=dtype)))
if len(new_inputs) == 1:
return fill_chain(new_inputs[0])
ret = fill_chain(new_inputs[0])
else:
return fill_chain(T.add(*new_inputs))
ret = fill_chain(T.add(*new_inputs))
# The dtype should not be changed. It can happen if the input
# that was forcing upcasting was equal to 0.
if ret[0].dtype != dtype:
ret = [T.cast(ret[0], dtype)]
return ret
else:
return False
register_specialize(local_add_specialize)
......
......@@ -2447,6 +2447,42 @@ class T_Join_and_Split(unittest.TestCase):
self.assertRaises(ValueError, g, a_val, b_val, c_val, bad_d_val, e_val)
self.assertRaises(ValueError, g, a_val, b_val, c_val, d_val, bad_e_val)
def test_infer_shape_join(self):
    """Join.infer_shape: z.shape must be computed without executing the
    Join op (outside FAST_COMPILE) and still give the correct result."""
    x1 = matrix()
    x2 = matrix()
    x3 = matrix()
    def get_mat(s1,s2):
        # Random (s1, s2) matrix in the configured float dtype.
        return numpy.asarray( numpy.random.uniform(size=(s1,s2)),
                              dtype= config.floatX)
    # Test dim 0
    z = join(0,x1,x2,x3)
    f = theano.function([x1,x2,x3], z.shape)
    out = f( get_mat(3,4), get_mat(2,4), get_mat(1,4))
    assert (out == [6,4]).all()
    if theano.config.mode != 'FAST_COMPILE':
        # The optimized shape graph must not contain the Join op itself.
        for node in f.maker.env.toposort():
            assert not isinstance(node.op, tensor.Join)
    # Test dim 1
    z = join(1,x1,x2,x3)
    f = theano.function([x1,x2,x3], z.shape)
    out = f( get_mat(3,4), get_mat(3,4), get_mat(3,5))
    assert (out == [3,13]).all()
    if theano.config.mode != 'FAST_COMPILE':
        for node in f.maker.env.toposort():
            assert not isinstance(node.op, tensor.Join)
    # Test hide error
    # Shape inference does not validate the non-joined dimensions, so a
    # mismatch only raises in modes that actually execute/check the op.
    if theano.config.mode in ['DebugMode', 'DEBUG_MODE', 'FAST_COMPILE']:
        self.assertRaises(ValueError, f, get_mat(3,4), get_mat(3,4), get_mat(2,5))
    else:
        f(get_mat(3,4), get_mat(3,4), get_mat(2,5))
class test_comparison(unittest.TestCase):
def test_gt(self):
......@@ -3198,7 +3234,8 @@ class test_grad(unittest.TestCase):
"""grad: Test returning a single zero value from grad"""
o = test_grad.O()
a1 = o.make_node()
g = grad(a1.outputs[0], a1.outputs[1])
g = grad(a1.outputs[0], a1.outputs[1],
assume_continuously_differentiable = True)
self.assertTrue(g.owner.op == fill)
self.assertTrue(g.owner.inputs[1].data == 0)
try:
......@@ -3211,7 +3248,8 @@ class test_grad(unittest.TestCase):
"""grad: Test returning some zero value from grad"""
o = test_grad.O()
a1 = o.make_node()
g0,g1,g2 = grad(a1.outputs[0], a1.inputs + [scalar('z')])
g0,g1,g2 = grad(a1.outputs[0], a1.inputs + [scalar('z')],
assume_continuously_differentiable = True)
self.assertTrue(o.gval0 is g0)
self.assertTrue(o.gval1 is g1)
self.assertTrue(g2.owner.op == fill)
......@@ -3220,7 +3258,8 @@ class test_grad(unittest.TestCase):
def test_zero_gradient_shape(self):
"""Ensure that a zero gradient has the proper shape."""
x = dmatrix()
f = theano.function([x], grad(dscalar(), x))
f = theano.function([x], grad(dscalar(), x,
assume_continuously_differentiable= True))
a = numpy.ones((3, 7))
self.assertTrue((f(a) == 0).all()) # Zero gradient.
self.assertTrue(a.shape == f(a).shape) # With proper shape.
......@@ -4158,9 +4197,19 @@ class test_broadcast(unittest.TestCase):
def test_infer_shape(self):
x = matrix()
y = addbroadcast(x,0)
y = addbroadcast(x, 0)
f = theano.function([x], y.shape)
assert (f(numpy.zeros((1,5), dtype=config.floatX)) == [1,5]).all()
topo = f.maker.env.toposort()
if theano.config.mode != 'FAST_COMPILE':
assert len(topo) == 2
assert isinstance(topo[0].op, opt.Shape_i)
assert isinstance(topo[1].op, opt.MakeVector)
x = matrix()
y = unbroadcast(x, 0)
f = theano.function([x], y.shape)
f(numpy.zeros((1,5), dtype=config.floatX))
assert (f(numpy.zeros((2,5), dtype=config.floatX)) == [2,5]).all()
topo = f.maker.env.toposort()
if theano.config.mode != 'FAST_COMPILE':
assert len(topo) == 3
......@@ -4168,6 +4217,18 @@ class test_broadcast(unittest.TestCase):
assert isinstance(topo[1].op, opt.Shape_i)
assert isinstance(topo[2].op, opt.MakeVector)
x = row()
y = unbroadcast(x, 0)
f = theano.function([x], y.shape)
assert (f(numpy.zeros((1,5), dtype=config.floatX)) == [1,5]).all()
topo = f.maker.env.toposort()
if theano.config.mode != 'FAST_COMPILE':
assert len(topo) == 2
assert isinstance(topo[0].op, opt.Shape_i)
assert isinstance(topo[1].op, opt.MakeVector)
def test_mod():
"""
We add this test as not all language and C implementation give the same
......
......@@ -787,13 +787,18 @@ class test_fusion(unittest.TestCase):
(fx-fy+theano.tensor.sqrt(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.sqrt(fzv),'float32'),
(fx-fy+theano.tensor.inv(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+(1/fzv),'float32'),#55
(fx-fy+theano.tensor.neg(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+(-fzv),'float32'),
# (fx-fy+theano.tensor.iround(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.round(fzv),'float32'),#TODO: trouble with the output type. To my understanding, numpy and c round fct return the same type as the input. Why we don't do this?
#TODO: BIT OP only with ints, xor, or, and, invert, cast
# (fx-theano.tensor.or_(fy,fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fy|fz),'float32'),
# (fx-theano.tensor.xor(fy,fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fy^fz),'float32'),
(fx-fy+theano.tensor.round(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.round(fzv),'float32'),
(ix-iy+theano.tensor.iround(fz),(ix,iy,fz),(ixv,iyv,fzv),1,ixv-iyv+numpy.round(fzv),'int64'),
# Bit op
(fx-theano.tensor.or_(iy,iz),(fx,iy,iz),(fxv,iyv,izv),1,fxv-(iyv|izv),'float64'),
(fx-theano.tensor.xor(iy,iz),(fx,iy,iz),(fxv,iyv,izv),1,fxv-(iyv^izv),'float64'),#60
(fx-theano.tensor.and_(iy,iz),(fx,iy,iz),(fxv,iyv,izv),1,fxv-(iyv&izv),'float64'),
(fx-theano.tensor.invert(iy),(fx,iy),(fxv,iyv),1,fxv-(~iyv),'float64'),
(fx-theano.tensor.cast(fy,dtype='float64'),(fx,fy),(fxv,fyv),1,
fxv-numpy.asarray(fyv,'float64'),'float64'),
(theano.tensor.pow(fx*fy+fz,fx*fy),(fx,fy,fz),(fxv,fyv,fzv),1,numpy.power(fxv*fyv+fzv,fxv*fyv),'float32'),
(fv+fy**fz,(fv,fy,fz),(fvv,fyv,fzv),2,fvv+fyv**fzv,'float32'),#fused with a dimshuffle
(fv+fy**fz,(fv,fy,fz),(fvv,fyv,fzv),2,fvv+fyv**fzv,'float32'),#fused with a dimshuffle #65
(fv-fy+tensor.tanh(fz),(fv,fy,fz),(fvv,fyv,fzv),2,fvv-fyv+numpy.tanh(fzv),'float32'),#fused with a dimshuffle
# Cases where the same input is reused many times.
......@@ -2753,6 +2758,14 @@ def test_local_add_specialize():
s = tensor.add(tensor.zeros_like(a))
assert local_add_specialize.transform(s.owner)
# Test when the 0 input is forcing upcasting
a = tensor.constant(0, dtype='int64')
b = tensor.constant(1, dtype='int32')
s = a + b
transformed = local_add_specialize.transform(s.owner)
assert transformed
assert transformed[0].type == s.type
def test_local_tensor_scalar_tensor():
dtypes = ['int8', 'int16', 'int32', 'int64',
'uint8', 'uint16', 'uint32', 'uint64',
......
......@@ -582,7 +582,8 @@ def makeSharedTester(shared_constructor_,
shp = (1024,1024)
#Test the case with all zeros element
for x in [numpy.asarray(numpy.random.rand(*shp), dtype=dtype),
rng = numpy.random.RandomState(utt.fetch_seed())
for x in [numpy.asarray(rng.rand(*shp), dtype=dtype),
numpy.zeros(shp, dtype=dtype)]:
zeros = (x==0).all()
x = self.cast_value(x)
......
......@@ -4,6 +4,7 @@
#
import unittest
import numpy
import theano
from theano import gof
from theano.gradient import *
......@@ -250,6 +251,15 @@ class test_grad_sources_inputs(unittest.TestCase):
self.assertTrue(g[a1.inputs[0]] == 6)
self.assertTrue(g[a1.inputs[1]] == 11)
def test_unimplemented_grad():
    """Evaluating an unimplemented_grad output must raise
    NotImplementedError."""
    vec = theano.tensor.vector()
    out = theano.gradient.unimplemented_grad(theano.tensor.add, 1, vec)
    fn = theano.function([vec], out)
    raised = False
    try:
        fn([1, 2, 3])
    except NotImplementedError:
        raised = True
    assert raised
if __name__ == '__main__':
unittest.main()
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论