Commit 98213004 authored by James Bergstra

merge w conflict in nvcc_compiler due to rpath thing

"""ProfileStats object for runtime and memory profiling.
"""
#
# TODO: measure memory usage like ProfileMode did
# TODO: put the optimization tips into a tips section??
# TODO: add tip to use specify_shape (is specify_shape even in library doc?)
# TODO: ensure field width for string fields makes columns line up
# TODO: what to do about 'diff summary'? (ask Fred?)
#
__authors__ = "James Bergstra"
__reviewer__ = "Razvan Pascanu"
__copyright__ = "(c) 2011, Universite de Montreal"
__license__ = "3-clause BSD License"
__contact__ = "theano-dev <theano-dev@googlegroups.com>"
__docformat__ = "restructuredtext en"
import atexit
import sys
import theano
config = theano.config
_atexit_print_list = []
_atexit_print_file = sys.stderr
def _atexit_print_fn():
"""Print ProfileStat objects in _atexit_print_list to _atexit_print_file
"""
for ps in _atexit_print_list:
if ps.fct_callcount or ps.compile_time > 0:
ps.summary(file=_atexit_print_file)
else:
print 'Skipping empty Profile'
atexit.register(_atexit_print_fn)
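# Usage sketch (hypothetical numbers, for illustration only): any ProfileStats
# created with atexit_print=True is appended to _atexit_print_list, so its
# summary is printed to stderr when the interpreter exits:
#
#     ps = ProfileStats(message='example')
#     ps.fct_callcount = 1       # pretend one Function.__call__ was recorded
#     ps.fct_call_time = 0.5
#     # at exit, _atexit_print_fn() calls ps.summary(file=sys.stderr)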
class ProfileStats(object):
"""
Object to store runtime and memory profiling information for all of
Theano's operations: compilation, optimization, execution.
"""
#
# Note on implementation:
# Class variables are used here so that each one can be
# documented and initialized together.
    # Dictionary variables are declared as None here and replaced by fresh
    # dicts in __init__.
#
compile_time = 0.0
# Total time spent in body of orig_function,
# dominated by graph optimization and compilation of C
#
fct_call_time = 0.0
# The total time spent in Function.__call__
#
fct_callcount = 0
# Number of calls to Function.__call__
#
vm_call_time = 0.0
# Total time spent in Function.fn.__call__
#
apply_time = None
# dict from node -> float runtime
#
apply_callcount = None
# dict from node -> number of executions
#
apply_cimpl = None
# dict from node -> bool (1 if c, 0 if py)
#
message = None
# pretty string to print in summary, to identify this output
#
outputs_size = None
# node -> size of allocated output
#
optimizer_time = 0.0
# time spent optimizing graph (FunctionMaker.__init__)
linker_time = 0.0
# time spent linking graph (FunctionMaker.create)
def __init__(self, atexit_print=True, **kwargs):
"""
atexit_print - bool. True means that this object will be printed to
stderr (using .summary()) at the end of the program.
**kwargs - misc initializers. These should (but need not) match the
names of the class vars declared in this class.
"""
self.apply_callcount = {}
        self.outputs_size = {}
self.apply_time = {}
self.apply_cimpl = {}
self.__dict__.update(kwargs)
#print >> sys.stderr, "self.message", self.message
if atexit_print:
global _atexit_print_list
_atexit_print_list.append(self)
def op_time(self):
"""dict op -> total time on thunks"""
# timing is stored by node, we compute timing by Op on demand
rval = {}
for node, t in self.apply_time.items():
rval.setdefault(node.op, 0)
rval[node.op] += t
return rval
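    # For intuition (hypothetical numbers): if self.apply_time is
    # {dot_node_1: 0.5, dot_node_2: 0.25, add_node: 0.1} and both dot nodes
    # share one Op instance, op_time() returns {dot: 0.75, add: 0.1}.
    # op_callcount() and op_nodes() below aggregate the same way.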
def op_callcount(self):
"""dict op -> total number of thunk calls"""
# timing is stored by node, we compute timing by Op on demand
rval = {}
for node, count in self.apply_callcount.items():
rval.setdefault(node.op, 0)
rval[node.op] += count
return rval
def op_nodes(self):
"""dict op -> total number of nodes"""
# timing is stored by node, we compute timing by Op on demand
rval = {}
for node, count in self.apply_callcount.items():
rval.setdefault(node.op, 0)
rval[node.op] += 1
return rval
    def op_impl(self):
        """dict op -> 'C ' or 'Py' depending on the implementation"""
        # implementation flag is stored by node; we compute it by Op on demand
rval = {}
for node in self.apply_callcount:
if self.apply_cimpl[node]:
rval[node.op] = 'C '
else:
rval[node.op] = 'Py'
return rval
    def op_flops(self):
        """dict op -> total number of flops (not ported from ProfileMode yet)"""
        rval = {}
        return rval  # TODO: continue here -- port the draft below
        # Draft from ProfileMode.  It only works for Ops that expose a `flops`
        # attribute, and the old code printed a HACK WARNING because the logic
        # does not always work; you need to know the internals of Theano to
        # use it correctly.
        #op_time = self.op_time()
        #op_call = self.op_callcount()
        #for a, t in op_time.items():
        #    if hasattr(a, 'flops'):
        #        rval[a] = a.flops * op_call[a] / t / 1e6  # MFlops/s
        #return rval
def summary_ops(self, file=sys.stderr, N=None):
if self.apply_time:
local_time = sum(self.apply_time.values())
else:
local_time = 0
if local_time == 0:
            print >> file, ('ProfileStats.summary_ops: total time 0'
                    ' (did you forget to enable counters?)')
return
op_time = self.op_time()
op_call = self.op_callcount()
op_apply = self.op_nodes()
op_flops = self.op_flops()
op_impl = self.op_impl()
        if N is None:
            N = len(op_time)
otimes = [(t*100/local_time,
t,
op,
op_impl.get(op, ' '),
op_call.get(op, 0),
op_apply.get(op,0))
for op, t in op_time.items()]
otimes.sort()
otimes.reverse()
tot=0
print >> file, 'Ops'
print >> file, '---'
        print >> file, '<% time> <cumulative %> <self seconds> <cumulative seconds> <time per call> <impl> <nb_call> <nb apply> <Op name>'
for f,t,a,impl,nb_call,nb_apply in otimes[:N]:
if nb_call == 0:
assert t == 0
continue
tot+=t
ftot=tot*100/local_time
if op_flops:
print >>file, ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %s %7.1f %5d %2d %s' % (
f, ftot, t, tot, t/nb_call, impl, op_flops.get(a,-1), nb_call, nb_apply, a)
else:
print >>file, ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %s %5d %2d %s' % (
f, ftot, t, tot, t/nb_call, impl, nb_call, nb_apply, a)
print >>file, ' ... (remaining %i Ops account for %6.2f%%(%.2fs) of the runtime)'\
%(max(0, len(otimes)-N),
sum(f for f, t, a, ci, nb_call, nb_op in otimes[N:]),
sum(t for f, t, a, ci, nb_call, nb_op in otimes[N:]))
print >> file, ''
def summary_nodes(self, file=sys.stderr, N=None):
if self.apply_time:
local_time = sum(self.apply_time.values())
else:
local_time = 0
if local_time == 0:
            print >> file, ('ProfileStats.summary_nodes: total time 0'
                    ' (did you forget to enable counters?)')
return
print >> file, 'Thunks'
print >> file, '------'
        print >> file, '<% time> <cumulative %> <apply time> <cumulative seconds> <time per call> <nb_call> <Apply Op name>'
atimes = [(
t*100/local_time,
t,
a,
self.apply_callcount[a])
for a, t in self.apply_time.items()]
atimes.sort()
atimes.reverse()
tot=0
        for (f, t, a, nb_call) in atimes[:N]:
            if nb_call == 0:
                continue
            tot += t
            ftot = tot*100 / local_time
print >> file, ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %i %s'%(
f, ftot, t, tot, t/nb_call,nb_call, str(a))
print >> file, ' ... (remaining %i Apply instances account for %.2f%%(%.2fs) of the runtime)'\
%(max(0, len(atimes)-N),
sum(f for f, t, a, nb_call in atimes[N:]),
sum(t for f, t, a, nb_call in atimes[N:]))
print >> file, ''
def summary_function(self, file):
print >> file, 'Function profiling'
print >> file, '=================='
print >> file, ' Message: %s'%self.message
print >> file, ' Time in %i calls to Function.__call__: %es' % (
self.fct_callcount, self.fct_call_time)
if self.fct_call_time>0:
print >> file, ' Time in Function.fn.__call__: %es (%.3f%%)' %(
self.vm_call_time, 100*self.vm_call_time / self.fct_call_time)
local_time = sum(self.apply_time.values())
if local_time > 0:
print >> file, ' Time in thunks: %es (%.3f%%)' %(
local_time, 100*local_time / self.fct_call_time)
print >> file, ''
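    # Example of summary_function output (numbers are illustrative):
    #
    #   Function profiling
    #   ==================
    #     Message: None
    #     Time in 10 calls to Function.__call__: 1.000000e-02s
    #     Time in Function.fn.__call__: 9.000000e-03s (90.000%)
    #     Time in thunks: 8.500000e-03s (85.000%)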
def summary(self, file=sys.stderr, n_ops_to_print=20, n_applies_to_print=20):
self.summary_function(file)
local_time = sum(self.apply_time.values())
if local_time > 0:
self.summary_ops(file, n_ops_to_print)
self.summary_nodes(file, n_applies_to_print)
else:
print >> file, " No node time accumulated (hint: try config profiling.time_thunks=1)"
if 0: # old code still to be ported from ProfileMode
def long_print(self, file=sys.stderr, fct_name=None, message=None,
n_apply_to_print=15, n_ops_to_print=20, print_apply=False):
"""
Print a readable summary of the stats.
param: n_apply_to_print the number of apply to print. Default 15.
param: n_ops_to_print the number of ops to print. Default 20.
"""
local_time = sum(self.apply_time.values())
print ''
print 'ProfileMode.long_print()'
print 'name = %s'%fct_name
print 'msg = %s'%message
print '---------------------------'
print ''
print 'Total time spent running thunks: %.3fs'% local_time
sop_time={}
sop_call={}
sop_op = {}
sop_c={} #map each op class to Bool. True iff all applies were done in c.
for a,t in op_time.items():
typ = type(a)
sop_time.setdefault(typ,0)
sop_time[typ]+=t
sop_op.setdefault(typ,0)
sop_op[typ]+=1
sop_c.setdefault(typ,True)
sop_c[typ]=sop_c[typ] and op_cimpl.get(a, False)
sop_call[typ]=sop_call.get(typ,0)+op_call[a]
            print '\nSingle Op-wise summary: <% of local_time spent on this kind of Op> <cumulative %> <self seconds> <cumulative seconds> <time per call> <nb_call> <nb_op> <Op name>'
sotimes = [(t*100/local_time, t, a, sop_c[a], sop_call[a], sop_op[a]) for a, t in sop_time.items()]
sotimes.sort()
sotimes.reverse()
tot=0
for f,t,a,ci, nb_call, nb_op in sotimes[:n_ops_to_print]:
if nb_call == 0:
assert t == 0
continue
tot+=t
ftot=tot*100/local_time
if ci:
msg = '*'
else:
msg = ' '
print ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %s %5d %2d %s' % (f, ftot, t, tot, t/nb_call, msg, nb_call, nb_op, a)
print ' ... (remaining %i Ops account for %.2f%%(%.2fs) of the runtime)'\
%(max(0, len(sotimes)-n_ops_to_print),
sum(f for f, t, a, ci, nb_call, nb_op in sotimes[n_ops_to_print:]),
sum(t for f, t, a, ci, nb_call, nb_op in sotimes[n_ops_to_print:]))
total_time = time.time() - import_time
total_fct_time = sum(fct_call_time.values())
total_fct_call = sum(fct_call.values())
other_time = total_time - total_fct_time - compile_time
print
print 'Theano fct summary: <% total fct time> <total time> <time per call> <nb call> <fct name>'
for key in fct_call.keys():
if fct_call[key]>0:
print ' %4.1f%% %.3fs %.2es %d %s'%(fct_call_time[key]/total_fct_time*100 ,fct_call_time[key],
fct_call_time[key]/fct_call[key], fct_call[key], key.name)
else:
print ' NOT CALLED',key.name
if total_fct_time>0:
time_pr_in_fct=local_time/total_fct_time*100
time_per_call=total_fct_time/total_fct_call
else:
time_pr_in_fct=0
time_per_call=0
print
print 'Time since import %.3fs'%(total_time)
print 'Compile time: %.3fs %.1f%%'%(compile_time, compile_time/total_time*100)
print 'Theano fct call %.3fs %.1f%%'%(total_fct_time,total_fct_time/total_time*100)
print ' Theano Op time (included in fct call, Time spent running thunks) %.3fs %.1f%%(of total) %.1f%%(of fct call)'% (local_time,local_time/total_time*100, time_pr_in_fct)
print 'Other time since import %.3fs %.1f%%'%(other_time,other_time/total_time*100)
print '%i Theano fct call, %.3fs per call'%(total_fct_call, time_per_call)
print
print "List of apply that don't have float64 as input but have float64 in outputs. Usefull to know if we forgot some cast when using floatX=float32 or gpu code."
print '<Apply> <Apply position> <fct name> <inputs type> <outputs type>'
for fct in fct_call.keys():
for idx, node in enumerate(fct.maker.env.toposort()):
if any(hasattr(i,'dtype') and i.dtype=='float64' for i in node.outputs) and not any(hasattr(i,'dtype') and i.dtype=='float64' for i in node.inputs):
print str(node), idx, fct.name, str([getattr(i,'dtype',None) for i in node.inputs]),str([getattr(i,'dtype',None) for i in node.outputs])
if any([x[2].__name__.startswith("Gpu") for x in sotimes]):
cpu=[]
gpu=[]
trans=[]
for so in sotimes:
if so[2].__name__ in ["HostFromGpu", "GpuFromHost"]:
trans.append(so)
elif so[2].__name__.startswith("Gpu"):
gpu.append(so)
else:
cpu.append(so)
sum_cpu=sum(so[1] for so in cpu)
sum_gpu=sum(so[1] for so in gpu)
sum_trans=sum(so[1] for so in trans)
print
print "Spent %.3fs(%.3f%%) in cpu Op, %.3fs(%.3f%%) in gpu Op and %.3fs(%.3f%%) transfert Op"%(
sum_cpu, sum_cpu/local_time*100, sum_gpu, sum_gpu/local_time*100, sum_trans, sum_trans/local_time*100)
print "Theano function input that are float64"
print "<fct name> <input name> <input type> <str input>"
for fct in fct_call.keys():
for i in fct.input_storage:
if hasattr(i.type, 'dtype') and i.type.dtype=='float64':
print fct.name, i.name, i.type, i
if outputs_size:
fct_memory={}#env->dict(node->(outputs size))
var_mem = {}
for node,val in outputs_size.items():
fct_memory.setdefault(node.env,{})
fct_memory[node.env][node]=val
for out,v in zip(node.outputs,val):
var_mem[out]=v
print
print "Profile of Theano functions memory:"
for env,nodes_mem in fct_memory.iteritems():
print "Theano fct:", [fct for fct in fct_call.keys() if fct.maker.env is env][0].name
size_sum=sum([sum(val) for key,val in nodes_mem.iteritems()])
print " Max without gc, inplace and view (KB)",size_sum/1024
node_memory_size = 0
node_memory_saved_by_view = 0
node_memory_saved_by_inplace = 0
running_memory_size = 0
running_max_memory_size = 0
post_thunk_old_storage = []
items = nodes_mem.items()
items.sort(key=lambda a: a[1])
items.reverse()
order = env.toposort()
computed, last_user = gc_helper(order)
for node in order:
post_thunk_old_storage.append([ input_idx
for input_idx,input in enumerate(node.inputs)
if (input in computed) and (input not in env.outputs) and node == last_user[input]])
for node,val in items[:n_apply_to_print]:
dmap = getattr(node.op,'destroy_map',None)
vmap = getattr(node.op,'view_map',None)
for idx,v in enumerate(val):
if dmap and idx in dmap:#TODO check the op returned a view
node_memory_saved_by_inplace += v
elif vmap and idx in vmap:#TODO check the op returned a view
node_memory_saved_by_view += v
else:
node_memory_size += v
running_memory_size += v
if running_memory_size > running_max_memory_size:
running_max_memory_size = running_memory_size
old_storage = post_thunk_old_storage[order.index(node)]
for old_s in old_storage:
running_memory_size -= var_mem[node.inputs[old_s]]
pass
pass
print " Max FAST_RUN_NO_GC (KB)", node_memory_size/1024
print " Max FAST_RUN (KB)", running_max_memory_size/1024
print " Memory saved by view (KB)", node_memory_saved_by_view/1024
print " Memory saved by inplace (KB)", node_memory_saved_by_inplace/1024
print " Memory saved by GC (KB)", (node_memory_size-running_max_memory_size)/1024
n_apply_to_print+=10#TODO remove this line
print " <Sum apply outputs (bytes)> <Apply outputs memory size(bytes)> <created/inplace/view> <Apply node>"
print " <created/inplace/view> is taked from the op declaration, not the op exeuction. Use DebugMode to have warning about inplace/view declaration being respected."
for key,val in items[:n_apply_to_print]:
                    code = ['c']*len(key.outputs)
for out,inp in getattr(key.op,'destroy_map',{}).iteritems():
code[out] = "i"
for out,inp in getattr(key.op,'view_map',{}).iteritems():
code[out] = "v"
print ' %9dB %s %s %s' % (sum(val), str(val), ' '.join(code), key)
                print '   ... (remaining %i Apply nodes account for %d bytes, %.2f%% of the total)'\
                    %(max(0, len(nodes_mem)-n_apply_to_print),
                      sum(sum(val) for key, val in items[n_apply_to_print:]),
                      100*sum(sum(val) for key, val in items[n_apply_to_print:])/size_sum)
print
print "Here are tips to potentially make your code run faster (if you think of new ones, suggest them on the mailing list). Test them first as they are not guaranteed to always provide a speedup."
from theano import tensor as T
from theano.tensor.raw_random import RandomFunction
import theano
import theano.scalar as scal
scalar_op_amdlibm_no_speed_up = [scal.LT, scal.GT, scal.LE, scal.GE, scal.EQ, scal.NEQ, scal.InRange, scal.Switch, scal.OR, scal.XOR, scal.AND, scal.Invert, scal.Maximum, scal.Minimum, scal.Add, scal.Mul, scal.Sub, scal.TrueDiv, scal.IntDiv, scal.Clip, scal.First, scal.Second, scal.Identity, scal.Cast, scal.Sgn, scal.Neg, scal.Inv, scal.Sqr ]
scalar_op_amdlibm_speed_up = [scal.Mod, scal.Pow, scal.Ceil, scal.Floor, scal.RoundHalfToEven, scal.RoundHalfAwayFromZero, scal.Log, scal.Log2, scal.Log10, scal.Log1p, scal.Exp, scal.Sqrt, scal.Abs, scal.Cos, scal.Sin, scal.Tan, scal.Tanh, scal.Cosh, scal.Sinh, T.nnet.sigm.ScalarSigmoid, T.nnet.sigm.ScalarSoftplus ]#Abs, Mod in float{32,64} only
def get_scalar_ops(s):
if isinstance(s, theano.scalar.Composite):
l = []
for node in s.env.toposort():
l+=get_scalar_ops(node.op)
return l
else: return [s]
def list_scalar_op(op):
if isinstance(op.scalar_op, theano.scalar.Composite):
return get_scalar_ops(op.scalar_op)
else: return [op.scalar_op]
def amdlibm_speed_up(op):
if not isinstance(op, T.Elemwise):
return False
else:
l = list_scalar_op(op)
for s_op in l:
if s_op.__class__ in scalar_op_amdlibm_speed_up:
return True
                        elif s_op.__class__ not in scalar_op_amdlibm_no_speed_up:
                            print "We don't know if amdlibm will accelerate this scalar op.", s_op
                    return False
def exp_float32_op(op):
if not isinstance(op, T.Elemwise):
return False
else:
l = list_scalar_op(op)
return any([s_op.__class__ in [scal.Exp] for s_op in l])
#tip 1
if config.floatX=='float64':
print " - Try the Theano flag floatX=float32"
#tip 2
            if not config.lib.amdlibm and any([amdlibm_speed_up(a.op) for a in apply_time]):
                print "  - Try installing amdlibm and set the Theano flag lib.amdlibm=True. This speeds up only some Elemwise operations."
#tip 3
            if not config.lib.amdlibm and any([exp_float32_op(a.op) and a.inputs[0].dtype == 'float32' for a in apply_time]):
                print "  - With the default gcc libm, exp in float32 is slower than in float64! Try the Theano flag floatX=float64, or install amdlibm and set the Theano flag lib.amdlibm=True"
#tip 4
for a, t in apply_time.iteritems():
node = a
if isinstance(node.op, T.Dot) and all([ len(i.type.broadcastable)==2 for i in node.inputs]):
print " - You have a dot operation that was not optimized to dot22 that is faster. Make sure the inputs are float32 or 64 and are the same for both input. Currently they are:",[i.type for i in node.inputs]
#tip 5
for a, t in apply_time.iteritems():
node = a
if isinstance(node.op, RandomFunction):
print " - Replace the default random number generator by 'from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams' as this is is faster. It is still experimental, but seam to work correctly."
if config.device.startswith("gpu"):
print " - MRG_RandomStreams is the only random number supported on the GPU."
break
def print_summary(self,
n_apply_to_print=config.ProfileMode.n_apply_to_print,
n_ops_to_print=config.ProfileMode.n_ops_to_print):
""" Print 3 summary that show where the time is spend. The first show an Apply-wise summary, the second show an Op-wise summary, the third show an type-Op-wise summary.
The Apply-wise summary print the timing information for the worst offending Apply nodes. This corresponds to individual Op applications within your graph which take the longest to execute (so if you use dot twice, you will see two entries there).
The Op-wise summary print the execution time of all Apply nodes executing the same Op are grouped together and the total execution time per Op is shown (so if you use dot twice, you will see only one entry there corresponding to the sum of the time spent in each of them). If two Op have different hash value, they will be separate.
The type-Op-wise summary group the result by type of op. So event if two Op have different hash value, they will be merged.
Their is an hack with the Op-wise summary. Go see it if you want to know more.
:param n_apply_to_print: the number of apply to print. Default 15, or n_ops_to_print flag.
:param n_ops_to_print: the number of ops to print. Default 20, or n_apply_to_print flag.
"""
fct_call_time = self.mode.fct_call_time
fct_call = self.mode.fct_call
apply_time = self.apply_time
op_cimpl = self.op_cimpl
message = self.message
outputs_size = self.outputs_size
self.print_summary_("print_summary",
None,
None,
None,
apply_time,
op_cimpl,
message,
outputs_size,
n_apply_to_print,
n_ops_to_print)
def print_diff_summary(self, other, n_apply_to_print=15, n_ops_to_print=20):
""" As print_summary, but print the difference on two different profile mode.
TODO: Also we don't print the Apply-wise summary as it don't work for now.
TODO: make comparaison with gpu code.
:param other: the other instance of ProfileMode that we want to be compared to.
:param n_apply_to_print: the number of apply to print. Default 15.
:param n_ops_to_print: the number of ops to print. Default 20.
"""
def diff_dict(a_time,b_time_):
r = {}
b_time = copy.copy(b_time_)
for a,ta in a_time.items():
r.setdefault(a,0)
tb = b_time.pop(a,0)
r[a]+=ta-tb
#they are missing in a
for a,t in b_time.items():
r.setdefault(a,0)
r[a]+=t
return r
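        # A tiny worked example of diff_dict (hypothetical values):
        #     diff_dict({'a': 3.0, 'b': 1.0}, {'a': 1.0, 'c': 2.0})
        #     -> {'a': 2.0, 'b': 1.0, 'c': 2.0}
        # Keys present in either dict appear in the result; note that, as
        # written, keys present only in b_time_ keep a positive sign.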
compile_time = self.compile_time-other.compile_time
fct_call_time = diff_dict(self.fct_call_time,other.fct_call_time)
fct_call = diff_dict(self.fct_call,other.fct_call)
apply_time = diff_dict(self.apply_time, other.apply_time)
op_cimpl = self.op_cimpl and other.op_cimpl
message = self.message
outputs_size = diff_dict(self.outputs_size,other.outputs_size)
self.print_summary_("print_diff_summary", compile_time, fct_call_time, fct_call,
apply_time, op_cimpl, message, outputs_size,
n_apply_to_print=n_apply_to_print,
n_ops_to_print=n_ops_to_print, print_apply=False)
"""
Test compilation modes
"""
from nose.plugins.skip import SkipTest
import unittest
import theano
from theano import tensor as T  # T is used below but was not imported
import numpy
import random
import numpy.random
from theano.tests import unittest_tools as utt
class T_bunch_of_modes(unittest.TestCase):
def test1(self):
# this is a quick test after the LazyLinker branch merge
# to check that all the current modes can still be used.
linker_classes_involved = []
for modename in theano.config.__class__.__dict__['mode'].all:
x = T.matrix()
y = T.vector()
f = theano.function([x,y], x+y, mode=modename)
# test that it runs something
f([[1,2],[3,4]], [5, 6])
linker_classes_involved.append(f.maker.mode.linker.__class__)
print 'MODE:', modename, f.maker.mode.linker, 'stop'
# regression check:
# there should be
# - VM_Linker
# - OpWiseCLinker (FAST_RUN)
# - WrapLinker (PROFILE_MODE)
# - PerformLinker (FAST_COMPILE)
# - DebugMode's Linker (DEBUG_MODE)
assert 5 == len(set(linker_classes_involved))
if __name__ == '__main__':
unittest.main()
@@ -146,7 +146,7 @@ from link import \
     Container, Linker, LocalLinker, PerformLinker, WrapLinker, WrapLinkerMany
 from op import \
-    Op
+    Op, PureOp
 from opt import (Optimizer, optimizer, SeqOptimizer,
         MergeOptimizer, MergeOptMerge,
...
@@ -13,7 +13,7 @@ AddConfigVar('nvcc.compiler_bindir',
         "If defined, nvcc compiler driver will seek g++ and gcc in this directory",
         StrParam(""))
-AddConfigVar('cuda.nvccflags',
+AddConfigVar('nvcc.flags',
         "Extra compiler flags for nvcc",
         StrParam(""))
@@ -183,11 +183,9 @@ def nvcc_module_compile_str(
     if sys.platform != 'darwin':
         # the 64bit CUDA libs are in the same files as are named by the function above
         rpaths.append(os.path.join(config.cuda.root, 'lib64'))
     for rpath in rpaths:
         cmd.extend(['-Xlinker', ','.join(['-rpath', rpath])])
-    nvccflags = [flag for flag in config.cuda.nvccflags.split(' ') if flag]
-    cmd.extend(nvccflags)
+    cmd.extend([flag for flag in config.nvcc.flags.split(' ') if flag])
     cmd.extend('-I%s' % idir for idir in include_dirs)
     cmd.extend(['-o', lib_filename])
     cmd.append(os.path.split(cppfilename)[-1])
...
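(The renamed config variable above is how users pass extra nvcc options. A
minimal sketch, assuming the usual THEANO_FLAGS mechanism of this era; the
architecture value is illustrative:)

    # from the shell, before running a GPU script:
    #     THEANO_FLAGS='nvcc.flags=-arch=sm_13' python my_script.py
    # or from Python, before any GPU module is compiled:
    import theano
    theano.config.nvcc.flags = '-arch=sm_13'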
@@ -133,6 +133,79 @@ def sp_ones_like(x):
     data, indices, indptr, shape = csm_properties(x) #TODO: don't restrict to CSM formats
     return CSM(format=x.format)(tensor.ones_like(data), indices, indptr, shape)
+class _sparse_py_operators:
+    T = property(lambda self: transpose(self), doc="Return aliased transpose of self (read-only)")
+    def __neg__(self): return neg(self)
+    def __add__(left, right): return add(left, right)
+    def __radd__(right, left): return add(left, right)
+    def __sub__(left, right): return sub(left, right)
+    def __rsub__(right, left): return sub(left, right)
+    def __mul__(left, right): return mul(left, right)
+    def __rmul__(left, right): return mul(left, right)
+
+    #extra pseudo-operator symbols
+    def __dot__(left, right): return structured_dot(left, right)
+    def __rdot__(right, left): return structured_dot(left, right)
+
+    #N.B. THIS IS COMMENTED OUT ON PURPOSE!!!
+    # Discussion with Fred & James (at least, and maybe others before)
+    # we decided that casting from a sparse to dense should be explicit
+    # because it's usually something you want to be pretty careful about,
+    # and not to do by accident.
+    #def _as_TensorVariable(self):
+    #    return dense_from_sparse(self)
+
+    shape = property(lambda self: tensor.shape(dense_from_sparse(self)))  # don't worry!
+    # ... the plan is that the ShapeFeature in tensor.opt will do shape propagation
+    # ... and remove the dense_from_sparse from the graph.  This will *NOT* actually expand
+    # ... your sparse matrix just to get the shape.
+    ndim = property(lambda self: self.type.ndim)
+    dtype = property(lambda self: self.type.dtype)
+
+class SparseVariable(gof.Variable, _sparse_py_operators):
+    dtype = property(lambda self: self.type.dtype)
+    format = property(lambda self: self.type.format)
+    def __str__(self):
+        return '%s{%s,%s}'%(
+                self.__class__.__name__,
+                self.format,
+                self.dtype)
+    def __repr__(self):
+        return str(self)
+
+class SparseConstantSignature(tuple):
+    def __eq__(self, other):
+        (a, b), (x, y) = self, other
+        return a == x \
+                and (b.dtype == y.dtype)\
+                and (type(b) == type(y))\
+                and (b.shape == y.shape)\
+                and (abs(b-y).sum() < 1e-6 * b.nnz)
+    def __hash__(self):
+        (a, b) = self
+        return hash(type(self)) ^ hash(a) ^ hash(type(b))
+
+class SparseConstant(gof.Constant, _sparse_py_operators):
+    dtype = property(lambda self: self.type.dtype)
+    format = property(lambda self: self.type.format)
+    def signature(self):
+        assert self.data is not None
+        return SparseConstantSignature((self.type, self.data))
+    def __str__(self):
+        return '%s{%s,%s,shape=%s,nnz=%s}'%(
+                self.__class__.__name__,
+                self.format,
+                self.dtype,
+                self.data.shape,
+                self.data.nnz)
+    def __repr__(self):
+        return str(self)
+
+class SparseValue(gof.Value, _sparse_py_operators):
+    dtype = property(lambda self: self.type.dtype)
+    format = property(lambda self: self.type.format)
+
 class SparseType(gof.Type):
     """
@@ -149,6 +222,9 @@ class SparseType(gof.Type):
     dtype_set = set(['int', 'int8', 'int16', 'int32', 'int64', 'float32', 'float64', 'complex64', 'complex128'])
     ndim = 2
+    Variable = SparseVariable
+    Constant = SparseConstant
+
     def __init__(self, format, dtype):
         """
         Fundamental way to create a sparse node.
@@ -248,65 +324,6 @@ csr_dmatrix = SparseType(format='csr', dtype='float64')
 csc_fmatrix = SparseType(format='csc', dtype='float32')
 csr_fmatrix = SparseType(format='csr', dtype='float32')
-class _sparse_py_operators:
-    T = property(lambda self: transpose(self), doc="Return aliased transpose of self (read-only)")
-    def __neg__(self): return neg(self)
-    def __add__(left, right): return add(left, right)
-    def __radd__(right, left): return add(left, right)
-    def __sub__(left, right): return sub(left, right)
-    def __rsub__(right, left): return sub(left, right)
-    def __mul__(left, right): return mul(left, right)
-    def __rmul__(left, right): return mul(left, right)
-
-    #extra pseudo-operator symbols
-    def __dot__(left, right): return structured_dot(left, right)
-    def __rdot__(right, left): return structured_dot(left, right)
-
-    #N.B. THIS IS COMMENTED OUT ON PURPOSE!!!
-    # Discussion with Fred & James (at least, and maybe others before)
-    # we decided that casting from a sparse to dense should be explicit
-    # because it's usually something you want to be pretty careful about,
-    # and not to do by accident.
-    #def _as_TensorVariable(self):
-    #    return dense_from_sparse(self)
-
-    shape = property(lambda self: tensor.shape(dense_from_sparse(self)))  # don't worry!
-    # ... the plan is that the ShapeFeature in tensor.opt will do shape propagation
-    # ... and remove the dense_from_sparse from the graph.  This will *NOT* actually expand
-    # ... your sparse matrix just to get the shape.
-    ndim = property(lambda self: self.type.ndim)
-    dtype = property(lambda self: self.type.dtype)
-
-class SparseVariable(gof.Variable, _sparse_py_operators):
-    dtype = property(lambda self: self.type.dtype)
-    format = property(lambda self: self.type.format)
-
-class SparseConstantSignature(tuple):
-    def __eq__(self, other):
-        (a, b), (x, y) = self, other
-        return a == x \
-                and (b.dtype == y.dtype)\
-                and (type(b) == type(y))\
-                and (b.shape == y.shape)\
-                and (abs(b-y).sum() < 1e-6 * b.nnz)
-    def __hash__(self):
-        (a, b) = self
-        return hash(type(self)) ^ hash(a) ^ hash(type(b))
-
-class SparseConstant(gof.Constant, _sparse_py_operators):
-    dtype = property(lambda self: self.type.dtype)
-    format = property(lambda self: self.type.format)
-    def signature(self):
-        assert self.data is not None
-        return SparseConstantSignature((self.type, self.data))
-
-class SparseValue(gof.Value, _sparse_py_operators):
-    dtype = property(lambda self: self.type.dtype)
-    format = property(lambda self: self.type.format)
-
 # CONSTRUCTION
 class CSMProperties(gof.Op):
     """Extract all of .data .indices and .indptr"""
...
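(With the Variable/Constant class attributes registered above, instantiating
a SparseType yields a SparseVariable carrying the _sparse_py_operators mixin.
A minimal sketch of what this enables; the operator support and import path
are taken from the diff, the rest is illustrative:)

    from theano import sparse
    x = sparse.SparseType(format='csc', dtype='float64')()  # a SparseVariable
    y = -x + x          # __neg__/__add__ come from _sparse_py_operators
    print x.format, x.ndim, x.dtype    # csc 2 float64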
@@ -937,6 +937,9 @@ def _gemm_from_node2(node):
     lst = _factor_canonicalized(lst)
     rval = _gemm_from_factored_list(lst)
     #print "RVAL", rval
+    # THIS GOT COMMENTED OUT AT SOME POINT - ASK P.Lamblin maybe why?
+    #if rval:
+    #    assert rval[0].type == node.outputs[0].type, (rval[0].type, node.outputs[0].type)
     if rval and (rval[0].type == node.outputs[0].type):
         return rval
...
@@ -3057,30 +3057,33 @@ def constant_folding(node):
     for input in node.inputs:
         if not isinstance(input, Constant):
             return False
-    try:
-        storage = [[None] for output in node.outputs]
-        node.op.perform(node, [x.data for x in node.inputs], storage)
-    except MethodNotDefined:
-        tmp_inputs = [x.type() for x in node.inputs]
-        f = compile.function(
-                inputs=tmp_inputs,
-                outputs=node.op.make_node(*tmp_inputs).outputs,
-                mode=compile.Mode(linker='c|py', optimizer=None))
-        xvals = f(*[x.data for x in node.inputs])
-        storage = [[xv] for xv in xvals]
-    msg = []
-    assert len(storage) == len(node.outputs)
-    for s, output in zip(storage, node.outputs):
+    #condition: all inputs are constant
+
+    storage_map = dict([(i, [i.data]) for i in node.inputs])
+    compute_map = dict([(i, [True]) for i in node.inputs])
+    for o in node.outputs:
+        storage_map[o] = [None]
+        compute_map[o] = [False]
+
+    thunk = node.op.make_thunk(node, storage_map, compute_map,
+            no_recycling=[])
+
+    required = thunk()
+    assert not required  # a node whose inputs are all provided should always
+                         # return successfully
+
+    rval = []
+    for output in node.outputs:
+        assert compute_map[output][0], (output, storage_map[output][0])
         try:
             constant = output.type.Constant
-        except:
+        except AttributeError:
             constant = Constant
-        msg += [constant(output.type, s[0])]
-    return msg
+        rval.append(constant(output.type, storage_map[output][0]))
+    return rval
 register_canonicalize(constant_folding, 'fast_compile')
-register_stabilize(constant_folding) # because
+register_stabilize(constant_folding)
 register_specialize(constant_folding)
 def _is_1(expr):
...
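(For intuition, the optimization above returns Constant outputs for any node
whose inputs are all Constants. A minimal sketch of the visible effect,
assuming the tensor API of this era; values are illustrative:)

    import numpy, theano
    from theano import tensor
    x = tensor.constant(numpy.asarray([1.0, 2.0]))
    f = theano.function([], x * 2 + 1)
    # with constant_folding registered, the compiled graph is just a
    # precomputed constant, so no multiply/add runs at call time
    print f()    # [ 3.  5.]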
@@ -49,11 +49,14 @@ class T_random_function(unittest.TestCase):
         rng_R = random_state_type()
         # use make_node to override some of the self.args
-        post_r2, out2 = rf2(rng_R, (4,), -2, 2)
-        post_r2_4, out2_4 = rf2(rng_R, (4,), -4.0, 2)
-        post_r2_4_4, out2_4_4 = rf2(rng_R, (4,), -4.0, 4.0)
-        post_r4, out4 = rf4(rng_R, (4,), -4, 4)
-        # configure out4 to be computed inplace
+        post_r2, out2 = rf2(rng_R, (4,), -2, 2)              # NOT INPLACE
+        post_r4, out4 = rf4(rng_R, (4,), -4, 4)              # INPLACE
+        post_r2_4, out2_4 = rf2(rng_R, (4,), -4.0, 2)        # NOT INPLACE
+        post_r2_4_4, out2_4_4 = rf2(rng_R, (4,), -4.0, 4.0)  # NOT INPLACE
+
+        # The update expression means that the random state rng_R will
+        # be maintained by post_r4
         f = compile.function(
                 [compile.In(rng_R,
                     value=numpy.random.RandomState(utt.fetch_seed()),
@@ -65,9 +68,25 @@ class T_random_function(unittest.TestCase):
         f2, f4, f2_4, f2_4_4 = f()
         f2b, f4b, f2_4b, f2_4_4b = f()
-        assert numpy.allclose(f2*2, f4)
-        assert numpy.allclose(f2_4_4, f4)
-        assert not numpy.allclose(f4, f4b)
+        print f2
+        print f4
+        print f2_4
+        print f2_4_4
+
+        #print f2b
+        #print f4b
+        #print f2_4b
+        #print f2_4_4b
+
+        # setting bounds is same as multiplying by 2
+        assert numpy.allclose(f2*2, f4), (f2, f4)
+
+        # retrieving from non-inplace generator
+        # is same as inplace one for first call
+        assert numpy.allclose(f2_4_4, f4), (f2_4_4, f4)
+
+        # f4 changes from call to call, that the update has worked
+        assert not numpy.allclose(f4, f4b), (f4, f4b)
     def test_inplace_optimization(self):
         """Test that FAST_RUN includes the random_make_inplace optimization"""
...
@@ -13,19 +13,32 @@ from theano.tests import unittest_tools as utt
     should ensure that it will remain operational
     '''
-class T_diverse(unittest.TestCase):
+class T_scipy(unittest.TestCase):
     def setUp(self):
         utt.seed_rng()
+        self.orig_floatX = theano.config.floatX
+
+    def tearDown(self):
+        theano.config.floatX = self.orig_floatX
-    def scipy_paper_example1(self):
+    def test_scipy_paper_example1(self):
         a = theano.tensor.vector('a')   # declare variable
         b = a + a**10                   # build expression
         f = theano.function([a], b)    # compile function
         assert numpy.all(f([0,1,2]) == numpy.array([0,2,1026]))
-    def scipy_papaer_example2(self):
+    def test_scipy_paper_example2(self):
         ''' This just sees if things compile well and if they run '''
+        # PREAMBLE
+        T = theano.tensor
+        shared = theano.shared
+        function = theano.function
+        rng = numpy.random
+        theano.config.floatX = 'float64'
+
+        #
+        # ACTUAL SCRIPT FROM PAPER
+        #
         x = T.matrix()
         y = T.vector()
         w = shared(rng.randn(100))
@@ -52,6 +65,7 @@ class T_diverse(unittest.TestCase):
         for i in range(training_steps):
             pred, err = train(D[0], D[1])
 if __name__ == '__main__':
     unittest.main()