提交 91a31814 authored 作者: lamblin's avatar lamblin

Merge pull request #1277 from nouiz/memory_prof

Memory profiler in the new profiler
......@@ -106,6 +106,38 @@ default values.
*Default:* ``id(self)``
.. method:: get_shape_info(obj)
Optional. Only needed to profile the memory of this Type of object
Return the information needed to compute the memory size of obj.
The memory size is only the data, so this excludes the container.
For an ndarray, this is the data, but not the ndarray object and
other data structures such as shape and strides.
get_shape_info() and get_size() work in tandem for the memory profiler.
get_shape_info() is called during the execution of the function.
So it is better that it is not too slow.
get_size() will be called with the output of this function
when printing the memory profile.
:param obj: The object that this Type represents during execution
:return: Python object that self.get_size() understands
.. method:: get_size(shape_info)
Number of bytes taken by the object represented by shape_info
Optional. Only needed to profile the memory of this Type of object
:param shape_info: the output of the call to get_shape_info()
:return: the number of bytes taken by the object described in
shape_info.
"""
For each method, the *default* is what ``Type`` defines
for you. So, if you create an instance of ``Type`` or an
instance of a subclass of ``Type``, you
......
......@@ -273,6 +273,15 @@ import theano and print the config variable, as in:
Do the vm/cvm linkers profile the execution of Theano functions?
.. attribute:: profile_memory
Bool value: either True or False
Default False
Do the vm/cvm linkers profile the memory of Theano functions and print it?
It only works when profile=True.
.. attribute:: profile_optimizer
Bool value: either True or False
......@@ -280,6 +289,26 @@ import theano and print the config variable, as in:
Default False
Do the vm/cvm linkers profile the optimization phase when compiling a Theano function?
It only works when profile=True.
.. attribute:: profiling.n_apply
Positive int value, default: 20.
The number of apply nodes to print in the profiler output
.. attribute:: profiling.n_ops
Positive int value, default: 20.
The number of ops to print in the profiler output
.. attribute:: profiling.min_memory_size
Positive int value, default: 1024.
For the memory profile, do not print apply nodes if the size
of their outputs (in bytes) is lower than this.
.. attribute:: config.lib.amdlibm
......
......@@ -60,7 +60,7 @@ from theano.compile import \
FunctionMaker, function, OpFromGraph, \
Component, External, Member, Method, \
Composite, ComponentList, ComponentDict, Module, \
ProfileMode, \
ProfileMode, ProfileStats, \
Param, shared
from theano.misc.safe_asarray import _asarray
......
......@@ -16,10 +16,12 @@ from theano.compile.debugmode import DebugMode
from theano.compile.monitormode import MonitorMode
from theano.compile.profiling import ProfileStats, ScanProfileStats
from theano.compile.profilemode import ProfileMode
from theano.compile.sharedvalue import shared, shared_constructor, SharedVariable
from theano.compile.sharedvalue import (shared, shared_constructor,
SharedVariable)
from theano.compile.pfunc import pfunc, Param, rebuild_collect_shared
from theano.compile.function import function
import atexit
import copy
import os
import time
import theano
from theano.gof.link import WrapLinker
from theano.compile.mode import (Mode, register_mode,
predefined_modes, predefined_linkers,
......@@ -42,6 +44,18 @@ class Profile_Maker(FunctionMaker):
def create(self, input_storage=None, trustme=False):
ret = super(Profile_Maker, self).create(input_storage, trustme)
if (hasattr(theano, 'sandbox') and
hasattr(theano.sandbox, 'cuda') and
theano.sandbox.cuda.cuda_enabled):
if os.environ.get('CUDA_LAUNCH_BLOCKING', '0') != '1':
raise Exception(
"You are running Theano profiler with CUDA enabled."
" Theano GPU ops execution are asynchron by default."
" So by default, the profile is useless."
" You must use set the environment variable"
" CUDA_LAUNCH_BLOCKING to 1 to tell the CUDA drvier to"
" synchonize the execution to get meaning full profile.")
# create a function-specific storage container for profiling info
profile = ProfileStats(atexit_print=False)
self.mode.profile_stats[ret] = profile
......@@ -50,7 +64,6 @@ class Profile_Maker(FunctionMaker):
#initialize the timers
for i, node in enumerate(ret.maker.fgraph.toposort()):
profile.apply_time[node] = 0.0
profile.outputs_size[node] = [0.0] * len(node.outputs)
# a thunk_group is a list of the thunks from each linker
# corresponding to the i'th position in the toposort.
......@@ -70,13 +83,13 @@ class Profile_Maker(FunctionMaker):
def new_fn():
self.mode.apply_time = self.mode.profile_stats[ret].apply_time
self.mode.outputs_size = self.mode.profile_stats[ret].outputs_size
self.mode.variable_shape = self.mode.profile_stats[ret].variable_shape
ret_fn()
# delete the old apply_time variable
# because it doesn't mean the same thing anymore.
# This prevents old code from looking like it still works.
del self.mode.apply_time
del self.mode.outputs_size
del self.mode.variable_shape
ret.fn = new_fn
......@@ -165,34 +178,10 @@ class ProfileMode(Mode):
t0 = time.time()
th()
dt = time.time() - t0
size = []
for o in th.outputs:
if not hasattr(o[0], 'size'):
#if the output type don't have a size attribute, set -1
#to signify we can't evaluate it.
#This happen at least for mtrand.RandomState type(in numpy)
size.append(-1)
continue
s = o[0].size
#can't use o[0].dtype.itemsize as dtype is a str for
#CudaNdarray
dtype = str(o[0].dtype)
dtype2 = dtype[-2:]
if dtype2 == '32':
s *= 4
elif dtype2 == '64':
s *= 8
elif dtype2 == '16':
s *= 2
elif dtype[-1] == '8':
s *= 1
elif dtype[-3:] == '128':
s *= 16
else:
raise Exception("Can't determine the memory size of dtype",
o[0].dtype)
size.append(s)
self.outputs_size[node] = size
for var, data in zip(node.outputs, th.outputs):
sh = getattr(data[0], 'shape', 'input no shape')
self.variable_shape[var] = sh
self.apply_time[node] += max(dt, 1e-14)
self.provided_linker = linker
......@@ -247,15 +236,14 @@ class ProfileMode(Mode):
print i, n
apply_cimpl = {}
outputs_size = {}
for fn, ps in self.profile_stats.items():
apply_cimpl.update(ps.apply_cimpl)
message = self.message
outputs_size = {}
variable_shape = {}
for fn, ps in self.profile_stats.items():
outputs_size.update(ps.outputs_size)
variable_shape.update(ps.variable_shape)
other_time = dict(
linker_time=sum(
......@@ -265,7 +253,7 @@ class ProfileMode(Mode):
self.print_summary_("print_summary",
compile_time, fct_call_time, fct_call,
apply_time, apply_cimpl, message, outputs_size,
apply_time, apply_cimpl, message, variable_shape,
self.local_time, other_time,
**kwargs)
......@@ -300,19 +288,19 @@ class ProfileMode(Mode):
apply_time = diff_dict(self.apply_time, other.apply_time)
apply_cimpl = self.apply_cimpl and other.apply_cimpl
message = self.message
outputs_size = diff_dict(self.outputs_size, other.outputs_size)
variable_shape = diff_dict(self.variable_shape, other.variable_shape)
other_time = {'linker_time': self.linker_time - other.linker_time,
'optimizer_time': self.optimizer_time -
other.optimizer_time}
self.print_summary_("print_diff_summary", compile_time,
fct_call_time, fct_call,
apply_time, apply_cimpl, message, outputs_size,
apply_time, apply_cimpl, message, variable_shape,
print_apply=False, other_time=other_time,
**kwargs)
@staticmethod
def print_summary_(fct_name, compile_time, fct_call_time, fct_call,
apply_time, apply_cimpl, message, outputs_size,
apply_time, apply_cimpl, message, variable_shape,
local_time, other_time,
n_apply_to_print=config.ProfileMode.n_apply_to_print,
n_ops_to_print=config.ProfileMode.n_ops_to_print,
......@@ -329,6 +317,10 @@ class ProfileMode(Mode):
whose outputs memory size is lower then that.
"""
print "ProfileMode is deprecated! Use the new profiler."
print " The Theano flags to enable it ise: profile=True"
print " The Theano flags for the memory profile to it is: profile_memory=True"
total_time = time.time() - import_time
total_fct_time = sum(fct_call_time.values())
total_fct_call = sum(fct_call.values())
......@@ -502,96 +494,17 @@ class ProfileMode(Mode):
print '(*) Op is running a c implementation'
for printer in profiler_printers:
printer(fct_name, compile_time, fct_call_time, fct_call,
apply_time, apply_cimpl, message, outputs_size,
apply_time, apply_cimpl, message, variable_shape,
other_time)
if not outputs_size:
if not variable_shape:
print """\nProfile of Theano intermediate memory disabled.
To enabled, put the Theano flag ProfileMode.profile_memory to True."""
else:
fct_memory={}#fgraph->dict(node->(outputs size))
var_mem = {}
for node, val in outputs_size.items():
fct_memory.setdefault(node.fgraph, {})
fct_memory[node.fgraph][node]=val
for out,v in zip(node.outputs,val):
var_mem[out]=v
print
print "Profile of Theano functions memory:"
print "(This check only the output of each apply node. It don't check the temporary memory used by the op in the apply node.)"
nb_skipped = 0
for fgraph, nodes_mem in fct_memory.iteritems():
size_sum=sum([sum(val) for key,val in nodes_mem.iteritems()])
if size_sum < min_memory_size:
nb_skipped += 1
continue
print "Theano fct:", [fct for fct in fct_call.keys() if fct.maker.fgraph is fgraph][0].name
print " Max without gc, inplace and view (KB)",size_sum/1024
node_memory_size = 0
node_memory_saved_by_view = 0
node_memory_saved_by_inplace = 0
running_memory_size = 0
running_max_memory_size = 0
post_thunk_old_storage = []
items = nodes_mem.items()
items.sort(key=lambda a: a[1])
items.reverse()
order = self.linker.schedule(fgraph)
computed, last_user = gof.link.gc_helper(order)
for node in order:
post_thunk_old_storage.append([ input_idx
for input_idx,input in enumerate(node.inputs)
if (input in computed) and (input not in fgraph.outputs) and node == last_user[input]])
for node,val in items[:n_apply_to_print]:
dmap = getattr(node.op,'destroy_map',None)
vmap = getattr(node.op,'view_map',None)
for idx,v in enumerate(val):
if dmap and idx in dmap:#TODO check the op returned a view
node_memory_saved_by_inplace += v
elif vmap and idx in vmap:#TODO check the op returned a view
node_memory_saved_by_view += v
else:
node_memory_size += v
running_memory_size += v
if running_memory_size > running_max_memory_size:
running_max_memory_size = running_memory_size
old_storage = post_thunk_old_storage[order.index(node)]
for old_s in old_storage:
running_memory_size -= var_mem[node.inputs[old_s]]
pass
pass
print " Max FAST_RUN_NO_GC (KB)", node_memory_size/1024
print " Max FAST_RUN (KB)", running_max_memory_size/1024
print " Memory saved by view (KB)", node_memory_saved_by_view/1024
print " Memory saved by inplace (KB)", node_memory_saved_by_inplace/1024
print " Memory saved by GC (KB)", (node_memory_size-running_max_memory_size)/1024
n_apply_to_print+=10#TODO remove this line
print " <Sum apply outputs (bytes)> <Apply outputs memory size(bytes)> <created/inplace/view> <Apply node>"
print " <created/inplace/view> is taked from the op declaration, not the op exeuction. Use DebugMode to have warning about inplace/view declaration being respected."
n_apply_printed = 0
for key,val in items[:n_apply_to_print]:
if sum(val) < min_memory_size:
break
code = ['c']*len(node.outputs)
for out,inp in getattr(key.op,'destroy_map',{}).iteritems():
code[out] = "i"
for out,inp in getattr(key.op,'view_map',{}).iteritems():
code[out] = "v"
print ' %9dB %s %s %s' % (sum(val), str(val), ' '.join(code), key)
n_apply_printed += 1
print ' ... (remaining %i Apply account for %.2f%%(%d bytes) of the total intermediate memory used)'\
%(max(0, len(nodes_mem)-n_apply_printed),
sum(sum(val) for key, val in items[n_apply_printed:])/float(size_sum),
sum(sum(val) for key, val in items[n_apply_printed:]))
if nb_skipped > 0:
print ' We skipped %d theano function(s). Each of them used less then %dB(theano flags ProfileMode.min_memory_size) of total intermediate memory size'%(nb_skipped, min_memory_size)
print """
The memory profile in ProfileMode is removed!
Use the new profiler. Use the Theano flags
profile=True,profile_memory=True to enable it."""
print
print """Here are tips to potentially make your code run faster
......
......@@ -16,13 +16,15 @@ __contact__ = "theano-dev <theano-dev@googlegroups.com>"
__docformat__ = "restructuredtext en"
import atexit
import copy
import os
import sys
import time
import numpy
import theano
from theano.configparser import AddConfigVar, BoolParam
from theano.configparser import AddConfigVar, BoolParam, IntParam
import_time = time.time()
config = theano.config
......@@ -32,7 +34,23 @@ _atexit_print_file = sys.stderr
AddConfigVar('profiling.time_thunks',
"""Time individual thunks when profiling""",
BoolParam(True))
BoolParam(True))
AddConfigVar('profiling.n_apply',
"Number of apply instances to print by default",
IntParam(20, lambda i: i > 0),
in_c_key=False)
AddConfigVar('profiling.n_ops',
"Number of ops to print by default",
IntParam(20, lambda i: i > 0),
in_c_key=False)
AddConfigVar('profiling.min_memory_size',
"""For the memory profile, do not print apply nodes if the size
of their outputs (in bytes) is lower then this threshold""",
IntParam(1024, lambda i: i >= 0),
in_c_key=False)
def _atexit_print_fn():
......@@ -41,7 +59,9 @@ def _atexit_print_fn():
printed = 0
for ps in _atexit_print_list:
if ps.fct_callcount or ps.compile_time > 0:
ps.summary(file=_atexit_print_file)
ps.summary(file=_atexit_print_file,
n_ops_to_print=config.profiling.n_ops,
n_apply_to_print=config.profiling.n_apply)
printed += 1
else:
print 'Skipping empty Profile'
......@@ -59,7 +79,7 @@ def _atexit_print_fn():
#merge dictonary
for attr in ["apply_time", "apply_callcount",
"apply_cimpl", "outputs_size"]:
"apply_cimpl", "variable_shape", "variable_strides"]:
cum_attr = getattr(cum, attr)
for key, val in getattr(ps, attr).iteritems():
assert key not in cum_attr
......@@ -73,7 +93,9 @@ def _atexit_print_fn():
else:
cum.optimizer_profile = None
cum.summary(file=_atexit_print_file)
cum.summary(file=_atexit_print_file,
n_ops_to_print=config.profiling.n_ops,
n_apply_to_print=config.profiling.n_apply)
atexit.register(_atexit_print_fn)
......@@ -125,8 +147,12 @@ class ProfileStats(object):
# pretty string to print in summary, to identify this output
#
outputs_size = None
# node -> size of allocated output
variable_shape = {}
# Variable -> shapes
#
variable_strides = {}
# Variable -> strides
#
optimizer_time = 0.0
......@@ -154,11 +180,24 @@ class ProfileStats(object):
**kwargs - misc initializers. These should (but need not) match the
names of the class vars declared in this class.
"""
if (hasattr(theano, 'sandbox') and
hasattr(theano.sandbox, 'cuda') and
theano.sandbox.cuda.cuda_enabled):
if os.environ.get('CUDA_LAUNCH_BLOCKING', '0') != '1':
raise Exception(
"You are running Theano profiler with CUDA enabled."
" Theano GPU ops execution are asynchron by default."
" So by default, the profile is useless."
" You must use set the environment variable"
" CUDA_LAUNCH_BLOCKING to 1 to tell the CUDA drvier to"
" synchonize the execution to get meaning full profile.")
self.apply_callcount = {}
self.output_size = {}
self.apply_time = {}
self.apply_cimpl = {}
self.outputs_size = {}
self.variable_shape = {}
self.variable_strides = {}
if flag_time_thunks is None:
self.flag_time_thunks = config.profiling.time_thunks
else:
......@@ -242,7 +281,7 @@ class ProfileStats(object):
return rval
def op_impl(self):
"""dict op -> total number of nodes"""
"""dict op -> 'C' or 'Py' depending how the op is implemented"""
# timing is stored by node, we compute timing by Op on demand
rval = {}
for node in self.apply_callcount:
......@@ -520,6 +559,20 @@ class ProfileStats(object):
print >> file, format_str %(f, ftot, t, t / nb_call, nb_call,
nd_id,
str(a)[:maxlen])
if not config.profile_memory:
continue
for idx, var in enumerate(a.inputs):
sh = self.variable_shape.get(var, 'no shape')
st = self.variable_strides.get(var, 'no strides')
dtype = getattr(var, 'dtype', 'no dtype')
print " input %d: dtype=%s, shape=%s, strides=%s " % (
idx, dtype, sh, st)
for idx, var in enumerate(a.outputs):
sh = self.variable_shape.get(var, 'no shape')
st = self.variable_strides.get(var, 'no strides')
dtype = getattr(var, 'dtype', 'no dtype')
print " output %d: dtype=%s, shape=%s, strides=%s " % (
idx, dtype, sh, st)
# Same as before, this I've sacrificied some information making
# the output more readable
#print >> file, ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %i %s'%(
......@@ -555,21 +608,212 @@ class ProfileStats(object):
# The validation time is a subset of optimizer_time
assert self.validate_time < self.optimizer_time
def summary_memory(self, file, N=None):
fct_memory = {} # fgraph->dict(node->[outputs size])
fct_shapes = {} # fgraph->dict(node->[outputs shapes]))
var_mem = {} # varible->size in bytes; don't include input variables
node_mem = {} # node->total outputs size (only dense outputs)
for node in self.apply_callcount.keys():
fct_memory.setdefault(node.fgraph, {})
fct_memory[node.fgraph].setdefault(node, [])
fct_shapes.setdefault(node.fgraph, {})
fct_shapes[node.fgraph].setdefault(node, [])
sum_dense = 0
for out in node.outputs:
sh = self.variable_shape[out]
if hasattr(out.type, 'get_size'):
v = out.type.get_size(sh)
sum_dense += v
else:
v = "Unknown"
var_mem[out] = v
fct_memory[node.fgraph][node].append(v)
fct_shapes[node.fgraph][node].append(sh)
node_mem[node] = sum_dense
#Find the function that used the most of that statistic
max_sum_size = 0
max_node_memory_size = 0
max_running_max_memory_size = 0
max_node_memory_saved_by_view = 0
max_node_memory_saved_by_inplace = 0
for fgraph, nodes_mem in fct_memory.iteritems():
# Sum of the size of all variables in bytes
sum_size = sum([sum([v for v in val if not isinstance(v, str)])
for key, val in nodes_mem.iteritems()])
# Sum of the size of all variables that actually allocate
# memory (excluding views, and inplace);
node_memory_size = 0
# The sum of memory saved by returning view instead of new
# allocation
node_memory_saved_by_view = 0
# The sum of memory saved by reusing the input instead of
# new allocation
node_memory_saved_by_inplace = 0
# The memory allocated after the current apply node
running_memory_size = 0
# The maximum of running_memory_size during the function
running_max_memory_size = 0
order = fgraph.toposort()
# A list of intermediate variable that are not need
# after the execution of the corresponding node.
# It mean that after executing the node,
# the corresponding variable can be gc.
post_thunk_old_storage = []
computed, last_user = theano.gof.link.gc_helper(order)
for node in order:
post_thunk_old_storage.append([
input_idx
for input_idx, input in enumerate(node.inputs)
if (input in computed) and
(input not in fgraph.outputs) and
node == last_user[input]])
for node in order:
val = nodes_mem[node]
dmap = getattr(node.op, 'destroy_map', None)
vmap = getattr(node.op, 'view_map', None)
for idx, v in enumerate(val):
# TODO check the op returned a view
if dmap and idx in dmap:
node_memory_saved_by_inplace += v
# TODO check the op returned a view
elif vmap and idx in vmap:
node_memory_saved_by_view += v
elif not isinstance(v, str):
node_memory_size += v
running_memory_size += v
if running_memory_size > running_max_memory_size:
running_max_memory_size = running_memory_size
old_storage = post_thunk_old_storage[order.index(node)]
for old_s in old_storage:
old_v = var_mem[node.inputs[old_s]]
if not isinstance(old_v, str):
running_memory_size -= old_v
# Store the max of some stats by any function in this profile.
max_sum_size = max(max_sum_size, sum_size)
max_node_memory_size = max(max_node_memory_size, node_memory_size)
max_running_max_memory_size = max(max_running_max_memory_size,
running_max_memory_size)
max_node_memory_saved_by_view = max(max_node_memory_saved_by_view,
node_memory_saved_by_view)
max_node_memory_saved_by_inplace = max(
max_node_memory_saved_by_inplace, node_memory_saved_by_inplace)
del fgraph, nodes_mem, post_thunk_old_storage, node
if len(fct_memory) > 1:
print >> file, ("Memory Profile "
"(the max between all function in that profile)")
else:
print >> file, "Memory Profile"
print >> file, "(Sparse variables are ignored)"
print >> file, "---"
# print >> file, " Max if no gc, inplace and view: %dKB" % int(
# round(max_sum_size / 1024))
print >> file, " Max if linker=cvm (default): unknow"
print >> file, " Max if no gc (allow_gc=False): %dKB" % int(round(
max_node_memory_size / 1024.))
print >> file, " Max if linker=c|py: %dKB" % int(round(
max_running_max_memory_size / 1024.))
# print >> file, " Memory saved if view are used: %dKB" % int(round(
# max_node_memory_saved_by_view / 1024.))
# print >> file, " Memory saved if inplace op are used: %dKB" % int(
# round(max_node_memory_saved_by_inplace / 1024.))
print >> file, " Memory saved if gc is enabled (linker=c|py): %dKB" % int(
round(max_node_memory_size - max_running_max_memory_size) / 1024.)
if (hasattr(theano, 'sandbox') and
hasattr(theano.sandbox, 'cuda') and
hasattr(theano.sandbox.cuda, 'cuda_ndarray') and
hasattr(theano.sandbox.cuda.cuda_ndarray.cuda_ndarray,
'theano_allocated')):
_, gpu_max = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray.theano_allocated()
print >> file, (" Max Memory allocated on the GPU"
"(for all functions): %dKB" %
int(round(gpu_max / 1024.)))
print >> file, ""
if len(fct_memory) > 1:
print >> file, (
" This list is based on all functions in the profile")
print >> file, (" <Sum apply outputs (bytes)>"
" <Apply outputs shape>"
" <created/inplace/view>"
" <Apply node>")
print >> file, ""
items = node_mem.items()
items.sort(key=lambda a: a[1])
items.reverse()
for idx, (node, node_outputs_size) in enumerate(items[:N]):
code = ['c'] * len(node.outputs)
for out, inp in getattr(node.op, 'destroy_map', {}).iteritems():
code[out] = "i"
for out, inp in getattr(node.op, 'view_map', {}).iteritems():
code[out] = "v"
shapes = str(fct_shapes[node.fgraph][node])
if all([hasattr(out.type, 'get_size')
for out in node.outputs]):
size = "%9dB" % node_outputs_size
if node_outputs_size < config.profiling.min_memory_size:
N = idx
break
else:
size = "%10s" % "Unknown"
print >> file, ' %s %s %s %s' % (size,
shapes,
' '.join(code), node)
sum_remaining = sum(size for _, size in items[N:])
size_sum_dense = sum(node_mem.values())
if size_sum_dense == 0:
p = "0%"
else:
p = "(%.2f%%)" % (float(sum_remaining) / size_sum_dense * 100)
print >> file, (
' ... (remaining %i Apply account for %4dB/%dB (%s) of the'
' Apply with dense outputs sizes)') % (max(0, len(node_mem) - N),
sum_remaining,
size_sum_dense, p
)
print >> file, ''
if N == 0:
print >> file, (' All Apply node have outputs size that take'
' less then %dB.' %
config.profiling.min_memory_size)
print >> file, (
" <created/inplace/view> is taked from the op declaration.")
print >> file, (" Apply nodes marked 'inplace' or 'view' may"
" actually allocate memory, this is not reported"
" here. If you use DebugMode, warnings will be"
" emitted in those cases.")
print >> file, ''
def summary(self, file=sys.stderr, n_ops_to_print=20,
n_applies_to_print=20):
n_apply_to_print=20):
self.summary_function(file)
local_time = sum(self.apply_time.values())
if local_time > 0:
self.summary_class(file, n_ops_to_print)
self.summary_ops(file, n_ops_to_print)
self.summary_nodes(file, n_applies_to_print)
self.summary_nodes(file, n_apply_to_print)
elif self.fct_callcount > 0:
print >> file, (" No node time accumulated "
print >> file, (" No execution time accumulated "
"(hint: try config profiling.time_thunks=1)")
if config.profile_memory:
self.summary_memory(file, n_apply_to_print)
if self.optimizer_profile:
print "Optimizer Profile"
print "-----------------"
self.optimizer_profile[0].print_profile(file, self.optimizer_profile[1])
self.optimizer_profile[0].print_profile(file,
self.optimizer_profile[1])
if 0: # old code still to be ported from ProfileMode
......@@ -701,80 +945,6 @@ if 0: # old code still to be ported from ProfileMode
if hasattr(i.type, 'dtype') and i.type.dtype == 'float64':
print fct.name, i.name, i.type, i
if outputs_size:
fct_memory={}#fgraph->dict(node->(outputs size))
var_mem = {}
for node,val in outputs_size.items():
fct_memory.setdefault(node.fgraph,{})
fct_memory[node.fgraph][node]=val
for out,v in zip(node.outputs,val):
var_mem[out]=v
print
print "Profile of Theano functions memory:"
for fgraph, nodes_mem in fct_memory.iteritems():
print "Theano fct:", [fct for fct in fct_call.keys() if fct.maker.fgraph is fgraph][0].name
size_sum=sum([sum(val) for key,val in nodes_mem.iteritems()])
print " Max without gc, inplace and view (KB)",size_sum/1024
node_memory_size = 0
node_memory_saved_by_view = 0
node_memory_saved_by_inplace = 0
running_memory_size = 0
running_max_memory_size = 0
post_thunk_old_storage = []
items = nodes_mem.items()
items.sort(key=lambda a: a[1])
items.reverse()
order = fgraph.toposort()
computed, last_user = gc_helper(order)
for node in order:
post_thunk_old_storage.append([ input_idx
for input_idx,input in enumerate(node.inputs)
if (input in computed) and (input not in fgraph.outputs) and node == last_user[input]])
for node,val in items[:n_apply_to_print]:
dmap = getattr(node.op,'destroy_map',None)
vmap = getattr(node.op,'view_map',None)
for idx,v in enumerate(val):
if dmap and idx in dmap:#TODO check the op returned a view
node_memory_saved_by_inplace += v
elif vmap and idx in vmap:#TODO check the op returned a view
node_memory_saved_by_view += v
else:
node_memory_size += v
running_memory_size += v
if running_memory_size > running_max_memory_size:
running_max_memory_size = running_memory_size
old_storage = post_thunk_old_storage[order.index(node)]
for old_s in old_storage:
running_memory_size -= var_mem[node.inputs[old_s]]
pass
pass
print " Max FAST_RUN_NO_GC (KB)", node_memory_size/1024
print " Max FAST_RUN (KB)", running_max_memory_size/1024
print " Memory saved by view (KB)", node_memory_saved_by_view/1024
print " Memory saved by inplace (KB)", node_memory_saved_by_inplace/1024
print " Memory saved by GC (KB)", (node_memory_size-running_max_memory_size)/1024
n_apply_to_print+=10#TODO remove this line
print " <Sum apply outputs (bytes)> <Apply outputs memory size(bytes)> <created/inplace/view> <Apply node>"
print " <created/inplace/view> is taked from the op declaration, not the op exeuction. Use DebugMode to have warning about inplace/view declaration being respected."
for key,val in items[:n_apply_to_print]:
code = ['c']*len(node.outputs)
for out,inp in getattr(key.op,'destroy_map',{}).iteritems():
code[out] = "i"
for out,inp in getattr(key.op,'view_map',{}).iteritems():
code[out] = "v"
print ' %9dB %s %s %s' % (sum(val), str(val), ' '.join(code), key)
print ' ... (remaining %i Apply account for %.2f%%(%.2fs) of the runtime)'\
%(max(0, len(nodes_mem)-n_ops_to_print),
sum(sum(val) for key, val in items[n_ops_to_print:]),
sum(sum(val) for key, val in items[n_ops_to_print:])/size_sum)
print
print "Here are tips to potentially make your code run faster (if you think of new ones, suggest them on the mailing list). Test them first as they are not guaranteed to always provide a speedup."
from theano import tensor as T
......
......@@ -343,6 +343,15 @@ class PureType(object):
"""
return self.values_eq(a, b)
# def get_shape_info(self, obj):
"""
Optional function. See TensorType().get_shape_info for definition
"""
# def get_size(self, shape_info):
"""
Optional function. See TensorType().get_size for definition
"""
_nothing = """
"""
......
......@@ -5,6 +5,7 @@ VM was a better name at some point.
"""
import link
import logging
import os
import sys
import time
import warnings
......@@ -23,6 +24,9 @@ AddConfigVar('profile',
AddConfigVar('profile_optimizer',
"If VM should collect optimizer profile information",
BoolParam(False))
AddConfigVar('profile_memory',
"If VM should collect memory profile information and print it",
BoolParam(False))
def filter_vm_lazy(val):
......@@ -130,6 +134,10 @@ class VM(object):
profile.apply_cimpl[node] = hasattr(thunk, 'cthunk')
if hasattr(self, 'variable_shape'):
profile.variable_shape = self.variable_shape.copy()
profile.variable_strides = self.variable_strides.copy()
# clear the timer info out of the buffers
for i in xrange(len(self.call_times)):
self.call_times[i] = 0.0
......@@ -244,8 +252,8 @@ class Stack(VM):
self.base_apply_stack = [o.owner for o in fgraph.outputs if o.owner]
self.outputs = fgraph.outputs
self.storage_map = storage_map
self.apply_time = {}
self.outputs_size = {}
self.variable_shape = {} # Variable -> shape
self.variable_strides = {} # Variable -> strides
self.compute_map = compute_map
self.node_idx = node_idx = {}
self.callback = callback
......@@ -254,8 +262,7 @@ class Stack(VM):
for i, node in enumerate(self.nodes):
node_idx[node] = i
self.apply_time[node] = 0
self.outputs_size[node] = []
# XXX: inconsistent style - why modify node here rather
# than track destroy_dependencies with dictionary like
# storage_map?
......@@ -280,10 +287,6 @@ class Stack(VM):
if self.allow_gc and self.dependencies is None:
raise ValueError("Must set dependencies when using GC")
if config.profile:
self.memory_size_map = {"nt8": 1, "t16": 2, "t32": 4,
"t64": 8, "128": 16}
atexit.register(self.atexit_print_all)
def run_thunk_of_node(self, node):
"""Run the thunk corresponding to Apply instance `node`
......@@ -319,7 +322,24 @@ class Stack(VM):
# apply_stack contains nodes
apply_stack = list(self.base_apply_stack)
last_apply_stack_len = -1
ls = []
#This record all function inputs/shared varibles and constants
for var, data in self.storage_map.iteritems():
if data[0] is None:
continue
if hasattr(var.type, 'get_shape_info'):
sh = var.type.get_shape_info(data[0])
else:
sh = 'input no shape'
self.variable_shape[var] = sh
st = getattr(data[0], 'strides', 'input no strides')
if getattr(data[0], 'flags', False) and data[0].flags.c_contiguous:
st = 'c'
elif (hasattr(data[0], 'is_c_contiguous') and
data[0].is_c_contiguous()):
st = "c"
self.variable_strides[var] = st
while apply_stack:
# Make sure something happened last time round. This is
# just a safety check to make sure the op is written
......@@ -356,25 +376,27 @@ class Stack(VM):
_, dt = self.run_thunk_of_node(current_apply)
del _
if config.profile:
self.apply_time[current_apply] += dt
current_idx = self.node_idx[current_apply]
self.call_counts[current_idx] += 1
self.call_times[current_idx] += dt
## Computing the memory footprint of the the op
# ?? What about inplace .. if the op is inplace
# you don't actually ask for more memory!
size = []
for (idx, o) in enumerate(
thunks[self.node_idx[
current_apply]].outputs):
if not hasattr(o[0], 'size'):
size.append(-1)
continue
s = o[0].size
dtype = str(o[0].dtype)
dtype2 = dtype[-3:]
# KeyError here: couldn't determine
# the dtype memory size
s *= self.memory_size_map[dtype2]
size.append(s)
self.outputs_size[current_apply] = size
var = self.nodes[current_idx].outputs[idx]
if hasattr(var.type, 'get_shape_info'):
sh = var.type.get_shape_info(o[0])
else:
sh = 'input no shape'
self.variable_shape[var] = sh
st = getattr(o[0], 'strides',
'input no strides')
if (getattr(o[0], 'flags', False) and
o[0].flags.c_contiguous):
st = 'c'
self.variable_strides[var] = st
except Exception:
raise_with_op(current_apply)
for o in current_apply.outputs:
......@@ -429,7 +451,9 @@ class Stack(VM):
try:
requires, dt = self.run_thunk_of_node(current_apply)
self.apply_time[current_apply] += dt
current_idx = self.node_idx[current_apply]
self.call_counts[current_idx] += 1
self.call_times[current_idx] += dt
except Exception:
raise_with_op(current_apply)
......@@ -444,20 +468,22 @@ class Stack(VM):
apply_stack.append(current_apply.inputs[r].owner)
else:
if config.profile:
size = []
for (idx, o) in enumerate(thunks[
self.node_idx[current_apply]].outputs):
if not hasattr(o[0], 'size'):
size.append(-1)
continue
s = o[0].size
dtype = str(o[0].dtype)
dtype2 = dtype[-2:]
# KeyError here: couldn't determine the
# dtype memory size
s *= self.memory_size_map[dtype2]
size.append(s)
self.outputs_size[current_apply] = size
var = self.nodes[
self.node_idx[current_apply]].outputs[idx]
if hasattr(var.type, 'get_shape_info'):
sh = var.type.get_shape_info(o[0])
else:
sh = 'input no shape'
self.variable_shape[var] = sh
st = getattr(o[0], 'strides', 'input no strides')
if (getattr(o[0], 'flags', False) and
o[0].flags.c_contiguous):
st = 'c'
self.variable_strides[var] = st
if self.allow_gc:
for i in current_apply.inputs:
if (dependencies[i] and i.owner and
......@@ -550,6 +576,19 @@ class VM_Linker(link.LocalLinker):
:returns: self if fgraph is the first FunctionGraph that has ever been
associated to self, else, a new VM_Linker associated to fgraph.
"""
if (config.profile and
hasattr(theano, 'sandbox') and
hasattr(theano.sandbox, 'cuda') and
theano.sandbox.cuda.cuda_enabled):
if os.environ.get('CUDA_LAUNCH_BLOCKING', '0') != '1':
raise Exception(
"You are running Theano profiler with CUDA enabled."
" Theano GPU ops execution are asynchron by default."
" So by default, the profile is useless."
" You must use set the environment variable"
" CUDA_LAUNCH_BLOCKING to 1 to tell the CUDA drvier to"
" synchonize the execution to get meaning full profile.")
if no_recycling is None:
no_recycling = []
if self.fgraph is not None and self.fgraph is not fgraph:
......@@ -606,7 +645,6 @@ class VM_Linker(link.LocalLinker):
# XXX if k has no clients... what is it doing in the computation?
if k.owner and k.clients:
ls = []
is_output = 0
for cl in k.clients:
if cl[0] is not 'output':
ls += cl[0].outputs
......@@ -623,9 +661,12 @@ class VM_Linker(link.LocalLinker):
pre_call_clear = [storage_map[v] for v in self.no_recycling]
if self.callback is not None:
if self.use_cloop:
logger.warn('CLoop does not support callback, using Stack VM.')
if self.callback is not None or (config.profile and config.profile_memory):
if self.use_cloop and self.callback is not None:
logger.warn('CVM does not support callback, using Stack VM.')
if self.use_cloop and config.profile_memory:
warnings.warn(
'CVM does not support memory profile, using Stack VM.')
deps = None
if self.allow_gc:
deps = self.compute_gc_dependencies(storage_map)
......
......@@ -68,12 +68,13 @@ def execute(execute=True, verbose=True, M=2000, N=2000, K=2000,
order=order))
c = theano.shared(numpy.ones((M, K), dtype=theano.config.floatX,
order=order))
f = theano.function([], updates=[(c, 0.4 * c + .8 * T.dot(a, b))],
mode=theano.compile.ProfileMode())
f = theano.function([], updates=[(c, 0.4 * c + .8 * T.dot(a, b))])
if any([x.op.__class__.__name__ == 'Gemm' for x in
f.maker.fgraph.toposort()]):
c_impl = f.profile.apply_cimpl.values()
c_impl = [hasattr(thunk, 'cthunk')
for node, thunk in zip(f.fn.nodes, f.fn.thunks)
if node.op.__class__.__name__ == "Gemm"]
assert len(c_impl) == 1
if c_impl[0]:
impl = 'CPU (with direct Theano binding to blas)'
......
......@@ -44,8 +44,11 @@ static PyObject *CudaNdarray_get_shape(CudaNdarray *self, void *closure);
*
*/
int _outstanding_mallocs[] = {0,0};
#if COMPUTE_GPU_MEM_USED
int _allocated_size = 0;
int _max_allocated_size = 0;
const int TABLE_SIZE = 10000;
struct table_struct{
void* ptr;
......@@ -82,8 +85,15 @@ void * device_malloc(size_t size, int verbose)
"Error allocating %li bytes of device memory (%s).", (long)size, cudaGetErrorString(err));
return NULL;
}
_outstanding_mallocs[0] += (rval != NULL);
#if COMPUTE_GPU_MEM_USED
if (rval != NULL){
// Can it happen that cudaMalloc return cudaSuccess, but return a NULL ptr?
// Could this be what happen if size is 0?
_outstanding_mallocs[0] += 1;
#if COMPUTE_GPU_MEM_USED
_allocated_size += size;
_max_allocated_size = std::max(_max_allocated_size, _allocated_size);
for(int i=0;i<TABLE_SIZE;i++){
if(NULL==_alloc_size_table[i].ptr){
_alloc_size_table[i].ptr=rval;
......@@ -91,8 +101,8 @@ void * device_malloc(size_t size, int verbose)
break;
}
}
_allocated_size += size;
#endif
#endif
}
//fprintf(stderr,
//"allocated %li bytes of device memory (%s). new total bytes allocated: %d. ptr: %p\n",
//(long)size, cudaGetErrorString(err),_allocated_size,rval);
......@@ -2507,6 +2517,7 @@ CudaNdarray_synchronize(PyObject* _unused, PyObject* dummy)
Py_INCREF(Py_None);
return Py_None;
}
#if COMPUTE_GPU_MEM_USED
/*
* Return the size in bytes that Theano currently have allocated on the gpu.
......@@ -2514,7 +2525,13 @@ CudaNdarray_synchronize(PyObject* _unused, PyObject* dummy)
PyObject *
GetTheanoAllocInfo(PyObject* _unused, PyObject* dummy)
{
    /* Return a 2-tuple (bytes currently allocated on the GPU by Theano,
     * peak bytes allocated).
     *
     * Py_BuildValue creates both ints and the tuple in one step and,
     * unlike the previous hand-rolled PyLong_FromLong/PyTuple_New
     * sequence, does not leak the intermediate PyLongs when an
     * allocation fails: on error it returns NULL with the exception
     * already set, which the caller propagates. */
    return Py_BuildValue("(ll)", (long)_allocated_size,
                         (long)_max_allocated_size);
}
#endif
......@@ -2529,6 +2546,11 @@ static PyGetSetDef CudaNdarray_getset[] = {
(setter)CudaNdarray_set_strides,
"data pointer strides (in elements)",
NULL},
{"strides",
(getter)CudaNdarray_get_strides,
(setter)CudaNdarray_set_strides,
"data pointer strides (in elements)",
NULL},
//gpudata is needed to allow calling pycuda fct with CudaNdarray input.
{"gpudata",
(getter)CudaNdarray_get_dev_data,
......
......@@ -417,6 +417,15 @@ class CudaNdarrayType(Type):
def c_compile_args(self):
return []
def get_shape_info(self, obj):
    """Memory-profiler hook: describe ``obj`` for later sizing.

    The returned shape tuple is what :meth:`get_size` consumes when the
    memory profile is printed, so only the cheap ``shape`` lookup is
    done here, during function execution.
    """
    return obj.shape
def get_size(self, shape_info):
    """Number of data bytes for an array whose shape is ``shape_info``.

    :param shape_info: the shape tuple produced by get_shape_info()
    :return: element count times the dtype's itemsize; an empty shape
        tuple denotes a 0-d array (scalar), i.e. one element.
    """
    itemsize = numpy.dtype(self.dtype).itemsize
    if not shape_info:  # () -> 0-d array: a single element
        return itemsize
    return numpy.prod(shape_info) * itemsize
theano.compile.ops.expandable_types += (CudaNdarrayType,)
# Register C code for ViewOp on CudaNdarrayType
......
......@@ -423,6 +423,12 @@ class Scalar(Type):
return (4,) # explicit T given in specialization of operator=
# lines. This makes it compile with open64
def get_shape_info(self, obj):
    """Memory-profiler hook: a scalar's "shape info" is its itemsize.

    The itemsize already IS the number of bytes, so get_size() can
    return it unchanged.
    """
    return obj.itemsize
def get_size(self, shape_info):
    """Number of bytes taken by the scalar described by ``shape_info``.

    get_shape_info() already returns the itemsize in bytes, so it is
    passed through as-is.
    """
    return shape_info
# Register C code for ViewOp on Scalars.
theano.compile.register_view_op_c_code(
Scalar,
......@@ -460,6 +466,9 @@ class _scalar_py_operators:
# variables and Tensor variables
ndim = 0
dtype = property(lambda self: self.type.dtype)
""" The dtype of this scalar. """
#UNARY
def __abs__(self):
return abs_(self)
......
......@@ -147,6 +147,17 @@ class SparseType(gof.Type):
def is_valid_value(self, a):
return scipy.sparse.issparse(a) and (a.format == self.format)
def get_shape_info(self, obj):
    """Memory-profiler hook: describe a sparse matrix for later sizing.

    Returns a 5-tuple (shape, #data elements, #indices, #indptr, nnz)
    that get_size() turns into a byte count when the memory profile is
    printed.
    """
    sp = self.filter(obj)
    # get_size() hard-codes int32 for the index arrays; fail loudly if
    # the matrix uses another index dtype so the size is not misreported.
    assert sp.indices.dtype == 'int32'
    assert sp.indptr.dtype == 'int32'
    return (sp.shape, sp.data.size, sp.indices.size, sp.indptr.size, sp.nnz)
def get_size(self, shape_info):
    """Number of bytes for the sparse matrix described by ``shape_info``.

    :param shape_info: the 5-tuple produced by get_shape_info()
    :return: bytes of the data array plus bytes of the two int32 index
        arrays (indices and indptr); the container itself is excluded.
    """
    data_bytes = shape_info[1] * numpy.dtype(self.dtype).itemsize
    index_bytes = (shape_info[2] + shape_info[3]) * numpy.dtype('int32').itemsize
    return data_bytes + index_bytes
# Register SparseType's C code for ViewOp.
theano.compile.register_view_op_c_code(
SparseType,
......
......@@ -1198,6 +1198,38 @@ class TensorType(Type):
"""
return numpy.zeros(shape, dtype=self.dtype)
def get_shape_info(self, obj):
    """Return the information needed to compute the memory size of obj.

    The memory size covers only the data, not the container: for an
    ndarray this means the data buffer, excluding the ndarray object
    itself and bookkeeping such as shape and strides.

    get_shape_info() and get_size() work in tandem for the memory
    profiler.  get_shape_info() runs during the execution of the
    function, so it should be fast; get_size() is called with this
    function's output only when the memory profile is printed.

    :param obj: the object that this Type represents during execution
    :return: a Python object that self.get_size() understands
    """
    return obj.shape
def get_size(self, shape_info):
    """Number of bytes taken by the object represented by shape_info.

    :param shape_info: the output of the call to get_shape_info()
    :return: the number of bytes taken by the object described in
        shape_info.
    """
    itemsize = numpy.dtype(self.dtype).itemsize
    if not shape_info:  # () -> 0-d array: a single element
        return itemsize
    return numpy.prod(shape_info) * itemsize
theano.compile.ops.expandable_types += (TensorType,)
# Register TensorType C code for ViewOp.
......@@ -5158,8 +5190,8 @@ def batched_dot(x, y):
iterating over the first dimension using scan.
Returns a tensor of size e.g. if it is 3D: (dim1, dim3, dim4)
Example:
>>> first = T.tensor3('first')
>>> second = T.tensor3('second')
>>> first = tensor.tensor3('first')
>>> second = tensor.tensor3('second')
>>> result = batched_dot(first, second)
:note: This is a subset of numpy.einsum, but we do not provide it for now.
But numpy einsum is slower than dot or tensordot:
......
......@@ -56,6 +56,25 @@ class RandomStateType(gof.Type):
return False
return True
def get_shape_info(self, obj):
    """Memory-profiler hook.

    A RandomState has constant size regardless of the object, so no
    per-object information is collected (see get_size()).
    """
    return None
def get_size(self, shape_info):
    """Number of bytes taken by a RandomState object.

    The size is constant (the Mersenne-Twister state), so shape_info is
    ignored; a freshly created RandomState is measured instead.  The
    state tuple is (str id, uint32 key array, int pos, int has_gauss,
    float cached_gaussian).
    """
    total = 0
    for part in numpy.random.RandomState().get_state():
        if isinstance(part, str):
            total += len(part)
        elif isinstance(part, numpy.ndarray):
            total += part.size * part.itemsize
        elif isinstance(part, int):
            total += numpy.dtype("int").itemsize
        elif isinstance(part, float):
            total += numpy.dtype("float").itemsize
        else:
            raise NotImplementedError()
    return total
# Register RandomStateType's C code for ViewOp.
theano.compile.register_view_op_c_code(
RandomStateType,
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论