Commit 91a31814 authored by lamblin

Merge pull request #1277 from nouiz/memory_prof

Memory profiler in the new profiler
......@@ -106,6 +106,38 @@ default values.
*Default:* ``id(self)``
.. method:: get_shape_info(obj)
Optional. Only needed to profile the memory of this Type of object.
Return the information needed to compute the memory size of obj.
The memory size covers only the data, so it excludes the container.
For an ndarray, this means the data buffer, but not the ndarray object
or other data structures such as shape and strides.
get_shape_info() and get_size() work in tandem for the memory profiler:
get_shape_info() is called during the execution of the function,
so it should not be too slow.
get_size() will be called with the output of this function
when printing the memory profile.
:param obj: The object that this Type represents during execution
:return: A Python object that self.get_size() understands
.. method:: get_size(shape_info)
Number of bytes taken by the object represented by shape_info.
Optional. Only needed to profile the memory of this Type of object.
:param shape_info: the output of the call to get_shape_info()
:return: the number of bytes taken by the object described in
shape_info.
"""
For each method, the *default* is what ``Type`` defines
for you. So, if you create an instance of ``Type`` or an
instance of a subclass of ``Type``, you
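The two optional methods above can be sketched on a minimal Type-like class. The class below (`DoubleVectorType`, a hypothetical name used only for illustration) mirrors the behavior documented for `TensorType`:

```python
import numpy

class DoubleVectorType(object):
    """Hypothetical Type-like class, only to illustrate the two
    optional memory-profiling hooks described above."""

    dtype = 'float64'

    def get_shape_info(self, obj):
        # Called during the execution of the function: keep it cheap,
        # return only the shape.
        return obj.shape

    def get_size(self, shape_info):
        # Called when printing the profile: bytes of the data only,
        # excluding the ndarray container itself.
        if shape_info:
            return numpy.prod(shape_info) * numpy.dtype(self.dtype).itemsize
        else:  # a 0-d array (scalar)
            return numpy.dtype(self.dtype).itemsize

t = DoubleVectorType()
a = numpy.ones((3, 5))
print(t.get_size(t.get_shape_info(a)))  # 3 * 5 * 8 = 120 bytes
```

Note how the expensive part (multiplying out the shape) is deferred to `get_size()`, which only runs when the profile is printed.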
......
......@@ -273,6 +273,15 @@ import theano and print the config variable, as in:
Do the vm/cvm linkers profile the execution of Theano functions?
.. attribute:: profile_memory
Bool value: either True or False
Default False
Do the vm/cvm linkers profile the memory usage of Theano functions?
It only works when profile=True.
.. attribute:: profile_optimizer
Bool value: either True or False
......@@ -280,6 +289,26 @@ import theano and print the config variable, as in:
Default False
Do the vm/cvm linkers profile the optimization phase when compiling a Theano function?
It only works when profile=True.
.. attribute:: profiling.n_apply
Positive int value, default: 20.
The number of apply nodes to print in the profiler output.
.. attribute:: profiling.n_ops
Positive int value, default: 20.
The number of ops to print in the profiler output.
.. attribute:: profiling.min_memory_size
Positive int value, default: 1024.
For the memory profile, do not print apply nodes if the size
of their outputs (in bytes) is lower than this.
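The profiling flags above can be combined on the command line through ``THEANO_FLAGS``; a minimal invocation sketch (``my_script.py`` is a placeholder name):

```shell
# Enable the time profiler and the memory profiler, and report
# the top 30 apply nodes instead of the default 20.
THEANO_FLAGS='profile=True,profile_memory=True,profiling.n_apply=30' \
    python my_script.py
```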
.. attribute:: config.lib.amdlibm
......
......@@ -60,7 +60,7 @@ from theano.compile import \
FunctionMaker, function, OpFromGraph, \
Component, External, Member, Method, \
Composite, ComponentList, ComponentDict, Module, \
ProfileMode, \
ProfileMode, ProfileStats, \
Param, shared
from theano.misc.safe_asarray import _asarray
......
......@@ -16,10 +16,12 @@ from theano.compile.debugmode import DebugMode
from theano.compile.monitormode import MonitorMode
from theano.compile.profiling import ProfileStats, ScanProfileStats
from theano.compile.profilemode import ProfileMode
from theano.compile.sharedvalue import shared, shared_constructor, SharedVariable
from theano.compile.sharedvalue import (shared, shared_constructor,
SharedVariable)
from theano.compile.pfunc import pfunc, Param, rebuild_collect_shared
from theano.compile.function import function
......@@ -343,6 +343,15 @@ class PureType(object):
"""
return self.values_eq(a, b)
# def get_shape_info(self, obj):
"""
Optional function. See TensorType().get_shape_info for definition
"""
# def get_size(self, shape_info):
"""
Optional function. See TensorType().get_size for definition
"""
_nothing = """
"""
......
......@@ -68,12 +68,13 @@ def execute(execute=True, verbose=True, M=2000, N=2000, K=2000,
order=order))
c = theano.shared(numpy.ones((M, K), dtype=theano.config.floatX,
order=order))
f = theano.function([], updates=[(c, 0.4 * c + .8 * T.dot(a, b))],
mode=theano.compile.ProfileMode())
f = theano.function([], updates=[(c, 0.4 * c + .8 * T.dot(a, b))])
if any([x.op.__class__.__name__ == 'Gemm' for x in
f.maker.fgraph.toposort()]):
c_impl = f.profile.apply_cimpl.values()
c_impl = [hasattr(thunk, 'cthunk')
for node, thunk in zip(f.fn.nodes, f.fn.thunks)
if node.op.__class__.__name__ == "Gemm"]
assert len(c_impl) == 1
if c_impl[0]:
impl = 'CPU (with direct Theano binding to blas)'
......
......@@ -44,8 +44,11 @@ static PyObject *CudaNdarray_get_shape(CudaNdarray *self, void *closure);
*
*/
int _outstanding_mallocs[] = {0,0};
#if COMPUTE_GPU_MEM_USED
int _allocated_size = 0;
int _max_allocated_size = 0;
const int TABLE_SIZE = 10000;
struct table_struct{
void* ptr;
......@@ -82,8 +85,15 @@ void * device_malloc(size_t size, int verbose)
"Error allocating %li bytes of device memory (%s).", (long)size, cudaGetErrorString(err));
return NULL;
}
_outstanding_mallocs[0] += (rval != NULL);
#if COMPUTE_GPU_MEM_USED
if (rval != NULL){
// Can it happen that cudaMalloc returns cudaSuccess but a NULL ptr?
// Could this happen when size is 0?
_outstanding_mallocs[0] += 1;
#if COMPUTE_GPU_MEM_USED
_allocated_size += size;
_max_allocated_size = std::max(_max_allocated_size, _allocated_size);
for(int i=0;i<TABLE_SIZE;i++){
if(NULL==_alloc_size_table[i].ptr){
_alloc_size_table[i].ptr=rval;
......@@ -91,8 +101,8 @@ void * device_malloc(size_t size, int verbose)
break;
}
}
_allocated_size += size;
#endif
#endif
}
//fprintf(stderr,
//"allocated %li bytes of device memory (%s). new total bytes allocated: %d. ptr: %p\n",
//(long)size, cudaGetErrorString(err),_allocated_size,rval);
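The bookkeeping added to `device_malloc` above (a running `_allocated_size` plus a `_max_allocated_size` high-water mark, with a table mapping pointers to sizes) can be sketched in Python; names and structure are illustrative, not the actual C implementation:

```python
class AllocTracker(object):
    """Python sketch of the C bookkeeping above: track the current and
    peak number of bytes handed out."""

    def __init__(self):
        self.allocated_size = 0       # like _allocated_size
        self.max_allocated_size = 0   # like _max_allocated_size
        self.size_table = {}          # like _alloc_size_table, keyed by ptr

    def on_malloc(self, ptr, size):
        # Record the size so it can be subtracted at free time,
        # and update the high-water mark.
        self.size_table[ptr] = size
        self.allocated_size += size
        self.max_allocated_size = max(self.max_allocated_size,
                                      self.allocated_size)

    def on_free(self, ptr):
        self.allocated_size -= self.size_table.pop(ptr)

tracker = AllocTracker()
tracker.on_malloc(0x1000, 512)
tracker.on_malloc(0x2000, 1024)
tracker.on_free(0x1000)
print(tracker.allocated_size, tracker.max_allocated_size)  # 1024 1536
```

The peak stays at 1536 even after the first block is freed, which is exactly what the new `_max_allocated_size` counter reports.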
......@@ -2507,6 +2517,7 @@ CudaNdarray_synchronize(PyObject* _unused, PyObject* dummy)
Py_INCREF(Py_None);
return Py_None;
}
#if COMPUTE_GPU_MEM_USED
/*
* Return the number of bytes that Theano currently has allocated on the
* GPU, and the maximum number of bytes ever allocated.
......@@ -2514,7 +2525,13 @@ CudaNdarray_synchronize(PyObject* _unused, PyObject* dummy)
PyObject *
GetTheanoAllocInfo(PyObject* _unused, PyObject* dummy)
{
return PyLong_FromLong(_allocated_size);
PyObject* a = PyLong_FromLong(_allocated_size);
PyObject* b = PyLong_FromLong(_max_allocated_size);
PyObject* tuple = PyTuple_New(2);
PyTuple_SetItem(tuple, 0, a);
PyTuple_SetItem(tuple, 1, b);
return tuple;
}
#endif
......@@ -2529,6 +2546,11 @@ static PyGetSetDef CudaNdarray_getset[] = {
(setter)CudaNdarray_set_strides,
"data pointer strides (in elements)",
NULL},
{"strides",
(getter)CudaNdarray_get_strides,
(setter)CudaNdarray_set_strides,
"data pointer strides (in elements)",
NULL},
//gpudata is needed to allow calling pycuda fct with CudaNdarray input.
{"gpudata",
(getter)CudaNdarray_get_dev_data,
......
......@@ -417,6 +417,15 @@ class CudaNdarrayType(Type):
def c_compile_args(self):
return []
def get_shape_info(self, obj):
return obj.shape
def get_size(self, shape_info):
if shape_info:
return numpy.prod(shape_info) * numpy.dtype(self.dtype).itemsize
else: # a scalar
return numpy.dtype(self.dtype).itemsize
theano.compile.ops.expandable_types += (CudaNdarrayType,)
# Register C code for ViewOp on CudaNdarrayType
......
......@@ -423,6 +423,12 @@ class Scalar(Type):
return (4,) # explicit T given in specialization of operator=
# lines. This makes it compile with open64
def get_shape_info(self, obj):
return obj.itemsize
def get_size(self, shape_info):
return shape_info
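For `Scalar`, `get_shape_info()` returns the object's `itemsize` directly, so `get_size()` is the identity. A quick check with numpy scalars (which expose `itemsize` like ndarrays):

```python
import numpy

# For a Scalar type, get_shape_info(obj) returns obj.itemsize and
# get_size(shape_info) returns it unchanged.
x = numpy.float64(3.0)
shape_info = x.itemsize      # what get_shape_info would return
print(shape_info)            # 8 bytes for a float64 scalar
```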
# Register C code for ViewOp on Scalars.
theano.compile.register_view_op_c_code(
Scalar,
......@@ -460,6 +466,9 @@ class _scalar_py_operators:
# variables and Tensor variables
ndim = 0
dtype = property(lambda self: self.type.dtype)
""" The dtype of this scalar. """
#UNARY
def __abs__(self):
return abs_(self)
......
......@@ -147,6 +147,17 @@ class SparseType(gof.Type):
def is_valid_value(self, a):
return scipy.sparse.issparse(a) and (a.format == self.format)
def get_shape_info(self, obj):
obj = self.filter(obj)
assert obj.indices.dtype == 'int32'
assert obj.indptr.dtype == 'int32'
return (obj.shape, obj.data.size,
obj.indices.size, obj.indptr.size, obj.nnz)
def get_size(self, shape_info):
return (shape_info[1] * numpy.dtype(self.dtype).itemsize +
(shape_info[2] + shape_info[3]) * numpy.dtype('int32').itemsize)
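For a sparse matrix the tuple returned above is `(shape, data.size, indices.size, indptr.size, nnz)`, and `get_size()` charges `itemsize` bytes per stored value plus 4 bytes per int32 index (the asserts above require int32 indices). A small sketch without scipy, with CSR arrays built by hand for a 2x3 matrix:

```python
import numpy

# CSR representation of [[1., 0., 2.],
#                        [0., 0., 3.]]  (3 stored values)
data = numpy.array([1., 2., 3.])             # the stored values
indices = numpy.array([0, 2, 2], 'int32')    # column index of each value
indptr = numpy.array([0, 2, 3], 'int32')     # row start offsets

# The 5-tuple that get_shape_info() returns:
shape_info = ((2, 3), data.size, indices.size, indptr.size, data.size)

# get_size(): values at float64 itemsize, indices/indptr at int32 itemsize.
size = (shape_info[1] * numpy.dtype('float64').itemsize +
        (shape_info[2] + shape_info[3]) * numpy.dtype('int32').itemsize)
print(size)  # 3*8 + (3+3)*4 = 48 bytes
```

Note that `shape` and `nnz` are carried in the tuple but do not enter the byte count; only the three index/data buffers occupy memory.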
# Register SparseType's C code for ViewOp.
theano.compile.register_view_op_c_code(
SparseType,
......
......@@ -1198,6 +1198,38 @@ class TensorType(Type):
"""
return numpy.zeros(shape, dtype=self.dtype)
def get_shape_info(self, obj):
"""Return the information needed to compute the memory size of obj.
The memory size covers only the data, so it excludes the container.
For an ndarray, this means the data buffer, but not the ndarray object
or other data structures such as shape and strides.
get_shape_info() and get_size() work in tandem for the memory profiler:
get_shape_info() is called during the execution of the function,
so it should not be too slow.
get_size() will be called with the output of this function
when printing the memory profile.
:param obj: The object that this Type represents during execution
:return: A Python object that self.get_size() understands
"""
return obj.shape
def get_size(self, shape_info):
""" Number of bytes taken by the object represented by shape_info
:param shape_info: the output of the call to get_shape_info()
:return: the number of bytes taken by the object described in
shape_info.
"""
if shape_info:
return numpy.prod(shape_info) * numpy.dtype(self.dtype).itemsize
else: # a scalar
return numpy.dtype(self.dtype).itemsize
theano.compile.ops.expandable_types += (TensorType,)
# Register TensorType C code for ViewOp.
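For ndarrays, the two methods above reduce to `prod(shape) * itemsize`, which matches numpy's own `nbytes` attribute. A standalone cross-check (the free function below is a sketch of the method's arithmetic, not Theano code):

```python
import numpy

def get_size(shape_info, dtype):
    # Same arithmetic as TensorType.get_size() above.
    if shape_info:
        return numpy.prod(shape_info) * numpy.dtype(dtype).itemsize
    else:  # a 0-d array
        return numpy.dtype(dtype).itemsize

a = numpy.zeros((4, 7), dtype='float32')
print(get_size(a.shape, a.dtype))  # 112, identical to a.nbytes
```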
......@@ -5158,8 +5190,8 @@ def batched_dot(x, y):
iterating over the first dimension using scan.
Returns a tensor of corresponding size, e.g. for 3D inputs: (dim1, dim3, dim4)
Example:
>>> first = T.tensor3('first')
>>> second = T.tensor3('second')
>>> first = tensor.tensor3('first')
>>> second = tensor.tensor3('second')
>>> result = batched_dot(first, second)
:note: This is a subset of numpy.einsum, but we do not provide einsum for now.
But numpy einsum is slower than dot or tensordot:
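A pure-numpy equivalent of the computation (illustrative only, not how `batched_dot` is implemented) shows both the scan-style loop over the first dimension and the einsum special case mentioned in the note:

```python
import numpy

first = numpy.random.rand(5, 2, 3)
second = numpy.random.rand(5, 3, 4)

# Loop over the first dimension, as scan does:
looped = numpy.array([numpy.dot(f, s) for f, s in zip(first, second)])

# The same thing as an einsum special case:
via_einsum = numpy.einsum('ijk,ikl->ijl', first, second)

print(looped.shape)  # (5, 2, 4)
assert numpy.allclose(looped, via_einsum)
```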
......
......@@ -56,6 +56,25 @@ class RandomStateType(gof.Type):
return False
return True
def get_shape_info(self, obj):
return None
def get_size(self, shape_info):
# The size is only the data, which has a constant size.
state = numpy.random.RandomState().get_state()
size = 0
for elem in state:
if isinstance(elem, str):
size += len(elem)
elif isinstance(elem, numpy.ndarray):
size += elem.size * elem.itemsize
elif isinstance(elem, int):
size += numpy.dtype("int").itemsize
elif isinstance(elem, float):
size += numpy.dtype("float").itemsize
else:
raise NotImplementedError()
return size
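Running the loop above on a real state tuple (the 'MT19937' string, the 624-word key array of uint32, two ints and a float) gives a fixed size; a standalone replication (the exact total depends on the platform's int/float itemsizes, so only a lower bound is asserted):

```python
import numpy

state = numpy.random.RandomState().get_state()
size = 0
for elem in state:
    if isinstance(elem, str):
        size += len(elem)                  # 'MT19937' -> 7 bytes
    elif isinstance(elem, numpy.ndarray):
        size += elem.size * elem.itemsize  # 624 uint32 words -> 2496 bytes
    elif isinstance(elem, int):
        size += numpy.dtype("int").itemsize
    elif isinstance(elem, float):
        size += numpy.dtype("float").itemsize
print(size)
```

The key array dominates: the state is always a little over 2.5 KB per RandomState, regardless of how much the generator has been used.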
# Register RandomStateType's C code for ViewOp.
theano.compile.register_view_op_c_code(
RandomStateType,
......