提交 91a31814 authored 作者: lamblin's avatar lamblin

Merge pull request #1277 from nouiz/memory_prof

Memory profiler in the new profiler
...@@ -106,6 +106,38 @@ default values. ...@@ -106,6 +106,38 @@ default values.
*Default:* ``id(self)`` *Default:* ``id(self)``
.. method:: get_shape_info(obj)
Optional. Only needed to profile the memory of this Type of object
Return the information needed to compute the memory size of obj.
The memory size is only the data, so this excludes the container.
For an ndarray, this is the data, but not the ndarray object and
other data structures such as shape and strides.
get_shape_info() and get_size() work in tandem for the memory profiler.
get_shape_info() is called during the execution of the function.
So it is better that it is not too slow.
get_size() will be called with the output of this function
when printing the memory profile.
:param obj: The object that this Type represents during execution
:return: a Python object that self.get_size() understands
.. method:: get_size(shape_info)
Number of bytes taken by the object represented by shape_info
Optional. Only needed to profile the memory of this Type of object
:param shape_info: the output of the call to get_shape_info()
:return: the number of bytes taken by the object described in
shape_info.
"""
For each method, the *default* is what ``Type`` defines For each method, the *default* is what ``Type`` defines
for you. So, if you create an instance of ``Type`` or an for you. So, if you create an instance of ``Type`` or an
instance of a subclass of ``Type``, you instance of a subclass of ``Type``, you
......
...@@ -273,6 +273,15 @@ import theano and print the config variable, as in: ...@@ -273,6 +273,15 @@ import theano and print the config variable, as in:
Do the vm/cvm linkers profile the execution of Theano functions? Do the vm/cvm linkers profile the execution of Theano functions?
.. attribute:: profile_memory
Bool value: either True or False
Default False
When True, the vm/cvm linkers also profile and print the memory usage of Theano functions.
It only works when profile=True.
.. attribute:: profile_optimizer .. attribute:: profile_optimizer
Bool value: either True or False Bool value: either True or False
...@@ -280,6 +289,26 @@ import theano and print the config variable, as in: ...@@ -280,6 +289,26 @@ import theano and print the config variable, as in:
Default False Default False
Do the vm/cvm linkers profile the optimization phase when compiling a Theano function? Do the vm/cvm linkers profile the optimization phase when compiling a Theano function?
It only works when profile=True.
.. attribute:: profiling.n_apply
Positive int value, default: 20.
The number of apply nodes to print in the profiler output
.. attribute:: profiling.n_ops
Positive int value, default: 20.
The number of ops to print in the profiler output
.. attribute:: profiling.min_memory_size
Positive int value, default: 1024.
For the memory profile, do not print apply nodes if the size
of their outputs (in bytes) is lower than this.
.. attribute:: config.lib.amdlibm .. attribute:: config.lib.amdlibm
......
...@@ -60,7 +60,7 @@ from theano.compile import \ ...@@ -60,7 +60,7 @@ from theano.compile import \
FunctionMaker, function, OpFromGraph, \ FunctionMaker, function, OpFromGraph, \
Component, External, Member, Method, \ Component, External, Member, Method, \
Composite, ComponentList, ComponentDict, Module, \ Composite, ComponentList, ComponentDict, Module, \
ProfileMode, \ ProfileMode, ProfileStats, \
Param, shared Param, shared
from theano.misc.safe_asarray import _asarray from theano.misc.safe_asarray import _asarray
......
...@@ -16,10 +16,12 @@ from theano.compile.debugmode import DebugMode ...@@ -16,10 +16,12 @@ from theano.compile.debugmode import DebugMode
from theano.compile.monitormode import MonitorMode from theano.compile.monitormode import MonitorMode
from theano.compile.profiling import ProfileStats, ScanProfileStats
from theano.compile.profilemode import ProfileMode from theano.compile.profilemode import ProfileMode
from theano.compile.sharedvalue import shared, shared_constructor, SharedVariable from theano.compile.sharedvalue import (shared, shared_constructor,
SharedVariable)
from theano.compile.pfunc import pfunc, Param, rebuild_collect_shared from theano.compile.pfunc import pfunc, Param, rebuild_collect_shared
from theano.compile.function import function from theano.compile.function import function
...@@ -343,6 +343,15 @@ class PureType(object): ...@@ -343,6 +343,15 @@ class PureType(object):
""" """
return self.values_eq(a, b) return self.values_eq(a, b)
# def get_shape_info(self, obj):
"""
Optional function. See TensorType().get_shape_info for definition
"""
# def get_size(self, shape_info):
"""
Optional function. See TensorType().get_size for definition
"""
_nothing = """ _nothing = """
""" """
......
差异被折叠。
...@@ -68,12 +68,13 @@ def execute(execute=True, verbose=True, M=2000, N=2000, K=2000, ...@@ -68,12 +68,13 @@ def execute(execute=True, verbose=True, M=2000, N=2000, K=2000,
order=order)) order=order))
c = theano.shared(numpy.ones((M, K), dtype=theano.config.floatX, c = theano.shared(numpy.ones((M, K), dtype=theano.config.floatX,
order=order)) order=order))
f = theano.function([], updates=[(c, 0.4 * c + .8 * T.dot(a, b))], f = theano.function([], updates=[(c, 0.4 * c + .8 * T.dot(a, b))])
mode=theano.compile.ProfileMode())
if any([x.op.__class__.__name__ == 'Gemm' for x in if any([x.op.__class__.__name__ == 'Gemm' for x in
f.maker.fgraph.toposort()]): f.maker.fgraph.toposort()]):
c_impl = f.profile.apply_cimpl.values() c_impl = [hasattr(thunk, 'cthunk')
for node, thunk in zip(f.fn.nodes, f.fn.thunks)
if node.op.__class__.__name__ == "Gemm"]
assert len(c_impl) == 1 assert len(c_impl) == 1
if c_impl[0]: if c_impl[0]:
impl = 'CPU (with direct Theano binding to blas)' impl = 'CPU (with direct Theano binding to blas)'
......
...@@ -44,8 +44,11 @@ static PyObject *CudaNdarray_get_shape(CudaNdarray *self, void *closure); ...@@ -44,8 +44,11 @@ static PyObject *CudaNdarray_get_shape(CudaNdarray *self, void *closure);
* *
*/ */
int _outstanding_mallocs[] = {0,0}; int _outstanding_mallocs[] = {0,0};
#if COMPUTE_GPU_MEM_USED #if COMPUTE_GPU_MEM_USED
int _allocated_size = 0; int _allocated_size = 0;
int _max_allocated_size = 0;
const int TABLE_SIZE = 10000; const int TABLE_SIZE = 10000;
struct table_struct{ struct table_struct{
void* ptr; void* ptr;
...@@ -82,8 +85,15 @@ void * device_malloc(size_t size, int verbose) ...@@ -82,8 +85,15 @@ void * device_malloc(size_t size, int verbose)
"Error allocating %li bytes of device memory (%s).", (long)size, cudaGetErrorString(err)); "Error allocating %li bytes of device memory (%s).", (long)size, cudaGetErrorString(err));
return NULL; return NULL;
} }
_outstanding_mallocs[0] += (rval != NULL); if (rval != NULL){
#if COMPUTE_GPU_MEM_USED // Can it happen that cudaMalloc return cudaSuccess, but return a NULL ptr?
// Could this be what happen if size is 0?
_outstanding_mallocs[0] += 1;
#if COMPUTE_GPU_MEM_USED
_allocated_size += size;
_max_allocated_size = std::max(_max_allocated_size, _allocated_size);
for(int i=0;i<TABLE_SIZE;i++){ for(int i=0;i<TABLE_SIZE;i++){
if(NULL==_alloc_size_table[i].ptr){ if(NULL==_alloc_size_table[i].ptr){
_alloc_size_table[i].ptr=rval; _alloc_size_table[i].ptr=rval;
...@@ -91,8 +101,8 @@ void * device_malloc(size_t size, int verbose) ...@@ -91,8 +101,8 @@ void * device_malloc(size_t size, int verbose)
break; break;
} }
} }
_allocated_size += size; #endif
#endif }
//fprintf(stderr, //fprintf(stderr,
//"allocated %li bytes of device memory (%s). new total bytes allocated: %d. ptr: %p\n", //"allocated %li bytes of device memory (%s). new total bytes allocated: %d. ptr: %p\n",
//(long)size, cudaGetErrorString(err),_allocated_size,rval); //(long)size, cudaGetErrorString(err),_allocated_size,rval);
...@@ -2507,6 +2517,7 @@ CudaNdarray_synchronize(PyObject* _unused, PyObject* dummy) ...@@ -2507,6 +2517,7 @@ CudaNdarray_synchronize(PyObject* _unused, PyObject* dummy)
Py_INCREF(Py_None); Py_INCREF(Py_None);
return Py_None; return Py_None;
} }
#if COMPUTE_GPU_MEM_USED #if COMPUTE_GPU_MEM_USED
/* /*
* Return the size in bytes that Theano currently have allocated on the gpu. * Return the size in bytes that Theano currently have allocated on the gpu.
...@@ -2514,7 +2525,13 @@ CudaNdarray_synchronize(PyObject* _unused, PyObject* dummy) ...@@ -2514,7 +2525,13 @@ CudaNdarray_synchronize(PyObject* _unused, PyObject* dummy)
PyObject * PyObject *
GetTheanoAllocInfo(PyObject* _unused, PyObject* dummy) GetTheanoAllocInfo(PyObject* _unused, PyObject* dummy)
{ {
return PyLong_FromLong(_allocated_size); PyObject* a = PyLong_FromLong(_allocated_size);
PyObject* b = PyLong_FromLong(_max_allocated_size);
PyObject* tuple = PyTuple_New(2);
PyTuple_SetItem(tuple, 0, a);
PyTuple_SetItem(tuple, 1, b);
return tuple;
} }
#endif #endif
...@@ -2529,6 +2546,11 @@ static PyGetSetDef CudaNdarray_getset[] = { ...@@ -2529,6 +2546,11 @@ static PyGetSetDef CudaNdarray_getset[] = {
(setter)CudaNdarray_set_strides, (setter)CudaNdarray_set_strides,
"data pointer strides (in elements)", "data pointer strides (in elements)",
NULL}, NULL},
{"strides",
(getter)CudaNdarray_get_strides,
(setter)CudaNdarray_set_strides,
"data pointer strides (in elements)",
NULL},
//gpudata is needed to allow calling pycuda fct with CudaNdarray input. //gpudata is needed to allow calling pycuda fct with CudaNdarray input.
{"gpudata", {"gpudata",
(getter)CudaNdarray_get_dev_data, (getter)CudaNdarray_get_dev_data,
......
...@@ -417,6 +417,15 @@ class CudaNdarrayType(Type): ...@@ -417,6 +417,15 @@ class CudaNdarrayType(Type):
def c_compile_args(self): def c_compile_args(self):
return [] return []
def get_shape_info(self, obj):
    """Return ``obj.shape``, the only info get_size() needs for profiling."""
    shape = obj.shape
    return shape
def get_size(self, shape_info):
    """Bytes of data for an array of shape ``shape_info``.

    An empty (falsy) shape denotes a scalar, which takes one itemsize.
    """
    itemsize = numpy.dtype(self.dtype).itemsize
    if not shape_info:
        # a scalar
        return itemsize
    return numpy.prod(shape_info) * itemsize
theano.compile.ops.expandable_types += (CudaNdarrayType,) theano.compile.ops.expandable_types += (CudaNdarrayType,)
# Register C code for ViewOp on CudaNdarrayType # Register C code for ViewOp on CudaNdarrayType
......
...@@ -423,6 +423,12 @@ class Scalar(Type): ...@@ -423,6 +423,12 @@ class Scalar(Type):
return (4,) # explicit T given in specialization of operator= return (4,) # explicit T given in specialization of operator=
# lines. This makes it compile with open64 # lines. This makes it compile with open64
def get_shape_info(self, obj):
    """A scalar has no shape; its itemsize is all get_size() needs."""
    return obj.itemsize
def get_size(self, shape_info):
    """The itemsize recorded by get_shape_info() is already the byte count."""
    return shape_info
# Register C code for ViewOp on Scalars. # Register C code for ViewOp on Scalars.
theano.compile.register_view_op_c_code( theano.compile.register_view_op_c_code(
Scalar, Scalar,
...@@ -460,6 +466,9 @@ class _scalar_py_operators: ...@@ -460,6 +466,9 @@ class _scalar_py_operators:
# variables and Tensor variables # variables and Tensor variables
ndim = 0 ndim = 0
dtype = property(lambda self: self.type.dtype)
""" The dtype of this scalar. """
#UNARY #UNARY
def __abs__(self): def __abs__(self):
return abs_(self) return abs_(self)
......
...@@ -147,6 +147,17 @@ class SparseType(gof.Type): ...@@ -147,6 +147,17 @@ class SparseType(gof.Type):
def is_valid_value(self, a): def is_valid_value(self, a):
return scipy.sparse.issparse(a) and (a.format == self.format) return scipy.sparse.issparse(a) and (a.format == self.format)
def get_shape_info(self, obj):
    """Summarize a sparse matrix for the memory profiler.

    Returns ``(shape, data.size, indices.size, indptr.size, nnz)`` after
    filtering ``obj`` through this Type.
    """
    matrix = self.filter(obj)
    # get_size() hard-codes int32 for the index arrays, so enforce it here.
    assert matrix.indices.dtype == 'int32'
    assert matrix.indptr.dtype == 'int32'
    return (matrix.shape,
            matrix.data.size,
            matrix.indices.size,
            matrix.indptr.size,
            matrix.nnz)
def get_size(self, shape_info):
    """Bytes for the data buffer (self.dtype) plus int32 indices and indptr."""
    data_bytes = shape_info[1] * numpy.dtype(self.dtype).itemsize
    index_bytes = (shape_info[2] + shape_info[3]) * numpy.dtype('int32').itemsize
    return data_bytes + index_bytes
# Register SparseType's C code for ViewOp. # Register SparseType's C code for ViewOp.
theano.compile.register_view_op_c_code( theano.compile.register_view_op_c_code(
SparseType, SparseType,
......
...@@ -1198,6 +1198,38 @@ class TensorType(Type): ...@@ -1198,6 +1198,38 @@ class TensorType(Type):
""" """
return numpy.zeros(shape, dtype=self.dtype) return numpy.zeros(shape, dtype=self.dtype)
def get_shape_info(self, obj):
    """Return what get_size() needs to compute the memory size of ``obj``.

    Only the data buffer is accounted for: the ndarray object itself and
    auxiliary structures (shape, strides) are excluded.

    get_shape_info() and get_size() work in tandem for the memory
    profiler.  This method runs during function execution, so it should
    stay cheap; get_size() is called later, with this method's output,
    when the memory profile is printed.

    :param obj: the object this Type represents during execution
    :return: a Python object that self.get_size() understands
    """
    return obj.shape
def get_size(self, shape_info):
    """Number of bytes taken by the object described by ``shape_info``.

    :param shape_info: the output of a get_shape_info() call
    :return: the byte count of the described data
    """
    itemsize = numpy.dtype(self.dtype).itemsize
    if not shape_info:
        # an empty shape tuple denotes a scalar
        return itemsize
    return numpy.prod(shape_info) * itemsize
theano.compile.ops.expandable_types += (TensorType,) theano.compile.ops.expandable_types += (TensorType,)
# Register TensorType C code for ViewOp. # Register TensorType C code for ViewOp.
...@@ -5158,8 +5190,8 @@ def batched_dot(x, y): ...@@ -5158,8 +5190,8 @@ def batched_dot(x, y):
iterating over the first dimension using scan. iterating over the first dimension using scan.
Returns a tensor of size e.g. if it is 3D: (dim1, dim3, dim4) Returns a tensor of size e.g. if it is 3D: (dim1, dim3, dim4)
Example: Example:
>>> first = T.tensor3('first') >>> first = tensor.tensor3('first')
>>> second = T.tensor3('second') >>> second = tensor.tensor3('second')
>>> result = batched_dot(first, second) >>> result = batched_dot(first, second)
:note: This is a subset of numpy.einsum, but we do not provide it for now. :note: This is a subset of numpy.einsum, but we do not provide it for now.
But numpy einsum is slower than dot or tensordot: But numpy einsum is slower than dot or tensordot:
......
...@@ -56,6 +56,25 @@ class RandomStateType(gof.Type): ...@@ -56,6 +56,25 @@ class RandomStateType(gof.Type):
return False return False
return True return True
def get_shape_info(self, obj):
    """A RandomState has constant size, so no per-object info is needed."""
    return None
def get_size(self, shape_info):
    """Byte size of a RandomState; constant, so ``shape_info`` is ignored.

    The size is measured from a freshly constructed generator's state
    tuple, since every RandomState carries the same fixed-size data.
    """
    def _elem_bytes(elem):
        # Map each component of the state tuple to its byte count.
        if isinstance(elem, str):
            return len(elem)
        if isinstance(elem, numpy.ndarray):
            return elem.size * elem.itemsize
        if isinstance(elem, int):
            return numpy.dtype("int").itemsize
        if isinstance(elem, float):
            return numpy.dtype("float").itemsize
        raise NotImplementedError()
    return sum(_elem_bytes(e) for e in numpy.random.RandomState().get_state())
# Register RandomStateType's C code for ViewOp. # Register RandomStateType's C code for ViewOp.
theano.compile.register_view_op_c_code( theano.compile.register_view_op_c_code(
RandomStateType, RandomStateType,
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论