提交 5c6712e0 authored 作者: James Bergstra's avatar James Bergstra

merge

import time
import time, atexit
from ..gof.link import WrapLinkerMany
from ..gof.cutils import run_cthunk
from ..compile.mode import Mode, predefined_linkers
from ..compile.mode import Mode, predefined_linkers, register_mode, predefined_modes
from ..gof.cc import OpWiseCLinker
class ProfileMode(Mode):
......@@ -110,3 +110,19 @@ class ProfileMode(Mode):
sum(f for f, t, a in sotimes[n_ops_to_print:])*100,
sum(t for f, t, a in sotimes[n_ops_to_print:]))
print '(*) Op is running a c implementation'
# Register a fresh ProfileMode instance under the name 'PROFILE_MODE'
# (presumably what makes it reachable through predefined_modes -- confirm
# against register_mode in compile.mode).
register_mode('PROFILE_MODE',ProfileMode())
def atexit_print_default_profile_mode():
    """Print the summary of the predefined mode PROFILE_MODE, if it was used.

    This allows the summary to be printed at exit when running with
    THEANO_DEFAULT_MODE=PROFILE_MODE
    """
    profile_mode = predefined_modes["PROFILE_MODE"]
    # Nothing to report unless some time was recorded in local_time[0].
    if not (profile_mode.local_time[0] > 0):
        return
    profile_mode.print_summary()
# Register atexit_print_default_profile_mode so that, if the predefined mode
# PROFILE_MODE was used, its summary is printed when the program terminates.
atexit.register(atexit_print_default_profile_mode)
......@@ -72,7 +72,7 @@ class BROKEN_ON_PURPOSE_StructuredDotCSC(gof.Op):
|| (%(z)s->dimensions[1] != %(b)s->dimensions[1])
)
{
if (%(z)s) Py_DECREF(%(z)s);
{Py_XDECREF(%(z)s);}
npy_intp dims[] = {0,0};
dims[0] = ((npy_int32 *)%(a_nrows)s->data)[0];
dims[1] = %(b)s->dimensions[1];
......@@ -189,13 +189,13 @@ class WeirdBrokenOp(gof.Op):
def c_code(self, node, name, (a,), (z,), sub):
if "inplace" in self.behaviour:
z_code = """
if (%(z)s) Py_DECREF(%(z)s);
{Py_XDECREF(%(z)s);}
Py_INCREF(%(a)s);
%(z)s = %(a)s;
"""
else:
z_code = """
if (%(z)s) Py_DECREF(%(z)s);
{Py_XDECREF(%(z)s);}
%(z)s = (PyArrayObject*) PyArray_SimpleNew(1, %(a)s->dimensions, %(a)s->descr->type_num);
"""
prep_vars = """
......
......@@ -144,18 +144,18 @@ def struct_gen(args, struct_builders, blocks, sub):
PyObject* err_msg = NULL;
PyObject* err_traceback = NULL;
PyErr_Fetch(&err_type, &err_msg, &err_traceback);
if (!err_type) {err_type = Py_None; Py_XINCREF(Py_None);}
if (!err_msg) {err_msg = Py_None; Py_XINCREF(Py_None);}
if (!err_traceback) {err_traceback = Py_None; Py_XINCREF(Py_None);}
if (!err_type) {err_type = Py_None;Py_INCREF(Py_None);}
if (!err_msg) {err_msg = Py_None; Py_INCREF(Py_None);}
if (!err_traceback) {err_traceback = Py_None; Py_INCREF(Py_None);}
PyObject* old_err_type = PyList_GET_ITEM(__ERROR, 0);
PyObject* old_err_msg = PyList_GET_ITEM(__ERROR, 1);
PyObject* old_err_traceback = PyList_GET_ITEM(__ERROR, 2);
PyList_SET_ITEM(__ERROR, 0, err_type);
PyList_SET_ITEM(__ERROR, 1, err_msg);
PyList_SET_ITEM(__ERROR, 2, err_traceback);
Py_XDECREF(old_err_type);
Py_XDECREF(old_err_msg);
Py_XDECREF(old_err_traceback);
{Py_XDECREF(old_err_type);}
{Py_XDECREF(old_err_msg);}
{Py_XDECREF(old_err_traceback);}
}
// The failure code is returned to index what code block failed.
return %(failure_var)s;
......@@ -222,7 +222,7 @@ def get_c_init(r, name, sub):
"""WRITEME"""
pre = "" """
py_%(name)s = Py_None;
Py_XINCREF(py_%(name)s);
{Py_XINCREF(py_%(name)s);}
""" % locals()
return pre + r.type.c_init(name, sub)
......@@ -230,14 +230,14 @@ def get_c_extract(r, name, sub):
"""WRITEME"""
pre = """
py_%(name)s = PyList_GET_ITEM(storage_%(name)s, 0);
Py_XINCREF(py_%(name)s);
{Py_XINCREF(py_%(name)s);}
""" % locals()
return pre + r.type.c_extract(name, sub)
def get_c_cleanup(r, name, sub):
    """Return the C cleanup code for the variable `r`.

    The result is the code returned by ``r.type.c_cleanup(name, sub)``
    followed by a snippet that calls ``Py_XDECREF`` on ``py_<name>``.

    :param r: object whose ``type`` attribute provides ``c_cleanup``.
    :param name: identifier substituted for ``%(name)s`` in the snippet.
    :param sub: passed through unchanged to ``r.type.c_cleanup``.
    """
    # `% locals()` fills %(name)s from the `name` argument.
    post = """
Py_XDECREF(py_%(name)s);
{Py_XDECREF(py_%(name)s);}
""" % locals()
    return r.type.c_cleanup(name, sub) + post
......@@ -247,9 +247,9 @@ def get_c_sync(r, name, sub):
if (!%(failure_var)s) {
%(sync)s
PyObject* old = PyList_GET_ITEM(storage_%(name)s, 0);
Py_XINCREF(py_%(name)s);
{Py_XINCREF(py_%(name)s);}
PyList_SET_ITEM(storage_%(name)s, 0, py_%(name)s);
Py_XDECREF(old);
{Py_XDECREF(old);}
}
""" % dict(sync = r.type.c_sync(name, sub), name = name, **sub)
......
......@@ -28,8 +28,9 @@ class ConvOp(Op):
#TODO: make the stacksize its own parameter, and make imshp a pair
def __init__(self, imshp, kshp, nkern, bsize, dx, dy, output_mode='valid', unroll_batch=0,
unroll_kern=0,
def __init__(self, imshp, kshp, nkern, bsize, dx, dy, output_mode='valid',
unroll_batch=4,
unroll_kern=4,
imshp_logical=None,
kshp_logical=None,
kshp_logical_top_aligned=True):
......@@ -57,6 +58,10 @@ class ConvOp(Op):
unroll_batch. If >0, use a version of the code that unrolls the batch loop by the value of this option. By default, this version of the code is not used.
unroll_kern. Same as unroll_batch, but unrolls the kernel loop.
The version used by default is the one with unroll_batch=4 and unroll_kern=4 when possible (currently it does not support logical shape != physical shape), as this is what gives the best performance in practice. This also means that, to get the best performance, you should have a batch size and a number of kernels that are multiples of 4. In the article:
Anatomy of High-Performance Matrix Multiplication by Kazushige Goto and Robert A. Van De Geijn, ACM Transactions on Mathematical Software, vol 34, No. 3, article 12, May 2008.
Figure 12 gives the values mr x nr; those values are the optimum to use for unroll_batch and unroll_kern. For x86_64 computers it is 4x4. Other architectures can have different values (2x4 for x86, 8x8 for Itanium, ...).
"""
imshp = tuple(imshp)
if len(imshp)==2:
......@@ -517,9 +522,8 @@ if ((!%(z)s)
|| (%(z)s->dimensions[3] != dim_zz[1])
)
{
if (%(z)s) Py_DECREF(%(z)s);
{Py_XDECREF(%(z)s);}
npy_intp dims[4] = {0,0,0,0};
if(!dims) %(fail)s;
dims[0]=%(self_bsize)s;
dims[1]=%(self_nkern)s;
dims[2]=dim_zz[0];
......@@ -746,7 +750,7 @@ if ((!%(z)s)
|| (%(z)s->dimensions[3] != dim_zz[1])
)
{
if (%(z)s) Py_DECREF(%(z)s);
{Py_XDECREF(%(z)s);}
npy_intp dims[4] = {0,0,0,0};
dims[0]=%(self_bsize)s;
dims[1]=%(self_nkern)s;
......@@ -1002,9 +1006,8 @@ if ((!%(z)s)
|| (%(z)s->dimensions[3] != dim_zz[1])
)
{
if (%(z)s) Py_DECREF(%(z)s);
{Py_XDECREF(%(z)s);}
npy_intp dims[4] = {0,0,0,0};
if(!dims) %(fail)s;
dims[0]=%(self_bsize)s;
dims[1]=%(self_nkern)s;
dims[2]=dim_zz[0];
......
......@@ -819,7 +819,7 @@ class StructuredDotCSC(gof.Op):
|| (%(z)s->dimensions[1] != %(b)s->dimensions[1])
)
{
if (%(z)s) Py_DECREF(%(z)s);
{Py_XDECREF(%(z)s);}
npy_intp dims[] = {0,0};
dims[0] = ((npy_int32 *)%(a_nrows)s->data)[0];
dims[1] = %(b)s->dimensions[1];
......@@ -951,7 +951,7 @@ class StructuredDotCSR(gof.Op):
|| (%(z)s->dimensions[1] != %(b)s->dimensions[1]) //b's columns
)
{
if (%(z)s) Py_DECREF(%(z)s);
{Py_XDECREF(%(z)s);}
npy_intp dims[] = {0,0};
dims[0] = %(a_ptr)s->dimensions[0]-1;
dims[1] = %(b)s->dimensions[1];
......
......@@ -394,15 +394,15 @@ class TensorType(Type):
def c_sync(self, name, sub):
    """Override `CLinkerOp.c_sync`.

    Return a C snippet, with ``%(name)s`` filled in from `name`, that:

    - releases the reference currently held in ``py_<name>``;
    - sets ``py_<name>`` to ``Py_None`` when ``<name>`` is NULL, or to
      ``<name>`` cast to ``PyObject*`` when the two pointers differ;
    - takes a new reference on the resulting ``py_<name>``.

    `sub` is accepted for interface compatibility; the template below
    only uses ``%(name)s``.
    """
    return """
Py_XDECREF(py_%(name)s);
{Py_XDECREF(py_%(name)s);}
if (!%(name)s) {
Py_XINCREF(Py_None);
Py_INCREF(Py_None);
py_%(name)s = Py_None;
}
else if ((void*)py_%(name)s != (void*)%(name)s) {
py_%(name)s = (PyObject*)%(name)s;
}
Py_XINCREF(py_%(name)s);
{Py_XINCREF(py_%(name)s);}
""" % locals()
def c_headers(self):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论