merge

5c6712e0 · James Bergstra · d4838fe0 · 30a02a52 · 5c6712e0 · 5c6712e0
--- a/theano/compile/profilemode.py
+++ b/theano/compile/profilemode.py
-import time
+import time, atexit

 from ..gof.link import WrapLinkerMany
 from ..gof.cutils import run_cthunk
-from ..compile.mode import Mode, predefined_linkers
+from ..compile.mode import Mode, predefined_linkers, register_mode, predefined_modes
 from ..gof.cc import OpWiseCLinker

 class ProfileMode(Mode):
@@ -110,3 +110,19 @@ class ProfileMode(Mode):
                  sum(f for f, t, a in sotimes[n_ops_to_print:])*100,
                  sum(t for f, t, a in sotimes[n_ops_to_print:]))
        print '(*) Op is running a c implementation'
+
+
+register_mode('PROFILE_MODE',ProfileMode())
+
+def atexit_print_default_profile_mode():
+    """Print the summary of the predefined mode PROFILE_MODE if used.
+    
+    This allows the summary to be printed at exit when we do
+    THEANO_DEFAULT_MODE=PROFILE_MODE
+    """
+    prof_mode=predefined_modes["PROFILE_MODE"]
+    if prof_mode.local_time[0]>0: prof_mode.print_summary()
+
+#Register atexit_print_default_profile_mode so that the summary of the
+#predefined mode PROFILE_MODE is printed, if it was used, when the program terminates.
+atexit.register(atexit_print_default_profile_mode)
--- a/theano/compile/tests/test_debugmode.py
+++ b/theano/compile/tests/test_debugmode.py
@@ -72,7 +72,7 @@ class BROKEN_ON_PURPOSE_StructuredDotCSC(gof.Op):
            || (%(z)s->dimensions[1] != %(b)s->dimensions[1])
            )
        {
-            if (%(z)s) Py_DECREF(%(z)s);
+            {Py_XDECREF(%(z)s);}
            npy_intp dims[] = {0,0};
            dims[0] = ((npy_int32 *)%(a_nrows)s->data)[0];
            dims[1] = %(b)s->dimensions[1];
@@ -189,13 +189,13 @@ class WeirdBrokenOp(gof.Op):
    def c_code(self, node, name, (a,), (z,), sub):
        if "inplace" in self.behaviour:
            z_code = """
-            if (%(z)s) Py_DECREF(%(z)s);
+            {Py_XDECREF(%(z)s);}
            Py_INCREF(%(a)s);
            %(z)s = %(a)s;
            """
        else:
            z_code = """
-            if (%(z)s) Py_DECREF(%(z)s);
+            {Py_XDECREF(%(z)s);}
            %(z)s = (PyArrayObject*) PyArray_SimpleNew(1, %(a)s->dimensions, %(a)s->descr->type_num);
            """
        prep_vars = """

--- a/theano/gof/cc.py
+++ b/theano/gof/cc.py
@@ -144,18 +144,18 @@ def struct_gen(args, struct_builders, blocks, sub):
            PyObject* err_msg = NULL;
            PyObject* err_traceback = NULL;
            PyErr_Fetch(&err_type, &err_msg, &err_traceback);
-            if (!err_type) {err_type = Py_None; Py_XINCREF(Py_None);}
-            if (!err_msg) {err_msg = Py_None; Py_XINCREF(Py_None);}
-            if (!err_traceback) {err_traceback = Py_None; Py_XINCREF(Py_None);}
+            if (!err_type) {err_type = Py_None;Py_INCREF(Py_None);}
+            if (!err_msg) {err_msg = Py_None; Py_INCREF(Py_None);}
+            if (!err_traceback) {err_traceback = Py_None; Py_INCREF(Py_None);}
            PyObject* old_err_type = PyList_GET_ITEM(__ERROR, 0);
            PyObject* old_err_msg = PyList_GET_ITEM(__ERROR, 1);
            PyObject* old_err_traceback = PyList_GET_ITEM(__ERROR, 2);
            PyList_SET_ITEM(__ERROR, 0, err_type);
            PyList_SET_ITEM(__ERROR, 1, err_msg);
            PyList_SET_ITEM(__ERROR, 2, err_traceback);
-            Py_XDECREF(old_err_type);
-            Py_XDECREF(old_err_msg);
-            Py_XDECREF(old_err_traceback);
+            {Py_XDECREF(old_err_type);}
+            {Py_XDECREF(old_err_msg);}
+            {Py_XDECREF(old_err_traceback);}
        }
        // The failure code is returned to index what code block failed.
        return %(failure_var)s;
@@ -222,7 +222,7 @@ def get_c_init(r, name, sub):
    """WRITEME"""
    pre = "" """
    py_%(name)s = Py_None;
-    Py_XINCREF(py_%(name)s);
+    {Py_XINCREF(py_%(name)s);}
    """ % locals()
    return pre + r.type.c_init(name, sub)

@@ -230,14 +230,14 @@ def get_c_extract(r, name, sub):
    """WRITEME"""
    pre = """
    py_%(name)s = PyList_GET_ITEM(storage_%(name)s, 0);
-    Py_XINCREF(py_%(name)s);
+    {Py_XINCREF(py_%(name)s);}
    """ % locals()
    return pre + r.type.c_extract(name, sub)

 def get_c_cleanup(r, name, sub):
    """WRITEME"""
    post = """
-    Py_XDECREF(py_%(name)s);
+    {Py_XDECREF(py_%(name)s);}
    """ % locals()
    return r.type.c_cleanup(name, sub) + post

@@ -247,9 +247,9 @@ def get_c_sync(r, name, sub):
    if (!%(failure_var)s) {
      %(sync)s
      PyObject* old = PyList_GET_ITEM(storage_%(name)s, 0);
-      Py_XINCREF(py_%(name)s);
+      {Py_XINCREF(py_%(name)s);}
      PyList_SET_ITEM(storage_%(name)s, 0, py_%(name)s);
-      Py_XDECREF(old);
+      {Py_XDECREF(old);}
    }
    """ % dict(sync = r.type.c_sync(name, sub), name = name, **sub)


--- a/theano/sandbox/conv.py
+++ b/theano/sandbox/conv.py
@@ -28,8 +28,9 @@ class ConvOp(Op):

    #TODO: make the stacksize its own parameter, and make imshp a pair

-    def __init__(self, imshp, kshp, nkern, bsize, dx, dy, output_mode='valid', unroll_batch=0,
-            unroll_kern=0,
+    def __init__(self, imshp, kshp, nkern, bsize, dx, dy, output_mode='valid',
+            unroll_batch=4,
+            unroll_kern=4,
            imshp_logical=None,
            kshp_logical=None,
            kshp_logical_top_aligned=True):
@@ -57,6 +58,10 @@ class ConvOp(Op):

        unroll_batch. If >0 will use a version that will unroll the batch loop by the value of the option. By default don't use this version of the code.
        unroll_nkern. idem as unroll_batch but unroll the kernel loop.
+
+        By default, unroll_batch=4 and unroll_kern=4 are used when possible (currently unrolling is not supported when the logical shape != the physical shape), as this is what gives the best performance in practice. This also means that, to get the best performance, the batch size and the number of kernels should be multiples of 4. In the article:
+        Anatomy of High-Performance Matrix Multiplication by Kazushige Goto and Robert A. Van De Geijn, ACM Transactions on Mathematical Software, vol 34, No. 3, article 12, May 2008.
+        Figure 12 gives the values mr x nr; those values are the optimum to use for unroll_batch and unroll_kern. For x86_64 computers it is 4x4. Other architectures can have different values (2x4 for x86, 8x8 for Itanium, ...).
        """
        imshp = tuple(imshp)
        if len(imshp)==2:
@@ -517,9 +522,8 @@ if ((!%(z)s)
  || (%(z)s->dimensions[3] != dim_zz[1])
  )
 {
-  if (%(z)s) Py_DECREF(%(z)s);
+  {Py_XDECREF(%(z)s);}
  npy_intp dims[4] = {0,0,0,0};
-  if(!dims) %(fail)s;
  dims[0]=%(self_bsize)s;
  dims[1]=%(self_nkern)s;
  dims[2]=dim_zz[0];
@@ -746,7 +750,7 @@ if ((!%(z)s)
  || (%(z)s->dimensions[3] != dim_zz[1])
  )
 {
-  if (%(z)s) Py_DECREF(%(z)s);
+  {Py_XDECREF(%(z)s);}
  npy_intp dims[4] = {0,0,0,0};
  dims[0]=%(self_bsize)s;
  dims[1]=%(self_nkern)s;
@@ -1002,9 +1006,8 @@ if ((!%(z)s)
  || (%(z)s->dimensions[3] != dim_zz[1])
  )
 {
-  if (%(z)s) Py_DECREF(%(z)s);
+  {Py_XDECREF(%(z)s);}
  npy_intp dims[4] = {0,0,0,0};
-  if(!dims) %(fail)s;
  dims[0]=%(self_bsize)s;
  dims[1]=%(self_nkern)s;
  dims[2]=dim_zz[0];

--- a/theano/sparse/basic.py
+++ b/theano/sparse/basic.py
@@ -819,7 +819,7 @@ class StructuredDotCSC(gof.Op):
            || (%(z)s->dimensions[1] != %(b)s->dimensions[1])
            )
        {
-            if (%(z)s) Py_DECREF(%(z)s);
+            {Py_XDECREF(%(z)s);}
            npy_intp dims[] = {0,0};
            dims[0] = ((npy_int32 *)%(a_nrows)s->data)[0];
            dims[1] = %(b)s->dimensions[1];
@@ -951,7 +951,7 @@ class StructuredDotCSR(gof.Op):
            || (%(z)s->dimensions[1] != %(b)s->dimensions[1])       //b's columns
            )
        {
-            if (%(z)s) Py_DECREF(%(z)s);
+            {Py_XDECREF(%(z)s);}
            npy_intp dims[] = {0,0};
            dims[0] = %(a_ptr)s->dimensions[0]-1;
            dims[1] = %(b)s->dimensions[1];

--- a/theano/tensor/basic.py
+++ b/theano/tensor/basic.py
@@ -394,15 +394,15 @@ class TensorType(Type):
    def c_sync(self, name, sub):
        """Override `CLinkerOp.c_sync` """
        return """
-        Py_XDECREF(py_%(name)s);
+        {Py_XDECREF(py_%(name)s);}
        if (!%(name)s) {
-            Py_XINCREF(Py_None);
+            Py_INCREF(Py_None);
            py_%(name)s = Py_None;
        }
        else if ((void*)py_%(name)s != (void*)%(name)s) {
            py_%(name)s = (PyObject*)%(name)s;
        }
-        Py_XINCREF(py_%(name)s);
+        {Py_XINCREF(py_%(name)s);}
        """ % locals()

    def c_headers(self):