Parallelized convolution with OpenMP.

583a7553 · Frederic · eb894aae · 583a7553 · 583a7553 · 583a7553
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -54,6 +54,9 @@ Speed up
   it faster in some cases (especially medium/big ouput image) (Frédéric B.)
   (We hardcoded 512 as the maximum number of thread per block. Newer card
    support up to 1024 threads per block.
+ * CPU convolutions are now parallelized with OpenMP (Frédéric B.)
+   By default, all cores/hyper-threads are used.
+   To control this, set the OMP_NUM_THREADS=N environment variable.
 New Features
 * debugprint new param ids=["CHAR", "id", "int", ""]

--- a/doc/install.txt
+++ b/doc/install.txt
@@ -29,6 +29,8 @@ instructions below for detailed installation steps):
        Not technically required but *highly* recommended, in order to compile
        generated C code. Theano `can` fall back on a NumPy-based Python execution
        model, but a C compiler allows for vastly faster execution.
+       g++ >= 4.2 is needed for OpenMP, which is currently always used;
+       a more recent version is recommended.
    `NumPy <http://numpy.scipy.org/>`_ >= 1.3.0
        Earlier versions have memory leaks.

--- a/theano/tensor/nnet/conv.py
+++ b/theano/tensor/nnet/conv.py
@@ -840,10 +840,10 @@ class ConvOp(Op):
        return [din, dw]
    def c_headers(self):
-        return ['<numpy/noprefix.h>', '<iostream>', '<sstream>' ]
+        return ['<numpy/noprefix.h>', '<iostream>', '<sstream>', '<omp.h>' ]
    def c_code_cache_version(self):
-        return (6)
+        return (7)
    def c_support_code(self):
        return """
@@ -1936,8 +1936,15 @@ if (%(z)s->strides[2] != %(z)s->dimensions[3] * sizeof(%(type)s)) %(fail)s;
 if (%(z)s->strides[3] != sizeof(%(type)s)) %(fail)s;
 #pragma omp parallel for schedule(static)
-for(int b=0;b< %(self_bsize)s;b++){
+// We merge the 2 loops into one to make it easier to parallelize on both
-  for(int n_kern=0;n_kern<%(self_nkern)s;n_kern++){
+// This is the equivalent of those 2 lines.
+//for(int b=0;b< %(self_bsize)s;b++){
+// for(int n_kern=0;n_kern<%(self_nkern)s;n_kern++){
+for(int batch_kern_idx=0;
+    batch_kern_idx < %(self_bsize)s * %(self_nkern)s;
+    batch_kern_idx++){
+    int b = batch_kern_idx / %(self_nkern)s;
+    int n_kern = batch_kern_idx %% %(self_nkern)s;
    %(type)s * __restrict__ out=(%(type)s *)(PyArray_GETPTR2(%(z)s,b,n_kern));
    for (int i = 0; i < dim_zz[0]*dim_zz[1]; ++i) out[i] = 0;
@@ -2062,8 +2069,8 @@ for(int b=0;b< %(self_bsize)s;b++){
        }//for iter_n
      }//for iter_m
    }//for stack_size
-  }//for n_kern
+}//for b and n_kern
-}//for b
 Py_XDECREF(img2d);
 Py_XDECREF(filtersflipped);
 """
--- a/theano/tensor/nnet/tests/test_conv.py
+++ b/theano/tensor/nnet/tests/test_conv.py
@@ -25,7 +25,8 @@ class TestConv2D(unittest.TestCase):
                 N_image_shape=None, N_filter_shape=None,
                 input=None, filters=None,
                 unroll_batch=None, unroll_kern=None, unroll_patch=None,
-                 verify_grad=True, should_raise=False):
+                 verify_grad=True, should_raise=False,
+                 speed_only=False):
        if N_image_shape is None:
            N_image_shape = [T.get_constant_value(T.
@@ -64,6 +65,8 @@ class TestConv2D(unittest.TestCase):
            if should_raise:
                raise Exception(
                "ConvOp should have generated an error")
+        if speed_only:
+            return
        ############# REFERENCE IMPLEMENTATION ############
        s = 1.
@@ -368,3 +371,16 @@ class TestConv2D(unittest.TestCase):
        """
        self.validate((1, 10, 213, 129), (46, 10, 212, 1), 'valid',
             verify_grad=False)
+        self.validate((1, 10, 213, 129), (46, 10, 212, 1), 'valid', verify_grad=False)
+    def speed(self):
+        self.validate((10, 10, 16, 16), (5, 10, 8, 8), 'valid',
+                      verify_grad=False,
+                      unroll_patch=True, speed_only=True)
+        """
+        shape: (10, 10, 16, 16), (5, 10, 8, 8)
+        num threads       1          2          4
+        // kern      5.54e-03s  3.12e-03s  1.99e-03s
+        // batch     4.22e-03s  1.59e-03s  1.25e-03s
+        // kern_batch3-5-03s    2.51e-03s  9.15e-04s
+        """