Commit a9a8cb77, authored by lamblin

Merge pull request #681 from nouiz/conv

Parallel Conv
......@@ -54,6 +54,9 @@ Speed up
it faster in some cases (especially for medium/big output images) (Frédéric B.)
(We hardcoded 512 as the maximum number of threads per block. Newer cards
support up to 1024 threads per block.)
* CPU convolutions are now parallelized (Frédéric B.)
By default, all cores/hyper-threads are used.
To control this, use the OMP_NUM_THREADS=N environment variable.
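The default described above can be sketched in plain Python; `openmp_thread_count` is a hypothetical helper (not part of Theano) that mirrors the documented behaviour:

```python
import os
import multiprocessing

def openmp_thread_count():
    # Mirrors the default described above: respect OMP_NUM_THREADS when it
    # is set, otherwise use every core/hyper-thread the machine reports.
    var = os.environ.get('OMP_NUM_THREADS')
    if var is not None:
        return int(var)
    return multiprocessing.cpu_count()
```

Running a script with `OMP_NUM_THREADS=2` in the environment would thus limit the parallel convolution to two threads.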
New Features
* debugprint new param ids=["CHAR", "id", "int", ""]
......
.. _acknowledgement:
Acknowledgement
===============
.. note::
This page is under construction. We are missing sources.
* The developers of `NumPy <http://numpy.scipy.org/>`_. Theano is based on its ndarray object and uses many of its implementations.
* The developers of `SciPy <http://scipy.org/>`_. Our sparse matrix support uses their sparse matrices. We also reused other parts.
* All Theano authors in the commit log.
* All Theano users that gave us feedback.
* The GPU implementation of tensordot is based on code from Tijmen
Tieleman's `gnumpy <http://www.cs.toronto.edu/~tijmen/gnumpy.html>`_
* The original version of the function ``cpuCount()`` in the file
theano/misc/cpucount.py comes from the project `pyprocessing
<http://pyprocessing.berlios.de/>`_. It is under the same license as
Theano.
......@@ -84,6 +84,7 @@ Roughly in order of what you'll want to check out:
* :ref:`developer` -- Primarily of interest to developers of Theano
* :ref:`internal` -- How to maintain Theano, LISA-specific tips, and more...
* :ref:`release` -- How our release should work.
* :ref:`acknowledgement` -- What we took from other projects.
You can download the latest `PDF documentation <http://deeplearning.net/software/theano/theano.pdf>`_, rather than reading it online.
......
......@@ -29,6 +29,8 @@ instructions below for detailed installation steps):
Not technically required but *highly* recommended, in order to compile
generated C code. Theano `can` fall back on a NumPy-based Python execution
model, but a C compiler allows for vastly faster execution.
g++ >= 4.2 (for OpenMP, which is currently always used);
a more recent version is recommended!
`NumPy <http://numpy.scipy.org/>`_ >= 1.3.0
Earlier versions have memory leaks.
......
......@@ -169,6 +169,18 @@ import theano and print the config variable, as in:
and similar functions. It also sets the default theano bit width for
arguments passed as Python floating-point numbers.
.. attribute:: openmp
Bool value: either True or False
Default: True if the environment variable OMP_NUM_THREADS != 1, or
if we detect more than one CPU core. Otherwise False.
Enables or disables parallel computation on the CPU with OpenMP.
It is the default value used when creating an Op that supports it.
It is best to define it via the Theano configuration
file or with the THEANO_FLAGS environment variable.
.. attribute:: cast_policy
String value: either 'numpy+floatX' or 'custom'
......
......@@ -5,7 +5,7 @@ import subprocess
from theano.configparser import (
AddConfigVar, BoolParam, ConfigParam, EnumStr, IntParam,
TheanoConfigParser)
from theano.misc.cpucount import cpuCount
_logger = logging.getLogger('theano.configdefaults')
......@@ -16,6 +16,38 @@ AddConfigVar('floatX',
EnumStr('float64', 'float32'),
)
# True if the environment variable OMP_NUM_THREADS != 1, or
# if we detect more than 1 CPU core. Otherwise False.
default_openmp = True
var = os.getenv('OMP_NUM_THREADS', None)
if var:
try:
int(var)
except ValueError:
raise TypeError("The environment variable OMP_NUM_THREADS"
" should be a number, got '%s'." % var)
else:
default_openmp = not int(var) == 1
else:
count = cpuCount()
if count == -1:
_logger.warning("We are not able to detect the number of CPU cores."
" We disable openmp by default. To remove this"
" warning, set the environment variable"
" OMP_NUM_THREADS to the number of threads you"
" want theano to use.")
default_openmp = count > 1
AddConfigVar('openmp',
"Enable or disable parallel computation on the CPU with OpenMP. "
"It is the default value used when creating an Op that supports it"
". It is best to define it via the Theano configuration "
"file or with the THEANO_FLAGS environment variable.",
BoolParam(default_openmp),
in_c_key=False,
)
AddConfigVar('cast_policy',
"Rules for implicit type casting",
EnumStr('custom', 'numpy+floatX',
......
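The default-detection logic added in this hunk can be exercised standalone. This is an illustrative re-implementation (the `cpu_count` parameter stands in for `theano.misc.cpucount.cpuCount()`):

```python
import os

def default_openmp(cpu_count, env=os.environ):
    # True if OMP_NUM_THREADS != 1, or if more than one CPU core is
    # detected; False otherwise (including when detection fails, i.e. -1).
    var = env.get('OMP_NUM_THREADS')
    if var:
        try:
            return int(var) != 1
        except ValueError:
            raise TypeError("The environment variable OMP_NUM_THREADS"
                            " should be a number, got '%s'." % var)
    return cpu_count > 1
```

Note that `OMP_NUM_THREADS=1` disables OpenMP by default, since one thread gains nothing from parallelism.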
# Copyright (c) 2006-2008, R Oudkerk
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# 3. Neither the name of author nor the names of any contributors may be
# used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
#
# This function was modified from the original code.
# We can't use the multiprocessing module, as it was only added in Python 2.6
# and we support Python 2.4.
import os
import sys
def cpuCount():
'''
Returns the number of CPUs in the system
'''
if sys.platform == 'win32':
try:
num = int(os.environ['NUMBER_OF_PROCESSORS'])
except (ValueError, KeyError):
num = -1
elif sys.platform == 'darwin':
try:
num = int(os.popen('sysctl -n hw.ncpu').read())
except ValueError:
num = -1
else:
try:
num = os.sysconf('SC_NPROCESSORS_ONLN')
except (ValueError, OSError, AttributeError):
num = -1
return num
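For comparison, on Python >= 2.6 the standard library provides the same information; a minimal sketch of an equivalent (the helper name is ours, kept separate only because Theano still supports Python 2.4):

```python
import multiprocessing

def cpu_count_or_minus_one():
    # multiprocessing.cpu_count() raises NotImplementedError when the
    # platform gives no answer; map that to -1 like cpuCount() above.
    try:
        return multiprocessing.cpu_count()
    except NotImplementedError:
        return -1
```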
......@@ -35,8 +35,7 @@ _logger=logging.getLogger("theano.tensor.nnet.conv")
def conv2d(input, filters, image_shape=None, filter_shape=None,
border_mode='valid', subsample=(1,1), **kargs):
"""
This function will build the symbolic graph for convolving a stack of input
images with a set of filters. The implementation is modelled after
Convolutional Neural Networks (CNN). It is simply a wrapper to the ConvOp but
provides a much cleaner interface.
......@@ -64,10 +63,23 @@ def conv2d(input, filters, image_shape=None, filter_shape=None,
Optional, used for optimization.
:param kwargs: kwargs are passed onto ConvOp. Can be used to set the following:
unroll_batch, unroll_kern, unroll_patch, openmp (see ConvOp doc)
openmp: By default, has the same value as
config.openmp. For small image, filter,
batch size, nkern and stack size, it can be
faster to disable OpenMP manually. A quick,
incomplete test showed that with image size
6x6, filter size 4x4, batch size 1,
nkern 1 and stack size 1, it is faster
to disable it in valid mode; but when we
grow the batch size to 10, it is faster
with OpenMP on a Core 2 Duo.
:rtype: symbolic 4D tensor
:return: set of feature maps generated by convolutional layer. Tensor is of shape
(batch size, nb filters, output row, output col)
"""
#accept Constant value for image_shape and filter_shape.
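The output shape named in the docstring follows the usual convolution arithmetic. A hypothetical stride-(1, 1) helper sketching what `ConvOp.getOutputShape` computes for the two border modes:

```python
def conv_output_shape(imshp, kshp, mode='valid'):
    # Output (rows, cols) of a 2D convolution with stride (1, 1).
    if mode == 'valid':
        return (imshp[0] - kshp[0] + 1, imshp[1] - kshp[1] + 1)
    elif mode == 'full':
        return (imshp[0] + kshp[0] - 1, imshp[1] + kshp[1] - 1)
    raise ValueError(mode)
```

For example, the 6x6 image with a 4x4 filter mentioned above yields a 3x3 output in valid mode and 9x9 in full mode.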
......@@ -136,7 +148,9 @@ class ConvOp(Op):
__attrnames = ['imshp', 'kshp', 'nkern', 'bsize', 'dx', 'dy', 'out_mode',
'unroll_batch', 'unroll_kern', 'unroll_patch',
'imshp_logical', 'kshp_logical', 'kshp_logical_top_aligned']
"""These attributes uniquely identify the behaviour of this op for
given inputs. Do not set openmp here.
"""
#the values of speed_unroll_batch_kern, speed_unroll_patch_noshape and speed_unroll_patch_shape
#have been calculated on maggie36 when there was only 1 session logged in and only this was running.
......@@ -202,20 +216,6 @@ class ConvOp(Op):
#valid time, full time
speed_unroll_patch_shape=[1.2967290878295898, 5.5283889770507812]
@staticmethod
def getOutputShape(inshp, kshp, stride=(1,1), mode='valid'):
"""
......@@ -246,7 +246,8 @@ class ConvOp(Op):
kshp_logical=None,
kshp_logical_top_aligned=True,
verbose=0,
version=-1,
openmp=None):
"""
Initializes a ConvOp with given output_mode (full/valid). All other
parameters are optional and are only used to generate more optimized c
......@@ -332,8 +333,11 @@ class ConvOp(Op):
if (unroll_batch>0 or unroll_kern>0) and not all_shape:
raise Exception("In ConvOp, when using unroll_batch and unroll_nkern, all shapes are needed")
if openmp is None:
openmp = theano.config.openmp
if not all_shape or config.openmp:
# Only this version is parallelized
unroll_patch = True
if imshp is not None:
......@@ -357,6 +361,9 @@ class ConvOp(Op):
self.dy=dy
self.verbose=verbose
self.version=version
if openmp is None:
openmp = config.openmp
self.openmp = openmp
# a triple
self.imshp_logical = self.imshp
......@@ -483,6 +490,8 @@ class ConvOp(Op):
def __setstate__(self, d):
self.__dict__.update(d)
if not hasattr(self, "openmp"):
self.openmp = False
self._rehash()
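The `__setstate__` guard above is the standard pattern for unpickling objects created before an attribute existed; a self-contained sketch (the `Legacy` class is illustrative, not Theano code):

```python
class Legacy(object):
    def __init__(self):
        self.verbose = 0
        self.openmp = True

    def __setstate__(self, d):
        self.__dict__.update(d)
        # Objects pickled before the openmp attribute existed get a
        # conservative default so old graphs still unpickle and run.
        if not hasattr(self, 'openmp'):
            self.openmp = False

# Simulate an old pickle whose state dict lacks 'openmp'.
old = Legacy.__new__(Legacy)
old.__setstate__({'verbose': 0})
print(old.openmp)  # prints False
```

Defaulting to False (rather than config.openmp) keeps deserialized old Ops behaving exactly as they did when they were pickled.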
def _rehash(self):
......@@ -854,10 +863,10 @@ class ConvOp(Op):
return [din, dw]
def c_headers(self):
return ['<numpy/noprefix.h>', '<iostream>', '<sstream>', '<omp.h>']
def c_code_cache_version(self):
return (8, self.openmp)
def c_support_code(self):
return """
......@@ -881,16 +890,30 @@ using namespace std;
return True
return False
def c_libraries(self):
if self.use_blas():
return blas.ldflags()
return []
def c_no_compile_args(self):
#when kshp==(1, 1), gcc 4.3.0 segfaults during the
#compilation with -O3. This doesn't happen at -O2.
if theano.gof.cmodule.gcc_version() in ['4.3.0'] and self.kshp==(1, 1):
return ['-O3']
else:
return []
def c_compile_args(self):
ret = []
if self.use_blas():
    ret = blas.ldflags(libs=False, flags=True)
if theano.gof.cmodule.gcc_version() in ['4.3.0'] and self.kshp == (1, 1):
    ret += ['-O2']
if self.openmp:
    ret += ['-fopenmp']
return ret
def c_lib_dirs(self):
if self.use_blas():
......@@ -1205,15 +1228,15 @@ int Os[2];
Os[0]=%(self_outshp0)s;
Os[1]=%(self_outshp1)s;
//assertions
if (%(z)s->strides[0] != %(z)s->dimensions[1] *%(z)s->dimensions[2] *%(z)s->dimensions[3] * (npy_intp)sizeof(%(type)s)) %(fail)s;
if (%(z)s->strides[1] != %(z)s->dimensions[2] * %(z)s->dimensions[3] * (npy_intp)sizeof(%(type)s)) %(fail)s;
if (%(z)s->strides[2] != %(z)s->dimensions[3] * (npy_intp)sizeof(%(type)s)) %(fail)s;
if (%(z)s->strides[3] != (npy_intp)sizeof(%(type)s)) %(fail)s;
for(int b=0;b< %(self_bsize)s;b++){
for(int n_kern=0;n_kern<%(self_nkern)s;n_kern++){
%(type)s * __restrict__ out=(%(type)s *)(PyArray_GETPTR2(%(z)s,b,n_kern));
for (int i = 0; i < dim_zz[0]*dim_zz[1]; ++i) out[i] = 0;
......@@ -1692,14 +1715,15 @@ int Os[2];
Os[0]=%(self_outshp0)s;
Os[1]=%(self_outshp1)s;
//assertions
if (%(z)s->strides[0] != %(z)s->dimensions[1] *%(z)s->dimensions[2] *%(z)s->dimensions[3] * (npy_intp)sizeof(%(type)s)) %(fail)s;
if (%(z)s->strides[1] != %(z)s->dimensions[2] * %(z)s->dimensions[3] * (npy_intp)sizeof(%(type)s)) %(fail)s;
if (%(z)s->strides[2] != %(z)s->dimensions[3] * (npy_intp)sizeof(%(type)s)) %(fail)s;
if (%(z)s->strides[3] != (npy_intp)sizeof(%(type)s)) %(fail)s;
for(int b=0;b< %(self_bsize)s ;b+=%(unroll_bsize)s){
for(int n_kern=0;n_kern<%(self_nkern)s;n_kern+=%(unroll_ksize)s){
"""%d
ret+=my_dup2("%(type)s * __restrict__ out%(unroll_iter)s=(%(type)s *)(PyArray_GETPTR2(%(z)s,b+%(unroll_biter)s,n_kern+%(unroll_kiter)s));")
ret+=my_dup("for (int i = 0; i < dim_zz[0]*dim_zz[1]; ++i) out%(unroll_iter)s[i] = 0;",unroll_bsize*unroll_ksize)
......@@ -1929,14 +1953,24 @@ if ((!%(z)s)
//PyArray_FILLWBYTE((PyObject*)%(z)s,0);
}
//assertions
if (%(z)s->strides[0] != %(z)s->dimensions[1] *%(z)s->dimensions[2] *%(z)s->dimensions[3] * sizeof(%(type)s)) %(fail)s;
if (%(z)s->strides[1] != %(z)s->dimensions[2] * %(z)s->dimensions[3] * sizeof(%(type)s)) %(fail)s;
if (%(z)s->strides[2] != %(z)s->dimensions[3] * sizeof(%(type)s)) %(fail)s;
if (%(z)s->strides[3] != sizeof(%(type)s)) %(fail)s;
//The if on the loop count gives a speedup for small arrays
//with g++ 4.5.1. The compiler should be smart enough to do this itself!
#pragma omp parallel for schedule(static) if(%(self_bsize)s * %(self_nkern)s > 1)
// We merge the 2 loops into one to make it easier to parallelize both.
// This is the equivalent of these 2 lines:
//for(int b=0;b< %(self_bsize)s;b++){
// for(int n_kern=0;n_kern<%(self_nkern)s;n_kern++){
for(int batch_kern_idx=0;
batch_kern_idx < %(self_bsize)s * %(self_nkern)s;
batch_kern_idx++){
int b = batch_kern_idx / %(self_nkern)s;
int n_kern = batch_kern_idx %% %(self_nkern)s;
%(type)s * __restrict__ out=(%(type)s *)(PyArray_GETPTR2(%(z)s,b,n_kern));
for (int i = 0; i < dim_zz[0]*dim_zz[1]; ++i) out[i] = 0;
......@@ -2061,8 +2095,8 @@ for(int b=0;b< %(self_bsize)s;b++){
}//for iter_n
}//for iter_m
}//for stack_size
}//for b and n_kern
Py_XDECREF(img2d);
Py_XDECREF(filtersflipped);
"""
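The loop merge used in the generated C code can be checked in isolation: flattening the two loops and recovering `b` and `n_kern` with `//` and `%` visits exactly the same `(b, n_kern)` pairs in the same order (sizes here are illustrative):

```python
bsize, nkern = 3, 4

# the original nested iteration order
nested = [(b, n_kern) for b in range(bsize) for n_kern in range(nkern)]

# the merged, OpenMP-friendly iteration with the indices recovered
merged = [(idx // nkern, idx % nkern) for idx in range(bsize * nkern)]

print(nested == merged)  # prints True
```

Because each flattened index maps to a distinct pair, OpenMP can split the single loop across threads with no shared writes between iterations.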
......@@ -368,3 +368,41 @@ class TestConv2D(unittest.TestCase):
"""
self.validate((1, 10, 213, 129), (46, 10, 212, 1), 'valid',
verify_grad=False)
def speed(self):
n_calls = 20000
print "n_calls", n_calls
for border_mode in ['valid', 'full']:
print
print border_mode
for openmp in [False, True]:
print "OpenMP", openmp
image_shapes = [(1, 5, 6, 6),
(10, 5, 6, 6),
#(10, 10, 16, 16),
#(10, 10, 32, 32)
]
print "image_shape", image_shapes
for image_shape in image_shapes:
filter_shapes = [(1, 5, 4, 4), (2, 5, 4, 4), (5, 5, 4, 4)]
print "filter_shapes", filter_shapes
for filter_shape in filter_shapes:
input = theano.shared(numpy.random.random(image_shape))
filters = theano.shared(numpy.random.random(filter_shape))
output = conv.conv2d(input, filters,
image_shape, filter_shape,
border_mode,
unroll_patch=True,
openmp=openmp)
mode = theano.Mode(linker=theano.gof.vm.VM_Linker(
allow_gc=False,
use_cloop=True))
theano_conv = theano.function([], output, mode=mode)
t1 = time.time()
theano_conv.fn(n_calls=n_calls)
t2 = time.time()
print t2 - t1,
print