Merge pull request #681 from nouiz/conv

Parallel Conv

Merge pull request #681 from nouiz/conv
a9a8cb77 · lamblin · 9aa36490 · d40f534a · a9a8cb77 · a9a8cb77
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -54,6 +54,9 @@ Speed up
   it faster in some cases (especially medium/big ouput image) (Frédéric B.)
   (We hardcoded 512 as the maximum number of thread per block. Newer card
    support up to 1024 threads per block.
+ * CPU convolutions are now parallelized (Frédéric B.)
+   By default, all cores/hyper-threads are used.
+   To control this, use the OMP_NUM_THREADS=N environment variable.

 New Features
 * debugprint new param ids=["CHAR", "id", "int", ""]

--- a/doc/acknowledgement.txt
+++ b/doc/acknowledgement.txt
+.. _acknowledgement:
+
+
+Acknowledgement
+===============
+
+.. note::
+
+   This page is under construction. We are missing sources.
+
+
+* The developers of `NumPy <http://numpy.scipy.org/>`_. Theano is based on its ndarray object and uses much of its implementation.
+* The developers of `SciPy <http://scipy.org/>`_. Our sparse matrix support uses their sparse matrices. We also reused other parts.
+* All Theano authors in the commit log.
+* All Theano users who gave us feedback.
+* The GPU implementation of tensordot is based on code from Tijmen
+  Tieleman's `gnumpy <http://www.cs.toronto.edu/~tijmen/gnumpy.html>`_
+* The original version of the function ``cpuCount()`` in the file
+  theano/misc/cpucount.py comes from the project `pyprocessing
+  <http://pyprocessing.berlios.de/>`_. It is under the same license as
+  Theano.
--- a/doc/index.txt
+++ b/doc/index.txt
@@ -84,6 +84,7 @@ Roughly in order of what you'll want to check out:
 * :ref:`developer` -- Primarily of interest to developers of Theano
 * :ref:`internal` -- How to maintain Theano, LISA-specific tips, and more...
 * :ref:`release` -- How our release should work.
+* :ref:`acknowledgement` -- What we took from other projects.

 You can download the latest `PDF documentation <http://deeplearning.net/software/theano/theano.pdf>`_, rather than reading it online.


--- a/doc/install.txt
+++ b/doc/install.txt
@@ -29,6 +29,8 @@ instructions below for detailed installation steps):
        Not technically required but *highly* recommended, in order to compile
        generated C code. Theano `can` fall back on a NumPy-based Python execution
        model, but a C compiler allows for vastly faster execution.
+       g++ >= 4.2 is required for OpenMP, which is currently always used;
+       a more recent version is recommended.

    `NumPy <http://numpy.scipy.org/>`_ >= 1.3.0
        Earlier versions have memory leaks.

--- a/doc/library/config.txt
+++ b/doc/library/config.txt
@@ -169,6 +169,18 @@ import theano and print the config variable, as in:
    and similar functions.  It also sets the default theano bit width for
    arguments passed as Python floating-point numbers.

+.. attribute:: openmp
+
+    Bool value: either True or False
+
+    Default: True if the environment variable OMP_NUM_THREADS is set to a
+             value other than 1 or, when it is not set, if we detect more
+
+    than 1 CPU core. Otherwise False.
+    Enables or disables parallel computation on the CPU with OpenMP.
+    It is the default value used when creating an Op that supports it.
+    It is best to define it via the Theano configuration file or with the environment variable THEANO_FLAGS.
+
 .. attribute:: cast_policy

    String value: either 'numpy+floatX' or 'custom'

--- a/theano/configdefaults.py
+++ b/theano/configdefaults.py
@@ -5,7 +5,7 @@ import subprocess
 from theano.configparser import (
        AddConfigVar, BoolParam, ConfigParam, EnumStr, IntParam,
        TheanoConfigParser)
-
+from theano.misc.cpucount import cpuCount

 _logger = logging.getLogger('theano.configdefaults')

@@ -16,6 +16,38 @@ AddConfigVar('floatX',
        EnumStr('float64', 'float32'),
        )

+# Default value of the ``openmp`` flag:
+# - OMP_NUM_THREADS set: enabled unless it is exactly 1;
+# - OMP_NUM_THREADS unset or empty: enabled if cpuCount() reports > 1 core.
+# cpuCount() is adapted from pyprocessing (http://pyprocessing.berlios.de/).
+default_openmp = True
+var = os.getenv('OMP_NUM_THREADS', None)
+if not var:
+    count = cpuCount()
+    if count == -1:
+        # cpuCount() reports a failed detection as -1.
+        _logger.warning("We are not able to detect the number of CPU cores."
+                        " We disable openmp by default. To remove this"
+                        " warning, set the environment variable"
+                        " OMP_NUM_THREADS to the number of threads you"
+                        " want theano to use.")
+    default_openmp = count > 1
+else:
+    try:
+        default_openmp = int(var) != 1
+    except ValueError:
+        raise TypeError("The environment variable OMP_NUM_THREADS"
+                        " should be a number, got '%s'." % var)
+
+AddConfigVar('openmp',
+             "Enable or not parallel computation on the CPU with OpenMP. "
+             "It is the default value used when creating an Op that support it"
+             ". The best is to define it via Theano configuration "
+             "file or with the environment variable THEANO_FLAGS.",
+             BoolParam(default_openmp),
+             in_c_key=False,
+         )
+
 AddConfigVar('cast_policy',
        "Rules for implicit type casting",
        EnumStr('custom', 'numpy+floatX',

--- a/theano/misc/cpucount.py
+++ b/theano/misc/cpucount.py
+# Copyright (c) 2006-2008, R Oudkerk
+# All rights reserved.
+# 
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 
+# 1. Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+# 3. Neither the name of author nor the names of any contributors may be
+#    used to endorse or promote products derived from this software
+#    without specific prior written permission.
+# 
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+
+# This function was modified from the original code.
+# We can't use the multiprocessing module, as it was only added in
+# Python 2.6 and we support Python 2.4.
+
+import os
+import sys
+
+
+def cpuCount():
+    '''
+    Return the number of CPUs in the system, or -1 if it cannot be found.
+
+    The lookup is platform specific:
+
+    * ``win32``: the ``NUMBER_OF_PROCESSORS`` environment variable;
+    * ``darwin``: the output of ``sysctl -n hw.ncpu``;
+    * anything else: ``os.sysconf('SC_NPROCESSORS_ONLN')``.
+
+    :return: the detected count, or -1 when the value is missing or
+        cannot be parsed.
+    '''
+    if sys.platform == 'win32':
+        try:
+            num = int(os.environ['NUMBER_OF_PROCESSORS'])
+        except (ValueError, KeyError):
+            num = -1
+    elif sys.platform == 'darwin':
+        pipe = os.popen('sysctl -n hw.ncpu')
+        # Nested try blocks: Python 2.4 has neither the ``with`` statement
+        # nor a combined try/except/finally.
+        try:
+            try:
+                num = int(pipe.read())
+            except ValueError:
+                num = -1
+        finally:
+            # Always close the pipe so the file object is not leaked.
+            pipe.close()
+    else:
+        try:
+            num = os.sysconf('SC_NPROCESSORS_ONLN')
+        except (ValueError, OSError, AttributeError):
+            num = -1
+
+    return num
--- a/theano/tensor/nnet/conv.py
+++ b/theano/tensor/nnet/conv.py
--- a/theano/tensor/nnet/tests/test_conv.py
+++ b/theano/tensor/nnet/tests/test_conv.py
@@ -368,3 +368,41 @@ class TestConv2D(unittest.TestCase):
        """
        self.validate((1, 10, 213, 129), (46, 10, 212, 1), 'valid',
             verify_grad=False)
+        self.validate((1, 10, 213, 129), (46, 10, 212, 1), 'valid', verify_grad=False)
+
+    def speed(self):
+        """Print the wall time of ``n_calls`` conv2d runs per configuration.
+
+        Covers both border modes, OpenMP off/on and several shapes."""
+        n_calls = 20000
+        image_shapes = [(1, 5, 6, 6),
+                        (10, 5, 6, 6),
+                        #(10, 10, 16, 16),
+                        #(10, 10, 32, 32)
+        ]
+        filter_shapes = [(1, 5, 4, 4), (2, 5, 4, 4), (5, 5, 4, 4)]
+        print "n_calls", n_calls
+        for border_mode in ('valid', 'full'):
+            print
+            print border_mode
+            for openmp in (False, True):
+                print "OpenMP", openmp
+                print "image_shape", image_shapes
+                for image_shape in image_shapes:
+                    print "filter_shapes", filter_shapes
+                    for filter_shape in filter_shapes:
+                        images = theano.shared(numpy.random.random(image_shape))
+                        kernels = theano.shared(numpy.random.random(filter_shape))
+                        conv_out = conv.conv2d(images, kernels,
+                                               image_shape, filter_shape,
+                                               border_mode,
+                                               unroll_patch=True,
+                                               openmp=openmp)
+                        linker = theano.gof.vm.VM_Linker(allow_gc=False,
+                                                         use_cloop=True)
+                        theano_conv = theano.function(
+                            [], conv_out, mode=theano.Mode(linker=linker))
+                        start = time.time()
+                        theano_conv.fn(n_calls=n_calls)
+                        elapsed = time.time() - start
+                        print elapsed,
+                    print