Commit 3431cc8d authored by abergeron

Merge pull request #3701 from nouiz/mgpu_timming

Release the GIL in the new back-end to allow multi-threaded computation.
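Concretely, this lets two Python threads drive two GPU contexts at the same time instead of serializing on the interpreter lock. A minimal sketch of the pattern, not part of this patch (f_ctx1/f_ctx2 are hypothetical stand-ins for compiled theano.function objects, one per context, like those built in the benchmark below):

    import threading

    # Hypothetical stand-ins; in the benchmark these would be compiled
    # theano.function objects bound to 'ctx1' and 'ctx2'.
    f_ctx1 = f_ctx2 = lambda: None

    def worker(fn, out, i):
        # fn() blocks in C; with the GIL released there, both threads
        # can launch and wait on their own GPU in parallel.
        out[i] = fn()

    results = [None, None]
    threads = [threading.Thread(target=worker, args=(fn, results, i))
               for i, fn in enumerate((f_ctx1, f_ctx2))]
    for t in threads:
        t.start()
    for t in threads:
        t.join()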
@@ -5,6 +5,7 @@ and two GPU to measure the speedup.
This should be 2x if the GPUs are equivalent.
"""
import threading
import time
import numpy
@@ -19,46 +20,111 @@ def main(dev1, dev2):
    init_dev(dev1, 'ctx1')
    init_dev(dev2, 'ctx2')
    val1a = shared(numpy.random.randn(1024, 1024).astype('float32'),
                   context_name='ctx1')
    val1b = shared(numpy.random.randn(1024, 1024).astype('float32'),
                   context_name='ctx1')
    val1c = shared(numpy.random.randn(1024, 1024).astype('float32'),
                   context_name='ctx1')
    val1d = shared(numpy.random.randn(1024, 1024).astype('float32'),
                   context_name='ctx1')
    size = 1024 * 16
    data = numpy.random.randn(size, size).astype('float32')
    val1a = shared(data, target='ctx1')
    val1b = shared(data, target='ctx1')
    val1c = shared(data, target='ctx1')
    val1d = shared(data, target='ctx1')
    val2a = shared(numpy.random.randn(1024, 1024).astype('float32'),
                   context_name='ctx2')
    val2b = shared(numpy.random.randn(1024, 1024).astype('float32'),
                   context_name='ctx2')
    val2a = shared(data, target='ctx2')
    val2b = shared(data, target='ctx2')
    f1 = theano.function([], [gpu_dot22(val1a, val1b),
                              gpu_dot22(val1c, val1d)])
    f2 = theano.function([], [gpu_dot22(val1a, val1b),
                              gpu_dot22(val2a, val2b)])
    f3 = theano.function([], [gpu_dot22(val1a, val1b)])
    f4 = theano.function([], [gpu_dot22(val2a, val2b)])
    f5 = theano.function([], [gpu_dot22(val1a, val1b)[0, 0].transfer('cpu')])
    f6 = theano.function([], [gpu_dot22(val2a, val2b)[0, 0].transfer('cpu')])
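    # f1 computes two dots on ctx1; f2 computes one dot per context.
    # f3/f4 each compute a single dot on ctx1/ctx2.  f5/f6 also move one
    # element back to the CPU, which forces a wait for the result.
    # Below, f.fn() calls the compiled function while skipping the
    # Python-level overhead, and .sync() blocks until the GPU has
    # actually finished computing that array.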
    r = f1()
    # pre-execute to load code to GPU.
    r = f1.fn()
    r[0].sync(), r[1].sync()
    r = f2.fn()
    r[0].sync(), r[1].sync()
    r = f3.fn()
    r[0].sync()
    r = f4.fn()
    r[0].sync()
    r = f5.fn()
    r = f6.fn()
    r = None
    t = time.time()
    r = f1()
    r = f1.fn()
    r[0].sync(), r[1].sync()
    t2 = time.time()
    r = None
    print("one ctx %f" % (t2 - t,))
    print("one ctx async %f" % (t2 - t,))
    r = f2()
    t = time.time()
    r = f2.fn()
    r[0].sync(), r[1].sync()
    t2 = time.time()
    r = None
    print("two ctx async %f" % (t2 - t,))
    t = time.time()
    r = f2()
    r[0].sync(), r[1].sync()
    r = f3.fn()
    r2 = f4.fn()
    r[0].sync()
    r2[0].sync()
    t2 = time.time()
    r = None
    print("two ctx, 2 fct async %f" % (t2 - t,))
    t = time.time()
    r = f5.fn()
    r2 = f6.fn()
    t2 = time.time()
    r = None
    print("two ctx, 2 fct with transfer %f" % (t2 - t,))
    # Multi-thread version
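    # The threads below call the same compiled functions concurrently.
    # Since the back-end now releases the GIL around the blocking GPU
    # calls, the two threads can keep both contexts busy at once.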
    class myThread(threading.Thread):
        def __init__(self, name, f, sync):
            threading.Thread.__init__(self)
            self.f = f
            self.name = name
            self.sync = sync
        def run(self):
            # print "Starting " + self.name
            # r = self.f.fn(n_calls=10)
            r = self.f()
            # print "End " + self.name
            if self.sync:
                r[0].sync()
            self.r = r
            # print "Exiting " + self.name
    thread1 = myThread("Thread-3", f3, True)
    thread2 = myThread("Thread-4", f4, True)
    t = time.time()
    thread1.start()
    thread2.start()
    thread1.join()
    thread2.join()
    t2 = time.time()
    print("two ctx, 2 fct async, 2 threads %f" % (t2 - t,))
    thread1 = myThread("Thread-5", f5, False)
    thread2 = myThread("Thread-6", f6, False)
    t = time.time()
    thread1.start()
    thread2.start()
    thread1.join()
    thread2.join()
    t2 = time.time()
    print("two ctx, 2 fct with transfer, 2 threads %f" % (t2 - t,))
    print("two ctx %f" % (t2 - t,))
if __name__ == '__main__':
    import sys
......
@@ -325,9 +325,11 @@ class HostFromGpu(Op):
            if (%(name)s_ga == &%(name)s_ga_s) GpuArray_clear(%(name)s_ga);
            %(fail)s
        }
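        /* Release the GIL for the blocking device->host copy so other
           Python threads can run while the transfer is in flight. */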
        Py_BEGIN_ALLOW_THREADS
        %(name)serr = GpuArray_read(PyArray_DATA(%(out)s),
                                    PyArray_NBYTES(%(out)s),
                                    %(name)s_ga);
        Py_END_ALLOW_THREADS
        if (%(name)s_ga == &%(name)s_ga_s) GpuArray_clear(%(name)s_ga);
        if (%(name)serr != GA_NO_ERROR) {
            PyErr_SetString(PyExc_RuntimeError, "Could not read device data.");
@@ -337,7 +339,7 @@ class HostFromGpu(Op):
'out': outputs[0]}
    def c_code_cache_version(self):
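        # Version bumped (1 -> 2): the generated C code changed, so
        # previously cached modules must be recompiled.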
        return (1,)
        return (2,)
    def grad(self, inputs, grads):
        gz, = grads
@@ -408,8 +410,10 @@ class GpuFromHost(Op):
        theano_size_check(%(out)s, PyArray_NDIM(%(name)s_tmp),
                          (size_t *)PyArray_DIMS(%(name)s_tmp),
                          get_typecode((PyObject *)PyArray_DESCR(%(name)s_tmp)))) {
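            /* Declare err outside the macro pair: Py_BEGIN_ALLOW_THREADS
               opens a new C scope, so a variable defined inside it would
               not be visible after Py_END_ALLOW_THREADS.  The GIL is
               dropped for the blocking host->device copy. */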
            int err;
            Py_BEGIN_ALLOW_THREADS
            err = GpuArray_write(&%(out)s->ga, PyArray_DATA(%(name)s_tmp),
                                 PyArray_NBYTES(%(name)s_tmp));
            Py_END_ALLOW_THREADS
            Py_DECREF(%(name)s_tmp);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError, "Could not write data to gpu");
@@ -433,7 +437,7 @@ class GpuFromHost(Op):
'out': outputs[0], 'fail': sub['fail']}
    def c_code_cache_version(self):
        return (8,)
        return (9,)
class GpuToGpu(Op):
......