Commit 3431cc8d authored by abergeron

Merge pull request #3701 from nouiz/mgpu_timming

Release the GIL in the new back-end to allow multi-threaded computation.
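Concretely, this lets two Python threads drive two GPU contexts at the same time instead of serializing on the interpreter lock. A minimal sketch of the pattern, not part of this patch (f_ctx1/f_ctx2 are hypothetical stand-ins for compiled theano.function objects, one per context, like those built in the benchmark below):

    import threading

    # Hypothetical stand-ins; in the benchmark these would be compiled
    # theano.function objects bound to 'ctx1' and 'ctx2'.
    f_ctx1 = f_ctx2 = lambda: None

    def worker(fn, out, i):
        # fn() blocks in C; with the GIL released there, both threads
        # can launch and wait on their own GPU in parallel.
        out[i] = fn()

    results = [None, None]
    threads = [threading.Thread(target=worker, args=(fn, results, i))
               for i, fn in enumerate((f_ctx1, f_ctx2))]
    for t in threads:
        t.start()
    for t in threads:
        t.join()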
@@ -5,6 +5,7 @@ and two GPU to measure the speedup.
This should be 2x if the GPUs are equivalent.
"""
import threading
import time
import numpy
@@ -19,46 +20,111 @@ def main(dev1, dev2):
    init_dev(dev1, 'ctx1')
    init_dev(dev2, 'ctx2')
    val1a = shared(numpy.random.randn(1024, 1024).astype('float32'),
                   context_name='ctx1')
    val1b = shared(numpy.random.randn(1024, 1024).astype('float32'),
                   context_name='ctx1')
    val1c = shared(numpy.random.randn(1024, 1024).astype('float32'),
                   context_name='ctx1')
    val1d = shared(numpy.random.randn(1024, 1024).astype('float32'),
                   context_name='ctx1')
    size = 1024 * 16
    data = numpy.random.randn(size, size).astype('float32')
    val1a = shared(data, target='ctx1')
    val1b = shared(data, target='ctx1')
    val1c = shared(data, target='ctx1')
    val1d = shared(data, target='ctx1')
    val2a = shared(numpy.random.randn(1024, 1024).astype('float32'),
                   context_name='ctx2')
    val2b = shared(numpy.random.randn(1024, 1024).astype('float32'),
                   context_name='ctx2')
    val2a = shared(data, target='ctx2')
    val2b = shared(data, target='ctx2')
    f1 = theano.function([], [gpu_dot22(val1a, val1b),
                              gpu_dot22(val1c, val1d)])
    f2 = theano.function([], [gpu_dot22(val1a, val1b),
                              gpu_dot22(val2a, val2b)])
    f3 = theano.function([], [gpu_dot22(val1a, val1b)])
    f4 = theano.function([], [gpu_dot22(val2a, val2b)])
    f5 = theano.function([], [gpu_dot22(val1a, val1b)[0, 0].transfer('cpu')])
    f6 = theano.function([], [gpu_dot22(val2a, val2b)[0, 0].transfer('cpu')])
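    # f1 computes two dots on ctx1; f2 computes one dot per context.
    # f3/f4 each compute a single dot on ctx1/ctx2.  f5/f6 also move one
    # element back to the CPU, which forces a wait for the result.
    # Below, f.fn() calls the compiled function while skipping the
    # Python-level overhead, and .sync() blocks until the GPU has
    # actually finished computing that array.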
    r = f1()
    # pre-execute to load code to GPU.
    r = f1.fn()
    r[0].sync(), r[1].sync()
    r = f2.fn()
    r[0].sync(), r[1].sync()
    r = f3.fn()
    r[0].sync()
    r = f4.fn()
    r[0].sync()
    r = f5.fn()
    r = f6.fn()
    r = None
    t = time.time()
    r = f1()
    r = f1.fn()
    r[0].sync(), r[1].sync()
    t2 = time.time()
    r = None
    print("one ctx %f" % (t2 - t,))
    print("one ctx async %f" % (t2 - t,))
    r = f2()
    t = time.time()
    r = f2.fn()
    r[0].sync(), r[1].sync()
    t2 = time.time()
    r = None
    print("two ctx async %f" % (t2 - t,))
    t = time.time()
    r = f2()
    r[0].sync(), r[1].sync()
    r = f3.fn()
    r2 = f4.fn()
    r[0].sync()
    r2[0].sync()
    t2 = time.time()
    r = None
    print("two ctx, 2 fct async %f" % (t2 - t,))
    t = time.time()
    r = f5.fn()
    r2 = f6.fn()
    t2 = time.time()
    r = None
    print("two ctx, 2 fct with transfer %f" % (t2 - t,))
    # Multi-thread version
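    # The threads below call the same compiled functions concurrently.
    # Since the back-end now releases the GIL around the blocking GPU
    # calls, the two threads can keep both contexts busy at once.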
    class myThread(threading.Thread):
        def __init__(self, name, f, sync):
            threading.Thread.__init__(self)
            self.f = f
            self.name = name
            self.sync = sync
        def run(self):
            # print "Starting " + self.name
            # r = self.f.fn(n_calls=10)
            r = self.f()
            # print "End " + self.name
            if self.sync:
                r[0].sync()
            self.r = r
            # print "Exiting " + self.name
    thread1 = myThread("Thread-3", f3, True)
    thread2 = myThread("Thread-4", f4, True)
    t = time.time()
    thread1.start()
    thread2.start()
    thread1.join()
    thread2.join()
    t2 = time.time()
    print("two ctx, 2 fct async, 2 threads %f" % (t2 - t,))
    thread1 = myThread("Thread-5", f5, False)
    thread2 = myThread("Thread-6", f6, False)
    t = time.time()
    thread1.start()
    thread2.start()
    thread1.join()
    thread2.join()
    t2 = time.time()
    print("two ctx, 2 fct with transfer, 2 threads %f" % (t2 - t,))
    print("two ctx %f" % (t2 - t,))
if __name__ == '__main__':
    import sys
......
@@ -325,9 +325,11 @@ class HostFromGpu(Op):
            if (%(name)s_ga == &%(name)s_ga_s) GpuArray_clear(%(name)s_ga);
            %(fail)s
        }
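        /* Release the GIL for the blocking device->host copy so other
           Python threads can run while the transfer is in flight. */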
        Py_BEGIN_ALLOW_THREADS
        %(name)serr = GpuArray_read(PyArray_DATA(%(out)s),
                                    PyArray_NBYTES(%(out)s),
                                    %(name)s_ga);
        Py_END_ALLOW_THREADS
        if (%(name)s_ga == &%(name)s_ga_s) GpuArray_clear(%(name)s_ga);
        if (%(name)serr != GA_NO_ERROR) {
            PyErr_SetString(PyExc_RuntimeError, "Could not read device data.");
@@ -337,7 +339,7 @@ class HostFromGpu(Op):
'out': outputs[0]}
    def c_code_cache_version(self):
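        # Version bumped (1 -> 2): the generated C code changed, so
        # previously cached modules must be recompiled.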
        return (1,)
        return (2,)
    def grad(self, inputs, grads):
        gz, = grads
@@ -408,8 +410,10 @@ class GpuFromHost(Op):
        theano_size_check(%(out)s, PyArray_NDIM(%(name)s_tmp),
                          (size_t *)PyArray_DIMS(%(name)s_tmp),
                          get_typecode((PyObject *)PyArray_DESCR(%(name)s_tmp)))) {
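            /* Declare err outside the macro pair: Py_BEGIN_ALLOW_THREADS
               opens a new C scope, so a variable defined inside it would
               not be visible after Py_END_ALLOW_THREADS.  The GIL is
               dropped for the blocking host->device copy. */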
            int err;
            Py_BEGIN_ALLOW_THREADS
            err = GpuArray_write(&%(out)s->ga, PyArray_DATA(%(name)s_tmp),
                                 PyArray_NBYTES(%(name)s_tmp));
            Py_END_ALLOW_THREADS
            Py_DECREF(%(name)s_tmp);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError, "Could not write data to gpu");
@@ -433,7 +437,7 @@ class GpuFromHost(Op):
'out': outputs[0], 'fail': sub['fail']}
    def c_code_cache_version(self):
        return (8,)
        return (9,)
class GpuToGpu(Op):
......