Commit 3431cc8d authored by abergeron

Merge pull request #3701 from nouiz/mgpu_timming

Release the GIL in the new back-end to allow multi-threaded computation.
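Background for this note (editorial, not part of the commit): Python's global interpreter lock (GIL) lets only one thread execute Python bytecode at a time, so two host threads driving two GPU contexts can only overlap if the C code doing the blocking device work releases the GIL around those calls. A minimal, self-contained illustration of the effect, using numpy (whose BLAS-backed dot releases the GIL) against a pure-Python loop that holds it:

```python
# Illustration only -- not from the commit. numpy's BLAS dot releases the
# GIL, so two threads overlap; the pure-Python loop holds the GIL, so two
# threads serialize and take roughly twice as long as one.
import threading
import time

import numpy

a = numpy.random.randn(2000, 2000)

def gil_releasing():
    numpy.dot(a, a)  # BLAS call; the GIL is released while it runs

def gil_holding():
    s = 0
    for i in range(10**7):  # bytecode loop; the GIL is held throughout
        s += i

def timed(f):
    threads = [threading.Thread(target=f) for _ in range(2)]
    t = time.time()
    for th in threads:
        th.start()
    for th in threads:
        th.join()
    return time.time() - t

print("2 threads, GIL released: %f" % timed(gil_releasing))
print("2 threads, GIL held:     %f" % timed(gil_holding))
```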
@@ -5,6 +5,7 @@ and two GPU to measure the speedup.
 This should be 2x if the GPUs are equivalent.
 """
+import threading
 import time

 import numpy
@@ -19,46 +20,111 @@ def main(dev1, dev2):
     init_dev(dev1, 'ctx1')
     init_dev(dev2, 'ctx2')
-    val1a = shared(numpy.random.randn(1024, 1024).astype('float32'),
-                   context_name='ctx1')
-    val1b = shared(numpy.random.randn(1024, 1024).astype('float32'),
-                   context_name='ctx1')
-    val1c = shared(numpy.random.randn(1024, 1024).astype('float32'),
-                   context_name='ctx1')
-    val1d = shared(numpy.random.randn(1024, 1024).astype('float32'),
-                   context_name='ctx1')
-    val2a = shared(numpy.random.randn(1024, 1024).astype('float32'),
-                   context_name='ctx2')
-    val2b = shared(numpy.random.randn(1024, 1024).astype('float32'),
-                   context_name='ctx2')
+    size = 1024 * 16
+    data = numpy.random.randn(size, size).astype('float32')
+    val1a = shared(data, target='ctx1')
+    val1b = shared(data, target='ctx1')
+    val1c = shared(data, target='ctx1')
+    val1d = shared(data, target='ctx1')
+    val2a = shared(data, target='ctx2')
+    val2b = shared(data, target='ctx2')
     f1 = theano.function([], [gpu_dot22(val1a, val1b),
                               gpu_dot22(val1c, val1d)])
     f2 = theano.function([], [gpu_dot22(val1a, val1b),
                               gpu_dot22(val2a, val2b)])
+    f3 = theano.function([], [gpu_dot22(val1a, val1b)])
+    f4 = theano.function([], [gpu_dot22(val2a, val2b)])
+    f5 = theano.function([], [gpu_dot22(val1a, val1b)[0, 0].transfer('cpu')])
+    f6 = theano.function([], [gpu_dot22(val2a, val2b)[0, 0].transfer('cpu')])

-    r = f1() # pre-execute to load code to GPU.
+    r = f1.fn() # pre-execute to load code to GPU.
     r[0].sync(), r[1].sync()
+    r = f2.fn()
+    r[0].sync(), r[1].sync()
+    r = f3.fn()
+    r[0].sync()
+    r = f4.fn()
+    r[0].sync()
+    r = f5.fn()
+    r = f6.fn()
     r = None

     t = time.time()
-    r = f1()
+    r = f1.fn()
     r[0].sync(), r[1].sync()
     t2 = time.time()
     r = None
-    print("one ctx %f" % (t2 - t,))
+    print("one ctx async %f" % (t2 - t,))

-    r = f2()
-    r[0].sync(), r[1].sync()
-    r = None
-
     t = time.time()
-    r = f2()
+    r = f2.fn()
     r[0].sync(), r[1].sync()
     t2 = time.time()
     r = None
-    print("two ctx %f" % (t2 - t,))
+    print("two ctx async %f" % (t2 - t,))
+
+    t = time.time()
+    r = f3.fn()
+    r2 = f4.fn()
+    r[0].sync()
+    r2[0].sync()
+    t2 = time.time()
+    r = None
+    print("two ctx, 2 fct async %f" % (t2 - t,))
+
+    t = time.time()
+    r = f5.fn()
+    r2 = f6.fn()
+    t2 = time.time()
+    r = None
+    print("two ctx, 2 fct with transfer %f" % (t2 - t,))
+
+    # Multi-thread version
+    class myThread (threading.Thread):
+        def __init__(self, name, f, sync):
+            threading.Thread.__init__(self)
+            self.f = f
+            self.name = name
+            self.sync = sync
+
+        def run(self):
+            # print "Starting " + self.name
+            # r = self.f.fn(n_calls=10)
+            r = self.f()
+            # print "End " + self.name
+            if self.sync:
+                r[0].sync()
+            self.r = r
+            # print "Exiting " + self.name
+
+    thread1 = myThread("Thread-3", f3, True)
+    thread2 = myThread("Thread-4", f4, True)
+    t = time.time()
+    thread1.start()
+    thread2.start()
+    thread1.join()
+    thread2.join()
+    t2 = time.time()
+    print("two ctx, 2 fct async, 2 threads %f" % (t2 - t,))
+
+    thread1 = myThread("Thread-5", f5, False)
+    thread2 = myThread("Thread-6", f6, False)
+    t = time.time()
+    thread1.start()
+    thread2.start()
+    thread1.join()
+    thread2.join()
+    t2 = time.time()
+    print("two ctx, 2 fct with transfer, 2 threads %f" % (t2 - t,))
 if __name__ == '__main__':
     import sys
...
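A note on the measurement pattern above (editorial, not part of the commit): calling `f.fn()` invokes the function's compiled thunk directly, skipping the Python-level input/output handling of `f()`, and it returns GPU arrays whose kernels may still be queued; `GpuArray.sync()` blocks until the device work on that array has finished. The "async" timings therefore let the two dot products be dispatched without waiting on each other, with the `sync()` calls at the end covering completion, while the transfer variants (`f5`/`f6`) synchronize implicitly through the CPU copy. The two-thread runs check that, with the GIL released, two host threads can keep two contexts busy at once. A sketch of the same two-thread timing written with `concurrent.futures` instead of the `myThread` helper (dummy workloads standing in for `f3`/`f4`):

```python
# Sketch of the benchmark's two-thread timing pattern. `work1`/`work2`
# are stand-ins for the Theano function calls; futures replace the
# hand-rolled thread subclass that stores its result on `self.r`.
import time
from concurrent.futures import ThreadPoolExecutor

def work1():
    return sum(i * i for i in range(10**6))

def work2():
    return sum(i * i for i in range(10**6))

with ThreadPoolExecutor(max_workers=2) as pool:
    t = time.time()
    fut1 = pool.submit(work1)          # both submissions return immediately
    fut2 = pool.submit(work2)
    r1, r2 = fut1.result(), fut2.result()  # join: wait for both to finish
    print("2 threads %f" % (time.time() - t))
```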
@@ -325,9 +325,11 @@ class HostFromGpu(Op):
             if (%(name)s_ga == &%(name)s_ga_s) GpuArray_clear(%(name)s_ga);
             %(fail)s
         }
+        Py_BEGIN_ALLOW_THREADS
         %(name)serr = GpuArray_read(PyArray_DATA(%(out)s),
                                     PyArray_NBYTES(%(out)s),
                                     %(name)s_ga);
+        Py_END_ALLOW_THREADS
         if (%(name)s_ga == &%(name)s_ga_s) GpuArray_clear(%(name)s_ga);
         if (%(name)serr != GA_NO_ERROR) {
             PyErr_SetString(PyExc_RuntimeError, "Could not read device data.");
@@ -337,7 +339,7 @@ class HostFromGpu(Op):
                'out': outputs[0]}

     def c_code_cache_version(self):
-        return (1,)
+        return (2,)

     def grad(self, inputs, grads):
         gz, = grads
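Editorial note: the bump of `c_code_cache_version` from `(1,)` to `(2,)` here (and from `(8,)` to `(9,)` below) is what makes the change take effect for existing installations. Theano keys its compilation cache on this version, so without the bump the previously compiled module, built without the GIL-release macros, would keep being loaded instead of the new code.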
@@ -408,8 +410,10 @@ class GpuFromHost(Op):
             theano_size_check(%(out)s, PyArray_NDIM(%(name)s_tmp),
                               (size_t *)PyArray_DIMS(%(name)s_tmp),
                               get_typecode((PyObject *)PyArray_DESCR(%(name)s_tmp)))) {
+        Py_BEGIN_ALLOW_THREADS
         int err = GpuArray_write(&%(out)s->ga, PyArray_DATA(%(name)s_tmp),
                                  PyArray_NBYTES(%(name)s_tmp));
+        Py_END_ALLOW_THREADS
         Py_DECREF(%(name)s_tmp);
         if (err != GA_NO_ERROR) {
             PyErr_Format(PyExc_RuntimeError, "Could not write data to gpu");
@@ -433,7 +437,7 @@ class GpuFromHost(Op):
                'out': outputs[0], 'fail': sub['fail']}

     def c_code_cache_version(self):
-        return (8,)
+        return (9,)

 class GpuToGpu(Op):
...
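These C snippets live inside Python strings returned by each op's `c_code` method. A minimal sketch of the pattern (hypothetical op with made-up names, not the commit's code): wrap only the blocking device copy in the macros, make no Python C-API calls while the GIL is released, and keep in mind that `Py_BEGIN_ALLOW_THREADS` expands to an opening brace, so any variable needed after `Py_END_ALLOW_THREADS` is best declared before the macro.

```python
# Hypothetical op sketch, not from the commit. `dev2host`, `host_buf` and
# `nbytes` are made-up names standing in for GpuArray_read and the real
# buffers; the structure mirrors the HostFromGpu change above.
class HypotheticalTransferOp:
    def c_code(self, node, name, inputs, outputs, sub):
        return """
        int err;                 /* declared outside the macro's C block */
        Py_BEGIN_ALLOW_THREADS   /* GIL released: other Python threads run */
        err = dev2host(host_buf, nbytes);  /* blocking copy, no C-API use */
        Py_END_ALLOW_THREADS     /* GIL re-acquired before touching Python */
        if (err != 0) {
            PyErr_SetString(PyExc_RuntimeError, "Could not read device data.");
            %(fail)s
        }
        """ % sub

    def c_code_cache_version(self):
        # Any change to the generated C needs a new version so the
        # cached compiled module is rebuilt.
        return (1,)
```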