提交 583a7553 authored 作者: Frederic's avatar Frederic

Parallelized convolution with OpenMP.

上级 eb894aae
...@@ -54,6 +54,9 @@ Speed up ...@@ -54,6 +54,9 @@ Speed up
it faster in some cases (especially medium/big output image) (Frédéric B.) it faster in some cases (especially medium/big output image) (Frédéric B.)
(We hardcoded 512 as the maximum number of thread per block. Newer card (We hardcoded 512 as the maximum number of thread per block. Newer card
support up to 1024 threads per block. support up to 1024 threads per block.
* CPU convolutions are now parallelized with OpenMP (Frédéric B.)
By default, all cores/hyper-threads are used.
To control the number of threads, set the OMP_NUM_THREADS=N environment variable.
New Features New Features
* debugprint new param ids=["CHAR", "id", "int", ""] * debugprint new param ids=["CHAR", "id", "int", ""]
......
...@@ -29,6 +29,8 @@ instructions below for detailed installation steps): ...@@ -29,6 +29,8 @@ instructions below for detailed installation steps):
Not technically required but *highly* recommended, in order to compile Not technically required but *highly* recommended, in order to compile
generated C code. Theano `can` fall back on a NumPy-based Python execution generated C code. Theano `can` fall back on a NumPy-based Python execution
model, but a C compiler allows for vastly faster execution. model, but a C compiler allows for vastly faster execution.
g++ >= 4.2 (required for OpenMP, which is currently always used);
a more recent version is recommended.
`NumPy <http://numpy.scipy.org/>`_ >= 1.3.0 `NumPy <http://numpy.scipy.org/>`_ >= 1.3.0
Earlier versions have memory leaks. Earlier versions have memory leaks.
......
...@@ -840,10 +840,10 @@ class ConvOp(Op): ...@@ -840,10 +840,10 @@ class ConvOp(Op):
return [din, dw] return [din, dw]
def c_headers(self): def c_headers(self):
return ['<numpy/noprefix.h>', '<iostream>', '<sstream>' ] return ['<numpy/noprefix.h>', '<iostream>', '<sstream>', '<omp.h>' ]
def c_code_cache_version(self): def c_code_cache_version(self):
return (6) return (7)
def c_support_code(self): def c_support_code(self):
return """ return """
...@@ -1936,8 +1936,15 @@ if (%(z)s->strides[2] != %(z)s->dimensions[3] * sizeof(%(type)s)) %(fail)s; ...@@ -1936,8 +1936,15 @@ if (%(z)s->strides[2] != %(z)s->dimensions[3] * sizeof(%(type)s)) %(fail)s;
if (%(z)s->strides[3] != sizeof(%(type)s)) %(fail)s; if (%(z)s->strides[3] != sizeof(%(type)s)) %(fail)s;
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for(int b=0;b< %(self_bsize)s;b++){ // We merge the 2 loop into one to make it easier to parallelize on both
for(int n_kern=0;n_kern<%(self_nkern)s;n_kern++){ // This is the equivalent of those 2 lines.
//for(int b=0;b< %(self_bsize)s;b++){
// for(int n_kern=0;n_kern<%(self_nkern)s;n_kern++){
for(int batch_kern_idx=0;
batch_kern_idx < %(self_bsize)s * %(self_nkern)s;
batch_kern_idx++){
int b = batch_kern_idx / %(self_nkern)s;
int n_kern = batch_kern_idx %% %(self_nkern)s;
%(type)s * __restrict__ out=(%(type)s *)(PyArray_GETPTR2(%(z)s,b,n_kern)); %(type)s * __restrict__ out=(%(type)s *)(PyArray_GETPTR2(%(z)s,b,n_kern));
for (int i = 0; i < dim_zz[0]*dim_zz[1]; ++i) out[i] = 0; for (int i = 0; i < dim_zz[0]*dim_zz[1]; ++i) out[i] = 0;
...@@ -2062,8 +2069,8 @@ for(int b=0;b< %(self_bsize)s;b++){ ...@@ -2062,8 +2069,8 @@ for(int b=0;b< %(self_bsize)s;b++){
}//for iter_n }//for iter_n
}//for iter_m }//for iter_m
}//for stack_size }//for stack_size
}//for n_kern }//for b and n_kern
}//for b
Py_XDECREF(img2d); Py_XDECREF(img2d);
Py_XDECREF(filtersflipped); Py_XDECREF(filtersflipped);
""" """
...@@ -25,7 +25,8 @@ class TestConv2D(unittest.TestCase): ...@@ -25,7 +25,8 @@ class TestConv2D(unittest.TestCase):
N_image_shape=None, N_filter_shape=None, N_image_shape=None, N_filter_shape=None,
input=None, filters=None, input=None, filters=None,
unroll_batch=None, unroll_kern=None, unroll_patch=None, unroll_batch=None, unroll_kern=None, unroll_patch=None,
verify_grad=True, should_raise=False): verify_grad=True, should_raise=False,
speed_only=False):
if N_image_shape is None: if N_image_shape is None:
N_image_shape = [T.get_constant_value(T. N_image_shape = [T.get_constant_value(T.
...@@ -64,6 +65,8 @@ class TestConv2D(unittest.TestCase): ...@@ -64,6 +65,8 @@ class TestConv2D(unittest.TestCase):
if should_raise: if should_raise:
raise Exception( raise Exception(
"ConvOp should have generated an error") "ConvOp should have generated an error")
if speed_only:
return
############# REFERENCE IMPLEMENTATION ############ ############# REFERENCE IMPLEMENTATION ############
s = 1. s = 1.
...@@ -368,3 +371,16 @@ class TestConv2D(unittest.TestCase): ...@@ -368,3 +371,16 @@ class TestConv2D(unittest.TestCase):
""" """
self.validate((1, 10, 213, 129), (46, 10, 212, 1), 'valid', self.validate((1, 10, 213, 129), (46, 10, 212, 1), 'valid',
verify_grad=False) verify_grad=False)
self.validate((1, 10, 213, 129), (46, 10, 212, 1), 'valid', verify_grad=False)
def speed(self):
self.validate((10, 10, 16, 16), (5, 10, 8, 8), 'valid',
verify_grad=False,
unroll_patch=True, speed_only=True)
"""
shape: (10, 10, 16, 16), (5, 10, 8, 8)
num threads 1 2 4
// kern 5.54e-03s 3.12e-03s 1.99e-03s
// batch 4.22e-03s 1.59e-03s 1.25e-03s
// kern_batch3-5-03s 2.51e-03s 9.15e-04s
"""
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论