提交 583a7553 authored 作者: Frederic's avatar Frederic

Parallelized convolution with OpenMP.

上级 eb894aae
......@@ -54,6 +54,9 @@ Speed up
it faster in some cases (especially medium/big output image) (Frédéric B.)
(We hardcoded 512 as the maximum number of threads per block. Newer cards
support up to 1024 threads per block.)
* CPU convolutions are now parallelized (Frédéric B.)
By default, all cores/hyper-threads are used.
To control this, set the OMP_NUM_THREADS=N environment variable.
New Features
* debugprint new param ids=["CHAR", "id", "int", ""]
......
......@@ -29,6 +29,8 @@ instructions below for detailed installation steps):
Not technically required but *highly* recommended, in order to compile
generated C code. Theano `can` fall back on a NumPy-based Python execution
model, but a C compiler allows for vastly faster execution.
g++ >= 4.2 (for OpenMP, which is currently always used)
A more recent version is recommended!
`NumPy <http://numpy.scipy.org/>`_ >= 1.3.0
Earlier versions have memory leaks.
......
......@@ -840,10 +840,10 @@ class ConvOp(Op):
return [din, dw]
def c_headers(self):
return ['<numpy/noprefix.h>', '<iostream>', '<sstream>' ]
return ['<numpy/noprefix.h>', '<iostream>', '<sstream>', '<omp.h>' ]
def c_code_cache_version(self):
return (6)
return (7)
def c_support_code(self):
return """
......@@ -1936,8 +1936,15 @@ if (%(z)s->strides[2] != %(z)s->dimensions[3] * sizeof(%(type)s)) %(fail)s;
if (%(z)s->strides[3] != sizeof(%(type)s)) %(fail)s;
#pragma omp parallel for schedule(static)
for(int b=0;b< %(self_bsize)s;b++){
for(int n_kern=0;n_kern<%(self_nkern)s;n_kern++){
// We merge the 2 loops into one to make it easier to parallelize over both.
// This is equivalent to the following 2 loops:
//for(int b=0;b< %(self_bsize)s;b++){
// for(int n_kern=0;n_kern<%(self_nkern)s;n_kern++){
for(int batch_kern_idx=0;
batch_kern_idx < %(self_bsize)s * %(self_nkern)s;
batch_kern_idx++){
int b = batch_kern_idx / %(self_nkern)s;
int n_kern = batch_kern_idx %% %(self_nkern)s;
%(type)s * __restrict__ out=(%(type)s *)(PyArray_GETPTR2(%(z)s,b,n_kern));
for (int i = 0; i < dim_zz[0]*dim_zz[1]; ++i) out[i] = 0;
......@@ -2062,8 +2069,8 @@ for(int b=0;b< %(self_bsize)s;b++){
}//for iter_n
}//for iter_m
}//for stack_size
}//for n_kern
}//for b
}//for b and n_kern
Py_XDECREF(img2d);
Py_XDECREF(filtersflipped);
"""
......@@ -25,7 +25,8 @@ class TestConv2D(unittest.TestCase):
N_image_shape=None, N_filter_shape=None,
input=None, filters=None,
unroll_batch=None, unroll_kern=None, unroll_patch=None,
verify_grad=True, should_raise=False):
verify_grad=True, should_raise=False,
speed_only=False):
if N_image_shape is None:
N_image_shape = [T.get_constant_value(T.
......@@ -64,6 +65,8 @@ class TestConv2D(unittest.TestCase):
if should_raise:
raise Exception(
"ConvOp should have generated an error")
if speed_only:
return
############# REFERENCE IMPLEMENTATION ############
s = 1.
......@@ -368,3 +371,16 @@ class TestConv2D(unittest.TestCase):
"""
self.validate((1, 10, 213, 129), (46, 10, 212, 1), 'valid',
verify_grad=False)
self.validate((1, 10, 213, 129), (46, 10, 212, 1), 'valid', verify_grad=False)
def speed(self):
self.validate((10, 10, 16, 16), (5, 10, 8, 8), 'valid',
verify_grad=False,
unroll_patch=True, speed_only=True)
"""
shape: (10, 10, 16, 16), (5, 10, 8, 8)
num threads 1 2 4
// kern 5.54e-03s 3.12e-03s 1.99e-03s
// batch 4.22e-03s 1.59e-03s 1.25e-03s
// kern_batch3-5-03s 2.51e-03s 9.15e-04s
"""
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论