Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
2f4e666c
提交
2f4e666c
authored
2月 11, 2014
作者:
Frédéric Bastien
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #1690 from AlOa/elemwise_openmp
Elemwise openmp
上级
8e9ebc8f
1de5723a
隐藏空白字符变更
内嵌
并排
正在显示
9 个修改的文件
包含
320 行增加
和
58 行删除
+320
-58
config.txt
doc/library/config.txt
+7
-0
index.txt
doc/tutorial/index.txt
+1
-0
multi_cores.txt
doc/tutorial/multi_cores.txt
+31
-0
configdefaults.py
theano/configdefaults.py
+8
-0
elemwise_openmp_speedup.py
theano/misc/elemwise_openmp_speedup.py
+54
-0
elemwise_time_test.py
theano/misc/elemwise_time_test.py
+59
-0
elemwise.py
theano/tensor/elemwise.py
+48
-23
elemwise_cgen.py
theano/tensor/elemwise_cgen.py
+106
-34
test_elemwise.py
theano/tensor/tests/test_elemwise.py
+6
-1
没有找到文件。
doc/library/config.txt
浏览文件 @
2f4e666c
...
@@ -211,6 +211,13 @@ import theano and print the config variable, as in:
...
@@ -211,6 +211,13 @@ import theano and print the config variable, as in:
The best is to define it via Theano configuration
The best is to define it via Theano configuration
file or with the environment variable THEANO_FLAGS.
file or with the environment variable THEANO_FLAGS.
.. attribute:: openmp_elemwise_minsize
Positive int value, default: 200000.
This specifies the minimum vector size for which elemwise ops
use openmp, if openmp is enabled.
.. attribute:: cast_policy
.. attribute:: cast_policy
String value: either 'numpy+floatX' or 'custom'
String value: either 'numpy+floatX' or 'custom'
...
...
doc/tutorial/index.txt
浏览文件 @
2f4e666c
...
@@ -45,3 +45,4 @@ you out.
...
@@ -45,3 +45,4 @@ you out.
extending_theano
extending_theano
faq
faq
python-memory-management
python-memory-management
multi_cores
doc/tutorial/multi_cores.txt
0 → 100644
浏览文件 @
2f4e666c
=============================
Multi cores support in Theano
=============================
Parallel element wise op with openmp
====================================
Because element wise ops work on every tensor entry independently, they can be
easily parallelized using openmp.
To use openmp you must set the openmp flag in Theano configuration.
You can use the flag openmp_elemwise_minsize to set the minimum tensor size
for which the operation is parallelized, because for short tensors using openmp
can slow down the operation.
If it is not specified, the default value (200000) is used.
For simple (fast) operations you can obtain a speed up for very long tensors,
while for more complex operations you can obtain a good speed up even for
tensors that are not too long.
There is a script (elemwise_openmp_speedup.py in theano/misc/) which you can
use to choose that value for your machine.
The script runs two elemwise operations (a fast one and a slow one) on a vector of
size openmp_elemwise_minsize, with and without openmp, and shows the time
difference between the two cases.
theano/configdefaults.py
浏览文件 @
2f4e666c
...
@@ -475,3 +475,11 @@ AddConfigVar('openmp',
...
@@ -475,3 +475,11 @@ AddConfigVar('openmp',
BoolParam
(
default_openmp
),
BoolParam
(
default_openmp
),
in_c_key
=
False
,
in_c_key
=
False
,
)
)
# Size threshold that the generated elemwise C code puts in the ``if(...)``
# clause of its "#pragma omp parallel for".
# The three literals below are concatenated into one help string, so each
# piece must carry its own trailing separator.
AddConfigVar('openmp_elemwise_minsize',
             "If OpenMP is enabled, this is the minimum size of vector "
             "for which the openmp parallel for is enabled. "
             "Used in element wise ops.",
             IntParam(200000),
             in_c_key=False,
             )
theano/misc/elemwise_openmp_speedup.py
0 → 100644
浏览文件 @
2f4e666c
import
os
import
subprocess
import
sys
from
optparse
import
OptionParser
import
theano
# Command-line interface of the speed-up script: a single integer option
# giving the vector length, defaulting to Theano's configured threshold.
parser = OptionParser(
    usage='%prog <options>\nCompute time for fast and slow elemwise operations')
parser.add_option(
    '-N', '--N',
    dest='N',
    action='store',
    type="int",
    default=theano.config.openmp_elemwise_minsize,
    help="Number of vector element",
)
def runScript(N):
    """Run ``elemwise_time_test.py --script -N <N>`` and parse its timings.

    The child process is started in the directory containing this file and
    inherits the current environment (including ``THEANO_FLAGS``).

    :param N: vector length forwarded to the child via ``-N``.
    :returns: list of floats parsed from the whitespace-separated numbers the
        child wrote on stdout.

    If the child writes anything to stderr or ends with a non-zero return
    code, the captured stderr is echoed and the process exits with status 1.
    """
    script = 'elemwise_time_test.py'
    # ``script_dir`` rather than ``dir`` so the builtin is not shadowed.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    # Use the interpreter running this script so the child sees the same
    # installation; fall back to 'python' if it cannot be determined.
    interpreter = sys.executable or 'python'
    proc = subprocess.Popen([interpreter, script, '--script', '-N', str(N)],
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE,
                            cwd=script_dir)
    (out, err) = proc.communicate()
    if err or proc.returncode != 0:
        if not isinstance(err, str):
            # Pipes yield bytes on Python 3.
            err = err.decode('utf-8', 'replace')
        sys.stderr.write(err)
        sys.exit(1)
    # split() without argument works for both str and bytes output and
    # ignores the trailing newline.
    return [float(token) for token in out.split()]
def _speed_ratio(time_without, time_with):
    """Compare a timing without openmp against the one with openmp.

    Returns ``(ratio, label)``: ``label`` is "speedup" when the openmp run
    was strictly faster and "slowdown" otherwise; ``ratio`` is the larger
    time divided by the smaller one.
    """
    if time_without > time_with:
        slower, faster, label = time_without, time_with, "speedup"
    else:
        slower, faster, label = time_with, time_without, "slowdown"
    if faster == 0:
        # A measured time of exactly 0 would otherwise raise
        # ZeroDivisionError.
        ratio = 1.0 if slower == 0 else float('inf')
    else:
        ratio = slower / faster
    return ratio, label


if __name__ == '__main__':
    # optparse handles -h/--help itself and never stores a ``help``
    # attribute on ``options``, so no explicit help check is needed here.
    options, arguments = parser.parse_args(sys.argv)

    had_flags = 'THEANO_FLAGS' in os.environ
    orig_flags = os.environ.get('THEANO_FLAGS', '')
    try:
        # The child process started by runScript inherits os.environ, so the
        # openmp switch is passed through THEANO_FLAGS.
        os.environ['THEANO_FLAGS'] = orig_flags + ',openmp=false'
        (cheapTime, costlyTime) = runScript(N=options.N)
        os.environ['THEANO_FLAGS'] = orig_flags + ',openmp=true'
        (cheapTimeOpenmp, costlyTimeOpenmp) = runScript(N=options.N)
    finally:
        # Leave the environment as it was found.
        if had_flags:
            os.environ['THEANO_FLAGS'] = orig_flags
        else:
            os.environ.pop('THEANO_FLAGS', None)

    cheapSpeed, cheapSpeedstring = _speed_ratio(cheapTime, cheapTimeOpenmp)
    costlySpeed, costlySpeedstring = _speed_ratio(costlyTime,
                                                  costlyTimeOpenmp)

    print("Fast op time without openmp %fs with openmp %fs %s %2.2f" % (
        cheapTime, cheapTimeOpenmp, cheapSpeedstring, cheapSpeed))
    print("Slow op time without openmp %fs with openmp %fs %s %2.2f" % (
        costlyTime, costlyTimeOpenmp, costlySpeedstring, costlySpeed))
theano/misc/elemwise_time_test.py
0 → 100644
浏览文件 @
2f4e666c
from
optparse
import
OptionParser
import
sys
import
time
import
numpy
as
np
import
theano
import
theano.tensor
as
T
# Command-line interface of the timing script: the vector length and a
# switch selecting the machine-readable output mode.
parser = OptionParser(
    usage='%prog <options>\nCompute time for fast and slow elemwise operations')
parser.add_option(
    '-N', '--N',
    dest='N',
    action='store',
    type="int",
    default=theano.config.openmp_elemwise_minsize,
    help="Number of vector element",
)
parser.add_option(
    '--script',
    dest='script',
    action='store_true',
    default=False,
    help="Run program as script and print results on stdoutput",
)
def evalTime(f, v, script=False, loops=1000):
    """Return the shortest time taken by one of ``loops`` calls to ``f(v)``.

    :param f: callable invoked as ``f(v)``; its return value is ignored.
    :param v: the single argument passed to ``f`` on every call.
    :param script: when false, a summary line is printed; when true nothing
        is printed.
    :param loops: number of timed calls.
    :returns: the minimum elapsed time in seconds, measured with
        ``time.time()``; the starting value ``1e10`` if no call was made.
    """
    # ``best`` rather than ``min`` so the builtin is not shadowed.
    best = 1e10
    for _ in range(loops):
        t0 = time.time()
        f(v)
        dt = time.time() - t0
        if dt < best:
            best = dt
    if not script:
        print(' run time in %d loops was %2.9f sec' % (loops, best))
    return best
def ElemwiseOpTime(N, script=False, loops=1000):
    """Time a cheap and a costly elemwise Theano function on one vector.

    A vector of ``N`` random values (fixed seed 1235, cast to
    ``theano.config.floatX``) is fed to two compiled functions:
    ``2 * x + x * x`` (the "fast" op) and ``tanh(x)`` (the "slow" op).

    :param N: number of elements of the test vector.
    :param script: forwarded to ``evalTime``; when false, labels are also
        printed before each measurement.
    :param loops: forwarded to ``evalTime``.
    :returns: tuple ``(fast_time, slow_time)`` as returned by ``evalTime``.
    """
    x = T.vector('x')
    np.random.seed(1235)
    v = np.random.random(N).astype(theano.config.floatX)
    f = theano.function([x], 2 * x + x * x)
    f1 = theano.function([x], T.tanh(x))
    if not script:
        if theano.config.openmp:
            print("With openmp:")
        # Label and separator are written without a newline so that the line
        # printed by evalTime continues on the same line.
        sys.stdout.write("Fast op  ")
    cheapTime = evalTime(f, v, script=script, loops=loops)
    if not script:
        sys.stdout.write("Slow op  ")
    costlyTime = evalTime(f1, v, script=script, loops=loops)
    return (cheapTime, costlyTime)
if __name__ == '__main__':
    # optparse handles -h/--help itself and never stores a ``help``
    # attribute on ``options``, so no explicit help check is needed here.
    options, arguments = parser.parse_args(sys.argv)

    (cheapTime, costlyTime) = ElemwiseOpTime(N=options.N,
                                             script=options.script)

    if options.script:
        # Machine-readable output: the two timings separated by one space.
        sys.stdout.write("%2.9f %2.9f\n" % (cheapTime, costlyTime))
        sys.stdout.flush()
theano/tensor/elemwise.py
浏览文件 @
2f4e666c
...
@@ -6,7 +6,7 @@ import numpy
...
@@ -6,7 +6,7 @@ import numpy
import
theano
import
theano
from
theano
import
gof
from
theano
import
gof
from
theano.gof
import
Apply
,
Op
from
theano.gof
import
Apply
,
Op
,
OpenMPOp
from
theano
import
scalar
from
theano
import
scalar
from
theano.scalar
import
Scalar
,
get_scalar_type
from
theano.scalar
import
Scalar
,
get_scalar_type
from
theano.printing
import
pprint
from
theano.printing
import
pprint
...
@@ -419,7 +419,7 @@ pprint.assign(lambda pstate, r: r.owner and isinstance(r.owner.op, DimShuffle),
...
@@ -419,7 +419,7 @@ pprint.assign(lambda pstate, r: r.owner and isinstance(r.owner.op, DimShuffle),
### Elemwise ###
### Elemwise ###
################
################
class
Elemwise
(
Op
):
class
Elemwise
(
Op
enMPOp
):
"""
"""
Generalizes a scalar op to tensors.
Generalizes a scalar op to tensors.
...
@@ -449,7 +449,7 @@ class Elemwise(Op):
...
@@ -449,7 +449,7 @@ class Elemwise(Op):
"""
"""
def
__init__
(
self
,
scalar_op
,
inplace_pattern
=
None
,
name
=
None
,
def
__init__
(
self
,
scalar_op
,
inplace_pattern
=
None
,
name
=
None
,
nfunc_spec
=
None
):
nfunc_spec
=
None
,
openmp
=
None
):
"""
"""
Usage: Elemwise(scalar_op, inplace_pattern = {})
Usage: Elemwise(scalar_op, inplace_pattern = {})
...
@@ -487,6 +487,7 @@ class Elemwise(Op):
...
@@ -487,6 +487,7 @@ class Elemwise(Op):
#precompute the hash of this node
#precompute the hash of this node
self
.
_rehash
()
self
.
_rehash
()
super
(
Elemwise
,
self
)
.
__init__
(
openmp
=
openmp
)
def
__getstate__
(
self
):
def
__getstate__
(
self
):
d
=
copy
(
self
.
__dict__
)
d
=
copy
(
self
.
__dict__
)
...
@@ -1028,14 +1029,6 @@ class Elemwise(Op):
...
@@ -1028,14 +1029,6 @@ class Elemwise(Op):
# which is allocated, OR, if there are any aliased outputs,
# which is allocated, OR, if there are any aliased outputs,
# the index of the last of these aliased outputs.
# the index of the last of these aliased outputs.
# We declare the scalar variables used in the inner loop to do
# the element-wise computation. Aliased scalar variables need
# not be declared, as they are #defined in defines
task_decl
=
""
.
join
([
"
%
s&
%
s_i = *
%
s_iter;
\n
"
%
(
dtype
,
name
,
name
)
for
name
,
dtype
in
izip
(
inames
+
list
(
real_onames
),
idtypes
+
list
(
real_odtypes
))])
# We generate the C code of the inner loop using the scalar op
# We generate the C code of the inner loop using the scalar op
task_code
=
self
.
scalar_op
.
c_code
(
task_code
=
self
.
scalar_op
.
c_code
(
Apply
(
self
.
scalar_op
,
Apply
(
self
.
scalar_op
,
...
@@ -1050,11 +1043,13 @@ class Elemwise(Op):
...
@@ -1050,11 +1043,13 @@ class Elemwise(Op):
code
=
"""
code
=
"""
{
{
%(defines)
s
%(defines)
s
%(task_decl)
s
%(task_code)
s
%(task_code)
s
%(undefs)
s
%(undefs)
s
}
}
"""
%
locals
()
"""
%
locals
()
loop_orders
=
orders
+
[
range
(
nnested
)]
*
len
(
real_onames
)
dtypes
=
(
idtypes
+
list
(
real_odtypes
))
if
all
([
o
.
ndim
<=
1
for
o
in
node
.
outputs
]
or
if
all
([
o
.
ndim
<=
1
for
o
in
node
.
outputs
]
or
# Use simpler code when output ndim == 0 or 1
# Use simpler code when output ndim == 0 or 1
# or for broadcated scalar.
# or for broadcated scalar.
...
@@ -1063,19 +1058,47 @@ class Elemwise(Op):
...
@@ -1063,19 +1058,47 @@ class Elemwise(Op):
all_code
=
[(
""
,
""
)]
*
(
nnested
-
1
)
+
[(
""
,
code
)]
+
[
""
]
all_code
=
[(
""
,
""
)]
*
(
nnested
-
1
)
+
[(
""
,
code
)]
+
[
""
]
else
:
else
:
all_code
=
[
code
]
all_code
=
[
code
]
if
len
(
all_code
)
==
1
:
loop
=
cgen
.
make_loop
(
#No loops
loop_orders
=
orders
+
[
range
(
nnested
)]
*
len
(
real_onames
),
task_decl
=
""
.
join
([
dtypes
=
(
idtypes
+
list
(
real_odtypes
)),
"
%
s&
%
s_i = *
%
s_iter;
\n
"
%
(
dtype
,
name
,
name
)
loop_tasks
=
all_code
,
for
name
,
dtype
in
izip
(
inames
+
list
(
real_onames
),
sub
=
sub
)
idtypes
+
list
(
real_odtypes
))])
preloops
=
{}
for
i
,
(
loop_order
,
dtype
)
in
enumerate
(
zip
(
loop_orders
,
dtypes
)):
for
j
,
index
in
enumerate
(
loop_order
):
if
index
!=
'x'
:
preloops
.
setdefault
(
j
,
""
)
preloops
[
j
]
+=
(
"
%%
(lv
%(i)
s)s_iter = (
%(dtype)
s*)(PyArray_DATA(
%%
(lv
%(i)
s)s));
\n
"
%
locals
())
%
sub
break
else
:
# all broadcastable
preloops
.
setdefault
(
0
,
""
)
preloops
[
0
]
+=
(
"
%%
(lv
%(i)
s)s_iter = (
%(dtype)
s*)(PyArray_DATA(
%%
(lv
%(i)
s)s));
\n
"
%
locals
())
%
sub
init_array
=
preloops
.
get
(
0
,
" "
)
loop
=
"""
{
%(defines)
s
%(init_array)
s
%(task_decl)
s
%(task_code)
s
%(undefs)
s
}
"""
%
locals
()
else
:
loop
=
cgen
.
make_loop
(
loop_orders
=
loop_orders
,
dtypes
=
dtypes
,
loop_tasks
=
all_code
,
sub
=
sub
,
openmp
=
self
.
openmp
)
else
:
else
:
loop
=
cgen
.
make_reordered_loop
(
loop
=
cgen
.
make_reordered_loop
(
init_loop_orders
=
orders
+
[
range
(
nnested
)]
*
len
(
real_onames
)
,
init_loop_orders
=
loop_orders
,
olv_index
=
olv_index
,
olv_index
=
olv_index
,
dtypes
=
(
idtypes
+
list
(
real_odtypes
))
,
dtypes
=
dtypes
,
inner_task
=
code
,
inner_task
=
code
,
sub
=
sub
)
sub
=
sub
,
openmp
=
self
.
openmp
)
# If all inputs and outputs are contiguous
# If all inputs and outputs are contiguous
# and the scalar op define optimized code for that case
# and the scalar op define optimized code for that case
...
@@ -1117,7 +1140,8 @@ class Elemwise(Op):
...
@@ -1117,7 +1140,8 @@ class Elemwise(Op):
contig
+=
"""
contig
+=
"""
dtype_
%(x)
s&
%(x)
s_i = ((dtype_
%(x)
s*) PyArray_DATA(
%(x)
s))[0];
dtype_
%(x)
s&
%(x)
s_i = ((dtype_
%(x)
s*) PyArray_DATA(
%(x)
s))[0];
"""
%
locals
()
"""
%
locals
()
if
self
.
openmp
:
contig
+=
"""#pragma omp parallel for if(n>=
%
d)"""
%
(
config
.
openmp_elemwise_minsize
)
contig
+=
"""
contig
+=
"""
for(int i=0; i<n; i++){
for(int i=0; i<n; i++){
%(index)
s
%(index)
s
...
@@ -1166,6 +1190,7 @@ class Elemwise(Op):
...
@@ -1166,6 +1190,7 @@ class Elemwise(Op):
version
.
append
(
self
.
scalar_op
.
c_code_cache_version_apply
(
scalar_node
))
version
.
append
(
self
.
scalar_op
.
c_code_cache_version_apply
(
scalar_node
))
for
i
in
node
.
inputs
+
node
.
outputs
:
for
i
in
node
.
inputs
+
node
.
outputs
:
version
.
append
(
get_scalar_type
(
dtype
=
i
.
type
.
dtype
)
.
c_code_cache_version
())
version
.
append
(
get_scalar_type
(
dtype
=
i
.
type
.
dtype
)
.
c_code_cache_version
())
version
.
append
((
'openmp'
,
self
.
openmp
))
if
all
(
version
):
if
all
(
version
):
return
tuple
(
version
)
return
tuple
(
version
)
else
:
else
:
...
@@ -1557,7 +1582,7 @@ for(int i=0;i<PyArray_NDIM(%(iname)s);i++){
...
@@ -1557,7 +1582,7 @@ for(int i=0;i<PyArray_NDIM(%(iname)s);i++){
+
[(
""
,
code1
),
""
])
+
[(
""
,
code1
),
""
])
else
:
else
:
all_code
=
[
task0_decl
+
code1
]
all_code
=
[
task0_decl
+
code1
]
loop
=
cgen
.
make_loop
(
loop
=
cgen
.
make_loop
_careduce
(
[
order
,
range
(
nnested
)
+
[
'x'
]
*
len
(
axis
)],
[
order
,
range
(
nnested
)
+
[
'x'
]
*
len
(
axis
)],
[
idtype
,
adtype
],
all_code
,
sub
)
[
idtype
,
adtype
],
all_code
,
sub
)
...
...
theano/tensor/elemwise_cgen.py
浏览文件 @
2f4e666c
import
theano
def
make_declare
(
loop_orders
,
dtypes
,
sub
):
def
make_declare
(
loop_orders
,
dtypes
,
sub
):
...
@@ -170,8 +171,7 @@ def make_alloc(loop_orders, dtype, sub, fortran='0'):
...
@@ -170,8 +171,7 @@ def make_alloc(loop_orders, dtype, sub, fortran='0'):
}
}
"""
%
dict
(
locals
(),
**
sub
)
"""
%
dict
(
locals
(),
**
sub
)
def
make_loop
(
loop_orders
,
dtypes
,
loop_tasks
,
sub
,
openmp
=
None
):
def
make_loop
(
loop_orders
,
dtypes
,
loop_tasks
,
sub
):
"""
"""
Make a nested loop over several arrays and associate specific code
Make a nested loop over several arrays and associate specific code
to each level of nesting.
to each level of nesting.
...
@@ -195,22 +195,29 @@ def make_loop(loop_orders, dtypes, loop_tasks, sub):
...
@@ -195,22 +195,29 @@ def make_loop(loop_orders, dtypes, loop_tasks, sub):
@type sub: a dictionary.
@type sub: a dictionary.
@param sub: Maps 'lv#' to a suitable variable name.
@param sub: Maps 'lv#' to a suitable variable name.
The 'lvi' variable corresponds to the ith element of loop_orders.
The 'lvi' variable corresponds to the ith element of loop_orders.
"""
"""
def
loop_over
(
preloop
,
code
,
indices
,
i
):
def
loop_over
(
preloop
,
code
,
indices
,
i
):
iterv
=
'ITER_
%
i'
%
i
iterv
=
'ITER_
%
i'
%
i
update
=
""
update
=
""
suitable_n
=
"1"
suitable_n
=
"1"
for
j
,
index
in
enumerate
(
indices
):
for
j
,
index
in
enumerate
(
indices
):
var
=
sub
[
'lv
%
i'
%
j
]
var
=
sub
[
'lv
%
i'
%
j
]
update
+=
"
%(var)
s_iter +=
%(var)
s_jump
%(index)
s_
%(i)
s;
\n
"
%
locals
()
dtype
=
dtypes
[
j
]
update
+=
"
%(dtype)
s &
%(var)
s_i = * (
%(var)
s_iter +
%(iterv)
s *
%(var)
s_jump
%(index)
s_
%(i)
s );
\n
"
%
locals
()
if
index
!=
'x'
:
if
index
!=
'x'
:
suitable_n
=
"
%(var)
s_n
%(index)
s"
%
locals
()
suitable_n
=
"
%(var)
s_n
%(index)
s"
%
locals
()
return
"""
if
openmp
:
openmp_elemwise_minsize
=
theano
.
config
.
openmp_elemwise_minsize
forloop
=
"""#pragma omp parallel for if(
%(suitable_n)
s >=
%(openmp_elemwise_minsize)
s)
\n
"""
%
locals
()
else
:
forloop
=
""
forloop
+=
"""for (int
%(iterv)
s = 0;
%(iterv)
s<
%(suitable_n)
s;
%(iterv)
s++)"""
%
locals
()
return
"""
%(preloop)
s
%(preloop)
s
for (int
%(iterv)
s =
%(suitable_n)
s;
%(iterv)
s;
%(iterv)
s--) {
%(forloop)
s {
%(code)
s
%(update)
s
%(update)
s
%(code)
s
}
}
"""
%
locals
()
"""
%
locals
()
...
@@ -225,18 +232,16 @@ def make_loop(loop_orders, dtypes, loop_tasks, sub):
...
@@ -225,18 +232,16 @@ def make_loop(loop_orders, dtypes, loop_tasks, sub):
preloops
.
setdefault
(
0
,
""
)
preloops
.
setdefault
(
0
,
""
)
preloops
[
0
]
+=
(
"
%%
(lv
%(i)
s)s_iter = (
%(dtype)
s*)(PyArray_DATA(
%%
(lv
%(i)
s)s));
\n
"
%
locals
())
%
sub
preloops
[
0
]
+=
(
"
%%
(lv
%(i)
s)s_iter = (
%(dtype)
s*)(PyArray_DATA(
%%
(lv
%(i)
s)s));
\n
"
%
locals
())
%
sub
if
len
(
loop_tasks
)
==
1
:
s
=
""
s
=
preloops
.
get
(
0
,
""
)
else
:
for
i
,
(
pre_task
,
task
),
indices
in
reversed
(
zip
(
xrange
(
len
(
loop_tasks
)
-
1
),
loop_tasks
,
zip
(
*
loop_orders
))):
s
=
""
for
i
,
(
pre_task
,
task
),
indices
in
reversed
(
zip
(
xrange
(
len
(
loop_tasks
)
-
1
),
loop_tasks
,
zip
(
*
loop_orders
))):
s
=
loop_over
(
preloops
.
get
(
i
,
""
)
+
pre_task
,
s
+
task
,
indices
,
i
)
s
=
loop_over
(
preloops
.
get
(
i
,
""
)
+
pre_task
,
s
+
task
,
indices
,
i
)
s
+=
loop_tasks
[
-
1
]
s
+=
loop_tasks
[
-
1
]
return
"{
%
s}"
%
s
return
"{
%
s}"
%
s
def
make_reordered_loop
(
init_loop_orders
,
olv_index
,
dtypes
,
inner_task
,
sub
):
def
make_reordered_loop
(
init_loop_orders
,
olv_index
,
dtypes
,
inner_task
,
sub
,
openmp
=
None
):
'''A bit like make_loop, but when only the inner-most loop executes code.
'''A bit like make_loop, but when only the inner-most loop executes code.
All the loops will be reordered so that the loops over the output tensor
All the loops will be reordered so that the loops over the output tensor
...
@@ -325,7 +330,7 @@ def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub):
...
@@ -325,7 +330,7 @@ def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub):
++
%(ovar)
s_loops_it;
++
%(ovar)
s_loops_it;
"""
%
locals
()
"""
%
locals
()
## Get sorted strides
and jumps
## Get sorted strides
# Get strides in the initial order
# Get strides in the initial order
def
get_loop_strides
(
loop_order
,
i
):
def
get_loop_strides
(
loop_order
,
i
):
"""
"""
...
@@ -344,7 +349,7 @@ def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub):
...
@@ -344,7 +349,7 @@ def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub):
return
r
return
r
# We declare the initial strides as a 2D array, nvars x nnested
# We declare the initial strides as a 2D array, nvars x nnested
declare_strides
_jumps
=
"""
declare_strides
=
"""
int init_strides[
%(nvars)
i][
%(nnested)
i] = {
int init_strides[
%(nvars)
i][
%(nnested)
i] = {
%(strides)
s
%(strides)
s
};"""
%
dict
(
};"""
%
dict
(
...
@@ -355,46 +360,57 @@ def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub):
...
@@ -355,46 +360,57 @@ def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub):
for
i
,
lo
in
enumerate
(
init_loop_orders
)
for
i
,
lo
in
enumerate
(
init_loop_orders
)
if
len
(
lo
)
>
0
))
if
len
(
lo
)
>
0
))
# Declare (sorted) stride and
jumps
for each variable
# Declare (sorted) stride and for each variable
# we iterate from innermost loop to outermost loop
# we iterate from innermost loop to outermost loop
declare_strides
_jumps
+=
"""
declare_strides
+=
"""
std::vector< std::pair<int, int> >::reverse_iterator
%(ovar)
s_loops_rit;
std::vector< std::pair<int, int> >::reverse_iterator
%(ovar)
s_loops_rit;
"""
%
locals
()
"""
%
locals
()
for
i
in
xrange
(
nvars
):
for
i
in
xrange
(
nvars
):
var
=
sub
[
"lv
%
i"
%
i
]
var
=
sub
[
"lv
%
i"
%
i
]
declare_strides
_jumps
+=
"""
declare_strides
+=
"""
%(ovar)
s_loops_rit =
%(ovar)
s_loops.rbegin();"""
%
locals
()
%(ovar)
s_loops_rit =
%(ovar)
s_loops.rbegin();"""
%
locals
()
adjust
=
"0"
for
j
in
reversed
(
range
(
nnested
)):
for
j
in
reversed
(
range
(
nnested
)):
jump
=
"(
%
s) - (
%
s)"
%
(
"
%(var)
s_stride_l
%(j)
i"
%
locals
(),
adjust
)
declare_strides
+=
"""
declare_strides_jumps
+=
"""
int
%(var)
s_stride_l
%(j)
i = init_strides[
%(i)
i][
%(ovar)
s_loops_rit->second];
int
%(var)
s_stride_l
%(j)
i = init_strides[
%(i)
i][
%(ovar)
s_loops_rit->second];
int
%(var)
s_jump_l
%(j)
i =
%(jump)
s;
++
%(ovar)
s_loops_rit;
++
%(ovar)
s_loops_rit;
"""
%
locals
()
"""
%
locals
()
adjust
=
"TOTAL_
%(j)
i *
%(var)
s_stride_l
%(j)
i"
%
locals
()
declare_iter
=
""
declare_iter
=
""
for
i
,
dtype
in
enumerate
(
dtypes
):
for
i
,
dtype
in
enumerate
(
dtypes
):
var
=
sub
[
"lv
%
i"
%
i
]
var
=
sub
[
"lv
%
i"
%
i
]
declare_iter
+=
"
%(var)
s_iter = (
%(dtype)
s*)(PyArray_DATA(
%(var)
s));
\n
"
%
locals
()
declare_iter
+=
"
%(var)
s_iter = (
%(dtype)
s*)(PyArray_DATA(
%(var)
s));
\n
"
%
locals
()
pointer_update
=
''
for
j
,
dtype
in
enumerate
(
dtypes
):
var
=
sub
[
"lv
%
i"
%
j
]
pointer_update
+=
"
%(dtype)
s &
%(var)
s_i = * (
%(var)
s_iter"
%
locals
()
tot_jump
=
''
for
i
in
reversed
(
range
(
nnested
)):
iterv
=
'ITER_
%
i'
%
i
pointer_update
+=
"+
%(var)
s_stride_l
%(i)
i*
%(iterv)
s"
%
locals
()
pointer_update
+=
");
\n
"
loop
=
inner_task
loop
=
inner_task
for
i
in
reversed
(
range
(
nnested
)):
for
i
in
reversed
(
range
(
nnested
)):
iterv
=
'ITER_
%
i'
%
i
iterv
=
'ITER_
%
i'
%
i
total
=
'TOTAL_
%
i'
%
i
total
=
'TOTAL_
%
i'
%
i
update
=
''
update
=
''
for
j
in
xrange
(
nvars
):
forloop
=
''
var
=
sub
[
"lv
%
i"
%
j
]
# The pointers are defined only in the most inner loop
update
+=
"
%(var)
s_iter +=
%(var)
s_jump_l
%(i)
i;
\n
"
%
locals
()
if
i
==
nnested
-
1
:
update
=
pointer_update
if
i
==
0
:
if
openmp
:
openmp_elemwise_minsize
=
theano
.
config
.
openmp_elemwise_minsize
forloop
+=
"""#pragma omp parallel for if(
%(total)
s >=
%(openmp_elemwise_minsize)
s)
\n
"""
%
locals
()
forloop
+=
"for(int
%(iterv)
s = 0;
%(iterv)
s<
%(total)
s;
%(iterv)
s++)"
%
locals
()
loop
=
"""
loop
=
"""
for (int
%(iterv)
s =
%(total)
s;
%(iterv)
s;
%(iterv)
s--)
%(forloop)
s
{ // begin loop
%(i)
i
{ // begin loop
%(i)
i
%(loop)
s
%(update)
s
%(update)
s
%(loop)
s
} // end loop
%(i)
i
} // end loop
%(i)
i
"""
%
locals
()
"""
%
locals
()
...
@@ -402,7 +418,7 @@ def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub):
...
@@ -402,7 +418,7 @@ def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub):
'{'
,
'{'
,
order_loops
,
order_loops
,
declare_totals
,
declare_totals
,
declare_strides
_jumps
,
declare_strides
,
declare_iter
,
declare_iter
,
loop
,
loop
,
'}
\n
'
,
'}
\n
'
,
...
@@ -435,21 +451,77 @@ def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub):
...
@@ -435,21 +451,77 @@ def make_reordered_loop(init_loop_orders, olv_index, dtypes, inner_task, sub):
### DimShuffle ###
### DimShuffle ###
##################
##################
#################
#################
### Broadcast ###
### Broadcast ###
#################
#################
################
################
### CAReduce ###
### CAReduce ###
################
################
def make_loop_careduce(loop_orders, dtypes, loop_tasks, sub):
    """
    Make a nested loop over several arrays and associate specific code
    to each level of nesting.

    @type loop_orders: list of N tuples of length M.
    @param loop_orders: Each value of each
      tuple can be either the index of a dimension to loop over or
      the letter 'x' which means there is no looping to be done
      over that variable at that point (in other words we broadcast
      over that dimension). If an entry is an integer, it will become
      an alias of the entry of that rank.

    @type dtypes: list of N strings.
    @param dtypes: For each array, the C type its data pointer is cast to
      when the corresponding ``<var>_iter`` pointer is initialised.

    @type loop_tasks: list of M+1 pieces of code.
    @param loop_tasks: The ith loop_task is a pair of strings, the first
      string is code to be executed before the ith loop starts, the second
      one contains code to be executed just before going to the next element
      of the ith dimension.
      The last element of loop_tasks is a single string, containing code
      to be executed at the very end.

    @type sub: a dictionary.
    @param sub: Maps 'lv#' to a suitable variable name.
      The 'lvi' variable corresponds to the ith element of loop_orders.

    @rtype: string
    @return: The generated C code, wrapped in a pair of braces.
    """

    def loop_over(preloop, code, indices, i):
        # Wrap ``code`` in the C loop for nesting level ``i``; the string
        # templates below are filled from the local names via locals().
        iterv = 'ITER_%i' % i
        update = ""
        suitable_n = "1"
        for j, index in enumerate(indices):
            var = sub['lv%i' % j]
            update += "%(var)s_iter += %(var)s_jump%(index)s_%(i)s;\n" % locals()
            if index != 'x':
                # The last variable that is not broadcast at this level
                # supplies the loop count.
                suitable_n = "%(var)s_n%(index)s" % locals()
        return """
        %(preloop)s
        for (int %(iterv)s = %(suitable_n)s; %(iterv)s; %(iterv)s--) {
            %(code)s
            %(update)s
        }
        """ % locals()

    # For every variable, emit the pointer initialisation in front of the
    # loop of its first non-broadcast dimension (level 0 if it has none).
    preloops = {}
    for i, (loop_order, dtype) in enumerate(zip(loop_orders, dtypes)):
        for j, index in enumerate(loop_order):
            if index != 'x':
                preloops.setdefault(j, "")
                preloops[j] += ("%%(lv%(i)s)s_iter = (%(dtype)s*)(PyArray_DATA(%%(lv%(i)s)s));\n" % locals()) % sub
                break
        else:  # all broadcastable
            preloops.setdefault(0, "")
            preloops[0] += ("%%(lv%(i)s)s_iter = (%(dtype)s*)(PyArray_DATA(%%(lv%(i)s)s));\n" % locals()) % sub

    if len(loop_tasks) == 1:
        s = preloops.get(0, "")
    else:
        s = ""
        # Materialise the zip before reversing: on Python 3 zip() returns an
        # iterator, which reversed() rejects.
        levels = list(zip(range(len(loop_tasks) - 1),
                          loop_tasks,
                          zip(*loop_orders)))
        # Build from the innermost level outwards.
        for i, (pre_task, task), indices in reversed(levels):
            s = loop_over(preloops.get(i, "") + pre_task, s + task, indices, i)

    s += loop_tasks[-1]
    return "{%s}" % s
theano/tensor/tests/test_elemwise.py
浏览文件 @
2f4e666c
...
@@ -16,7 +16,7 @@ from theano.compile.mode import get_default_mode
...
@@ -16,7 +16,7 @@ from theano.compile.mode import get_default_mode
from
theano.tensor.elemwise
import
(
CAReduce
,
Elemwise
,
DimShuffle
,
from
theano.tensor.elemwise
import
(
CAReduce
,
Elemwise
,
DimShuffle
,
Prod
,
ProdWithoutZeros
)
Prod
,
ProdWithoutZeros
)
from
theano.tests
import
unittest_tools
from
theano.tests
import
unittest_tools
import
math
def
FunctionGraph
(
i
,
o
):
def
FunctionGraph
(
i
,
o
):
e
=
gof
.
FunctionGraph
(
i
,
o
)
e
=
gof
.
FunctionGraph
(
i
,
o
)
...
@@ -145,6 +145,9 @@ class test_Broadcast(unittest.TestCase):
...
@@ -145,6 +145,9 @@ class test_Broadcast(unittest.TestCase):
ctype
=
TensorType
ctype
=
TensorType
cop
=
Elemwise
cop
=
Elemwise
openmp_minsize
=
2
*
config
.
openmp_elemwise_minsize
openmp_minsize_sqrt
=
math
.
ceil
(
math
.
sqrt
(
openmp_minsize
))
def
rand_val
(
self
,
shp
):
def
rand_val
(
self
,
shp
):
return
numpy
.
asarray
(
numpy
.
random
.
rand
(
*
shp
))
return
numpy
.
asarray
(
numpy
.
random
.
rand
(
*
shp
))
...
@@ -160,6 +163,8 @@ class test_Broadcast(unittest.TestCase):
...
@@ -160,6 +163,8 @@ class test_Broadcast(unittest.TestCase):
((
3
,
5
),
(
3
,
1
)),
((
3
,
5
),
(
3
,
1
)),
((
1
,
5
),
(
5
,
1
)),
((
1
,
5
),
(
5
,
1
)),
((
1
,
1
),
(
1
,
1
)),
((
1
,
1
),
(
1
,
1
)),
((
self
.
openmp_minsize
,),
(
self
.
openmp_minsize
,)),
((
self
.
openmp_minsize_sqrt
,
self
.
openmp_minsize_sqrt
),
(
self
.
openmp_minsize_sqrt
,
self
.
openmp_minsize_sqrt
)),
((
2
,
3
,
4
,
5
),
(
2
,
3
,
4
,
5
)),
((
2
,
3
,
4
,
5
),
(
2
,
3
,
4
,
5
)),
((
2
,
3
,
4
,
5
),
(
1
,
3
,
1
,
5
)),
((
2
,
3
,
4
,
5
),
(
1
,
3
,
1
,
5
)),
((
2
,
3
,
4
,
5
),
(
1
,
1
,
1
,
1
)),
((
2
,
3
,
4
,
5
),
(
1
,
1
,
1
,
1
)),
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论