Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
7a6d676f
提交
7a6d676f
authored
9月 03, 2017
作者:
Frédéric Bastien
提交者:
GitHub
9月 03, 2017
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #5688 from cooijmanstim/gpuarray-stack-trace
gpuarray: keep stack trace
上级
47ac5f99
9f8b5561
隐藏空白字符变更
内嵌
并排
正在显示
9 个修改的文件
包含
352 行增加
和
168 行删除
+352
-168
graph.py
theano/gof/graph.py
+54
-0
opt.py
theano/gof/opt.py
+21
-1
basic_ops.py
theano/gpuarray/basic_ops.py
+3
-2
dnn.py
theano/gpuarray/dnn.py
+19
-12
opt.py
theano/gpuarray/opt.py
+94
-68
opt_util.py
theano/gpuarray/opt_util.py
+7
-3
test_opt.py
theano/gpuarray/tests/test_opt.py
+70
-6
blas.py
theano/tensor/blas.py
+77
-73
opt_uncanonicalize.py
theano/tensor/opt_uncanonicalize.py
+7
-3
没有找到文件。
theano/gof/graph.py
浏览文件 @
7a6d676f
...
@@ -4,6 +4,7 @@ Node classes (`Apply`, `Variable`) and expression graph algorithms.
...
@@ -4,6 +4,7 @@ Node classes (`Apply`, `Variable`) and expression graph algorithms.
from
__future__
import
absolute_import
,
print_function
,
division
from
__future__
import
absolute_import
,
print_function
,
division
from
collections
import
deque
from
collections
import
deque
import
contextlib
from
copy
import
copy
from
copy
import
copy
from
itertools
import
count
from
itertools
import
count
...
@@ -390,6 +391,8 @@ class Variable(Node):
...
@@ -390,6 +391,8 @@ class Variable(Node):
self
.
name
=
name
self
.
name
=
name
self
.
auto_name
=
'auto_'
+
str
(
next
(
self
.
__count__
))
self
.
auto_name
=
'auto_'
+
str
(
next
(
self
.
__count__
))
Variable
.
notify_construction_observers
(
self
)
def
__str__
(
self
):
def
__str__
(
self
):
"""Return a str representation of the Variable.
"""Return a str representation of the Variable.
...
@@ -536,6 +539,22 @@ class Variable(Node):
...
@@ -536,6 +539,22 @@ class Variable(Node):
d
[
"tag"
]
=
t
d
[
"tag"
]
=
t
return
d
return
d
# Callables invoked with every newly constructed Variable; see the docstring of nodes_constructed.
construction_observers
=
[]
@classmethod
def append_construction_observer(cls, observer):
    """
    Register `observer` at the end of the class-level
    `construction_observers` list.

    Parameters
    ----------
    observer
        Object to store in the list. `notify_construction_observers`
        calls each stored entry with one argument.

    """
    registry = cls.construction_observers
    registry.append(observer)
@classmethod
def remove_construction_observer(cls, observer):
    """
    Unregister `observer` from the class-level `construction_observers`
    list.

    Only the first matching entry is removed.

    Parameters
    ----------
    observer
        Entry to remove from the list.

    Raises
    ------
    ValueError
        If `observer` is not present in the list.

    """
    registry = cls.construction_observers
    registry.remove(observer)
@classmethod
def notify_construction_observers(cls, instance):
    """
    Call every entry of `construction_observers`, in list order, with
    `instance` as its only argument.

    Parameters
    ----------
    instance
        Object handed to each observer.

    """
    for callback in cls.construction_observers:
        callback(instance)
class
Constant
(
Variable
):
class
Constant
(
Variable
):
"""
"""
...
@@ -1426,3 +1445,38 @@ def is_in_ancestors(l_node, f_node):
...
@@ -1426,3 +1445,38 @@ def is_in_ancestors(l_node, f_node):
todo
.
append
(
cur
)
todo
.
append
(
cur
)
todo
.
extend
(
i
.
owner
for
i
in
cur
.
inputs
if
i
.
owner
)
todo
.
extend
(
i
.
owner
for
i
in
cur
.
inputs
if
i
.
owner
)
return
False
return
False
@contextlib.contextmanager
def nodes_constructed():
    """
    A contextmanager that is used in inherit_stack_trace and keeps track
    of all the newly created variable nodes inside an optimization. A list
    of new_nodes is instantiated but will be filled in a lazy manner (when
    Variable.notify_construction_observers is called).

    `observer` is the entity that updates the new_nodes list.
    construction_observers is a list inside Variable class and contains
    a list of observer functions. The observer functions inside
    construction_observers are only called when a variable node is
    instantiated (where Variable.notify_construction_observers is called).
    When the observer function is called, a new variable node is added to
    the new_nodes list.

    The observer is unregistered when the context exits, whether the body
    finished normally or raised.

    Yields
    ------
    new_nodes
        A list of all the variable nodes that are created inside the
        optimization. It keeps growing while the context is active.

    """
    new_nodes = []

    def observer(node):
        new_nodes.append(node)

    Variable.append_construction_observer(observer)
    try:
        yield new_nodes
    finally:
        # An exception raised in the `with` body is re-raised at the
        # `yield`; without `finally` the observer would stay registered on
        # the class-level list and keep collecting nodes indefinitely.
        Variable.remove_construction_observer(observer)
theano/gof/opt.py
浏览文件 @
7a6d676f
...
@@ -6,6 +6,7 @@ amount of useful generic optimization tools.
...
@@ -6,6 +6,7 @@ amount of useful generic optimization tools.
from
__future__
import
absolute_import
,
print_function
,
division
from
__future__
import
absolute_import
,
print_function
,
division
from
collections
import
deque
,
defaultdict
,
OrderedDict
from
collections
import
deque
,
defaultdict
,
OrderedDict
import
contextlib
import
copy
import
copy
import
inspect
import
inspect
import
logging
import
logging
...
@@ -2902,7 +2903,7 @@ def pre_greedy_local_optimizer(list_optimizations, out):
...
@@ -2902,7 +2903,7 @@ def pre_greedy_local_optimizer(list_optimizations, out):
def
copy_stack_trace
(
from_var
,
to_var
):
def
copy_stack_trace
(
from_var
,
to_var
):
"""
"""
Copies the stack trace from one or more tensor variables to
Copies the stack trace from one or more tensor variables to
one or more tensor variables.
one or more tensor variables
and returns the destination variables
.
Parameters
Parameters
----------
----------
...
@@ -2946,6 +2947,25 @@ def copy_stack_trace(from_var, to_var):
...
@@ -2946,6 +2947,25 @@ def copy_stack_trace(from_var, to_var):
# Copy over stack traces from from_var to each variable to
# Copy over stack traces from from_var to each variable to
# to_var, including the stack_trace of the to_var before
# to_var, including the stack_trace of the to_var before
to_var
.
tag
.
trace
=
getattr
(
to_var
.
tag
,
'trace'
,
[])
+
tr
to_var
.
tag
.
trace
=
getattr
(
to_var
.
tag
,
'trace'
,
[])
+
tr
return
to_var
@contextlib.contextmanager
def inherit_stack_trace(from_var):
    """
    Contextmanager that copies the stack trace from one or more variable
    nodes to all variable nodes constructed in the body.

    The newly created variable nodes are collected in `new_nodes` by
    graph.nodes_constructed() while the body runs.

    Parameters
    ----------
    from_var
        Variable node or a list of variable nodes to copy stack traces from.

    """
    with graph.nodes_constructed() as new_nodes:
        # Nothing is yielded to the caller; the body just runs while
        # new_nodes is being collected.
        yield
    # Reached only when the body completes without raising (this includes
    # leaving it through `return`).
    copy_stack_trace(from_var, new_nodes)
def
check_stack_trace
(
f_or_fgraph
,
ops_to_check
=
'last'
,
bug_print
=
'raise'
):
def
check_stack_trace
(
f_or_fgraph
,
ops_to_check
=
'last'
,
bug_print
=
'raise'
):
...
...
theano/gpuarray/basic_ops.py
浏览文件 @
7a6d676f
...
@@ -15,6 +15,7 @@ from theano.tensor.basic import (
...
@@ -15,6 +15,7 @@ from theano.tensor.basic import (
from
theano.gof
import
HideC
,
COp
,
ParamsType
from
theano.gof
import
HideC
,
COp
,
ParamsType
from
theano.gof.utils
import
MethodNotDefined
from
theano.gof.utils
import
MethodNotDefined
from
theano.gof.opt
import
copy_stack_trace
from
collections
import
deque
from
collections
import
deque
...
@@ -75,11 +76,11 @@ def as_gpuarray_variable(x, context_name):
...
@@ -75,11 +76,11 @@ def as_gpuarray_variable(x, context_name):
# If we couldn't deal with transfers, then maybe it's a tensor
# If we couldn't deal with transfers, then maybe it's a tensor
if
isinstance
(
x
.
type
,
tensor
.
TensorType
):
if
isinstance
(
x
.
type
,
tensor
.
TensorType
):
return
GpuFromHost
(
context_name
)(
x
)
return
copy_stack_trace
(
x
,
GpuFromHost
(
context_name
)(
x
)
)
# Try _as_GpuArrayVariable if possible
# Try _as_GpuArrayVariable if possible
if
hasattr
(
x
,
'_as_GpuArrayVariable'
):
if
hasattr
(
x
,
'_as_GpuArrayVariable'
):
return
x
.
_as_GpuArrayVariable
(
context_name
)
return
copy_stack_trace
(
x
,
x
.
_as_GpuArrayVariable
(
context_name
)
)
# If it didn't work try for a constant
# If it didn't work try for a constant
ctx
=
get_context
(
context_name
)
ctx
=
get_context
(
context_name
)
...
...
theano/gpuarray/dnn.py
浏览文件 @
7a6d676f
...
@@ -18,6 +18,7 @@ from theano.gradient import DisconnectedType, grad_not_implemented
...
@@ -18,6 +18,7 @@ from theano.gradient import DisconnectedType, grad_not_implemented
from
theano.gof
import
Optimizer
,
local_optimizer
,
COp
,
ParamsType
,
EnumList
from
theano.gof
import
Optimizer
,
local_optimizer
,
COp
,
ParamsType
,
EnumList
from
theano.gof.cmodule
import
GCC_compiler
from
theano.gof.cmodule
import
GCC_compiler
from
theano.gof.type
import
CDataType
,
Generic
from
theano.gof.type
import
CDataType
,
Generic
from
theano.gof.opt
import
inherit_stack_trace
from
theano.compile
import
optdb
from
theano.compile
import
optdb
from
theano.compile.ops
import
shape_i
,
shape_i_op
from
theano.compile.ops
import
shape_i
,
shape_i_op
from
theano.tensor.nnet
import
LogSoftmax
,
SoftmaxGrad
from
theano.tensor.nnet
import
LogSoftmax
,
SoftmaxGrad
...
@@ -3127,9 +3128,11 @@ def local_abstractconv_cudnn(node):
...
@@ -3127,9 +3128,11 @@ def local_abstractconv_cudnn(node):
if
node
.
op
.
unshared
:
if
node
.
op
.
unshared
:
return
None
return
None
if
isinstance
(
node
.
op
,
AbstractConv2d
):
if
isinstance
(
node
.
op
,
AbstractConv2d
):
return
local_abstractconv_cudnn_graph
(
node
.
op
,
ctx
,
node
.
inputs
,
node
.
outputs
)
with
inherit_stack_trace
(
node
.
outputs
):
return
local_abstractconv_cudnn_graph
(
node
.
op
,
ctx
,
node
.
inputs
,
node
.
outputs
)
elif
isinstance
(
node
.
op
,
AbstractConv3d
):
elif
isinstance
(
node
.
op
,
AbstractConv3d
):
return
local_abstractconv3d_cudnn_graph
(
node
.
op
,
ctx
,
node
.
inputs
,
node
.
outputs
)
with
inherit_stack_trace
(
node
.
outputs
):
return
local_abstractconv3d_cudnn_graph
(
node
.
op
,
ctx
,
node
.
inputs
,
node
.
outputs
)
@local_optimizer
([
AbstractConv2d
,
AbstractConv2d_gradWeights
,
AbstractConv2d_gradInputs
])
@local_optimizer
([
AbstractConv2d
,
AbstractConv2d_gradWeights
,
AbstractConv2d_gradInputs
])
...
@@ -3352,9 +3355,11 @@ def local_abstractconv_gw_cudnn(node):
...
@@ -3352,9 +3355,11 @@ def local_abstractconv_gw_cudnn(node):
if
node
.
op
.
unshared
:
if
node
.
op
.
unshared
:
return
None
return
None
if
isinstance
(
node
.
op
,
AbstractConv2d_gradWeights
):
if
isinstance
(
node
.
op
,
AbstractConv2d_gradWeights
):
return
local_abstractconv_cudnn_graph
(
node
.
op
,
ctx
,
node
.
inputs
,
node
.
outputs
)
with
inherit_stack_trace
(
node
.
outputs
):
return
local_abstractconv_cudnn_graph
(
node
.
op
,
ctx
,
node
.
inputs
,
node
.
outputs
)
elif
isinstance
(
node
.
op
,
AbstractConv3d_gradWeights
):
elif
isinstance
(
node
.
op
,
AbstractConv3d_gradWeights
):
return
local_abstractconv3d_cudnn_graph
(
node
.
op
,
ctx
,
node
.
inputs
,
node
.
outputs
)
with
inherit_stack_trace
(
node
.
outputs
):
return
local_abstractconv3d_cudnn_graph
(
node
.
op
,
ctx
,
node
.
inputs
,
node
.
outputs
)
@local_optimizer
([
AbstractConv2d_gradInputs
,
AbstractConv3d_gradInputs
])
@local_optimizer
([
AbstractConv2d_gradInputs
,
AbstractConv3d_gradInputs
])
...
@@ -3365,9 +3370,11 @@ def local_abstractconv_gi_cudnn(node):
...
@@ -3365,9 +3370,11 @@ def local_abstractconv_gi_cudnn(node):
if
node
.
op
.
unshared
:
if
node
.
op
.
unshared
:
return
None
return
None
if
isinstance
(
node
.
op
,
AbstractConv2d_gradInputs
):
if
isinstance
(
node
.
op
,
AbstractConv2d_gradInputs
):
return
local_abstractconv_cudnn_graph
(
node
.
op
,
ctx
,
node
.
inputs
,
node
.
outputs
)
with
inherit_stack_trace
(
node
.
outputs
):
return
local_abstractconv_cudnn_graph
(
node
.
op
,
ctx
,
node
.
inputs
,
node
.
outputs
)
elif
isinstance
(
node
.
op
,
AbstractConv3d_gradInputs
):
elif
isinstance
(
node
.
op
,
AbstractConv3d_gradInputs
):
return
local_abstractconv3d_cudnn_graph
(
node
.
op
,
ctx
,
node
.
inputs
,
node
.
outputs
)
with
inherit_stack_trace
(
node
.
outputs
):
return
local_abstractconv3d_cudnn_graph
(
node
.
op
,
ctx
,
node
.
inputs
,
node
.
outputs
)
@inplace_allocempty
(
GpuDnnConv
,
2
)
@inplace_allocempty
(
GpuDnnConv
,
2
)
...
@@ -3384,7 +3391,6 @@ def local_dnn_convgw_inplace(node, inputs):
...
@@ -3384,7 +3391,6 @@ def local_dnn_convgw_inplace(node, inputs):
def
local_dnn_convgi_inplace
(
node
,
inputs
):
def
local_dnn_convgi_inplace
(
node
,
inputs
):
return
[
GpuDnnConvGradI
(
algo
=
node
.
op
.
algo
,
inplace
=
True
,
num_groups
=
node
.
op
.
num_groups
)(
*
inputs
)]
return
[
GpuDnnConvGradI
(
algo
=
node
.
op
.
algo
,
inplace
=
True
,
num_groups
=
node
.
op
.
num_groups
)(
*
inputs
)]
optdb
.
register
(
'local_dnna_conv_inplace'
,
optdb
.
register
(
'local_dnna_conv_inplace'
,
tensor
.
opt
.
in2out
(
local_dnn_conv_inplace
,
tensor
.
opt
.
in2out
(
local_dnn_conv_inplace
,
local_dnn_convgw_inplace
,
local_dnn_convgw_inplace
,
...
@@ -3654,11 +3660,12 @@ def local_dnn_reduction(node):
...
@@ -3654,11 +3660,12 @@ def local_dnn_reduction(node):
if
not
cudnn
.
cudnnReduceTensorOp_t
.
has_alias
(
node
.
op
.
scalar_op
.
name
):
if
not
cudnn
.
cudnnReduceTensorOp_t
.
has_alias
(
node
.
op
.
scalar_op
.
name
):
return
return
return
(
GpuDnnReduction
(
node
.
op
.
scalar_op
.
name
,
with
inherit_stack_trace
(
node
.
outputs
):
node
.
op
.
axis
,
return
(
GpuDnnReduction
(
node
.
op
.
scalar_op
.
name
,
node
.
op
.
acc_dtype
,
node
.
op
.
axis
,
node
.
op
.
dtype
,
node
.
op
.
acc_dtype
,
False
)(
node
.
inputs
[
0
]),)
node
.
op
.
dtype
,
False
)(
node
.
inputs
[
0
]),)
@register_opt
(
'cudnn'
)
@register_opt
(
'cudnn'
)
...
...
theano/gpuarray/opt.py
浏览文件 @
7a6d676f
...
@@ -15,7 +15,8 @@ from theano.compile.ops import shape_i
...
@@ -15,7 +15,8 @@ from theano.compile.ops import shape_i
from
theano.gof
import
(
local_optimizer
,
EquilibriumDB
,
TopoOptimizer
,
from
theano.gof
import
(
local_optimizer
,
EquilibriumDB
,
TopoOptimizer
,
LocalGroupDB
,
LocalGroupDB
,
SequenceDB
,
Optimizer
,
DB
,
toolbox
,
graph
)
SequenceDB
,
Optimizer
,
DB
,
toolbox
,
graph
)
from
theano.gof.opt
import
LocalMetaOptimizer
from
theano.gof.opt
import
(
LocalMetaOptimizer
,
copy_stack_trace
,
inherit_stack_trace
)
from
theano.ifelse
import
IfElse
from
theano.ifelse
import
IfElse
from
theano.misc.ordered_set
import
OrderedSet
from
theano.misc.ordered_set
import
OrderedSet
...
@@ -252,12 +253,25 @@ def op_lifter(OP, cuda_only=False):
...
@@ -252,12 +253,25 @@ def op_lifter(OP, cuda_only=False):
# This is needed as sometimes new_op inherits from OP.
# This is needed as sometimes new_op inherits from OP.
if
new_op
and
new_op
!=
node
.
op
:
if
new_op
and
new_op
!=
node
.
op
:
if
isinstance
(
new_op
,
theano
.
Op
):
if
isinstance
(
new_op
,
theano
.
Op
):
return
[
safe_to_cpu
(
o
)
for
o
in
new_outputs
=
new_op
(
*
node
.
inputs
,
return_list
=
True
)
new_op
(
*
node
.
inputs
,
return_list
=
True
)]
to_cpu_fn
=
safe_to_cpu
elif
isinstance
(
new_op
,
(
tuple
,
list
)):
elif
isinstance
(
new_op
,
(
tuple
,
list
)):
return
[
safe_to_cpu
(
o
)
for
o
in
new_op
]
new_outputs
=
new_op
to_cpu_fn
=
safe_to_cpu
else
:
# suppose it is a variable on the GPU
else
:
# suppose it is a variable on the GPU
return
[
new_op
.
transfer
(
'cpu'
)]
new_outputs
=
[
new_op
]
def
to_cpu_fn
(
x
):
return
x
.
transfer
(
'cpu'
)
# copy stack traces onto gpu outputs
# also copy the stack traces onto HostFromGpu outputs
on_cpu
=
[]
for
old_output
,
new_output
in
zip
(
node
.
outputs
,
new_outputs
):
copy_stack_trace
(
old_output
,
new_output
)
cpu
=
to_cpu_fn
(
new_output
)
on_cpu
.
append
(
cpu
)
copy_stack_trace
(
old_output
,
cpu
)
return
on_cpu
return
False
return
False
local_opt
.
__name__
=
maker
.
__name__
local_opt
.
__name__
=
maker
.
__name__
return
local_optimizer
(
OP
)(
local_opt
)
return
local_optimizer
(
OP
)(
local_opt
)
...
@@ -419,6 +433,9 @@ class GraphToGPU(Optimizer):
...
@@ -419,6 +433,9 @@ class GraphToGPU(Optimizer):
elif
isinstance
(
new_ops
,
theano
.
Variable
):
elif
isinstance
(
new_ops
,
theano
.
Variable
):
outputs
=
[
new_ops
]
outputs
=
[
new_ops
]
for
old_output
,
new_output
in
zip
(
node
.
outputs
,
outputs
):
copy_stack_trace
(
old_output
,
new_output
)
if
new_ops
:
if
new_ops
:
node_created
[
lopt
]
+=
len
(
graph
.
ops
([
mapping
[
i
]
for
i
in
node
.
inputs
],
outputs
))
node_created
[
lopt
]
+=
len
(
graph
.
ops
([
mapping
[
i
]
for
i
in
node
.
inputs
],
outputs
))
if
any
([
getattr
(
old_o
,
'dtype'
,
None
)
!=
getattr
(
new_o
,
'dtype'
,
None
)
if
any
([
getattr
(
old_o
,
'dtype'
,
None
)
!=
getattr
(
new_o
,
'dtype'
,
None
)
...
@@ -451,7 +468,7 @@ class GraphToGPU(Optimizer):
...
@@ -451,7 +468,7 @@ class GraphToGPU(Optimizer):
new_o
.
owner
.
inputs
[
0
]
.
type
==
o
.
type
):
new_o
.
owner
.
inputs
[
0
]
.
type
==
o
.
type
):
new_o
=
new_o
.
owner
.
inputs
[
0
]
new_o
=
new_o
.
owner
.
inputs
[
0
]
else
:
else
:
new_o
=
safe_to_cpu
(
new_o
)
new_o
=
copy_stack_trace
(
o
,
safe_to_cpu
(
new_o
)
)
new_nodes
.
append
(
new_o
)
new_nodes
.
append
(
new_o
)
fgraph
.
replace_all_validate
(
zip
(
fgraph
.
outputs
,
new_nodes
),
fgraph
.
replace_all_validate
(
zip
(
fgraph
.
outputs
,
new_nodes
),
reason
=
self
.
__class__
.
__name__
)
reason
=
self
.
__class__
.
__name__
)
...
@@ -650,7 +667,8 @@ def local_gpualloc_memset_0(node):
...
@@ -650,7 +667,8 @@ def local_gpualloc_memset_0(node):
inp
.
data
.
size
==
1
and
inp
.
data
.
size
==
1
and
(
np
.
asarray
(
inp
.
data
)
==
0
)
.
all
()):
(
np
.
asarray
(
inp
.
data
)
==
0
)
.
all
()):
new_op
=
GpuAlloc
(
node
.
op
.
context_name
,
memset_0
=
True
)
new_op
=
GpuAlloc
(
node
.
op
.
context_name
,
memset_0
=
True
)
return
[
new_op
(
*
node
.
inputs
)]
with
inherit_stack_trace
(
node
.
outputs
):
return
new_op
(
*
node
.
inputs
,
return_list
=
True
)
# Don't register by default.
# Don't register by default.
...
@@ -659,10 +677,9 @@ def local_gpua_alloc_empty_to_zeros(node):
...
@@ -659,10 +677,9 @@ def local_gpua_alloc_empty_to_zeros(node):
if
isinstance
(
node
.
op
,
GpuAllocEmpty
):
if
isinstance
(
node
.
op
,
GpuAllocEmpty
):
context_name
=
infer_context_name
(
*
node
.
inputs
)
context_name
=
infer_context_name
(
*
node
.
inputs
)
z
=
np
.
asarray
(
0
,
dtype
=
node
.
outputs
[
0
]
.
dtype
)
z
=
np
.
asarray
(
0
,
dtype
=
node
.
outputs
[
0
]
.
dtype
)
return
[
GpuAlloc
(
context_name
)(
as_gpuarray_variable
(
z
,
context_name
),
with
inherit_stack_trace
(
node
.
outputs
):
*
node
.
inputs
)]
return
[
GpuAlloc
(
context_name
)(
as_gpuarray_variable
(
z
,
context_name
),
*
node
.
inputs
)]
optdb
.
register
(
'local_gpua_alloc_empty_to_zeros'
,
optdb
.
register
(
'local_gpua_alloc_empty_to_zeros'
,
theano
.
tensor
.
opt
.
in2out
(
local_gpua_alloc_empty_to_zeros
),
theano
.
tensor
.
opt
.
in2out
(
local_gpua_alloc_empty_to_zeros
),
# After move to gpu and merge2, before inplace.
# After move to gpu and merge2, before inplace.
...
@@ -1206,7 +1223,8 @@ def local_gpua_careduce(op, context_name, inputs, outputs):
...
@@ -1206,7 +1223,8 @@ def local_gpua_careduce(op, context_name, inputs, outputs):
op
.
scalar_op
,
axis
=
op
.
axis
,
op
.
scalar_op
,
axis
=
op
.
axis
,
dtype
=
odtype
,
dtype
=
odtype
,
acc_dtype
=
adtype
)
acc_dtype
=
adtype
)
gvar
=
greduce
(
x
)
with
inherit_stack_trace
(
outputs
):
gvar
=
greduce
(
x
)
# We need to have the make node called, otherwise the mask can
# We need to have the make node called, otherwise the mask can
# be None
# be None
if
(
op2
is
GpuCAReduceCPY
or
if
(
op2
is
GpuCAReduceCPY
or
...
@@ -1246,25 +1264,27 @@ def local_gpua_careduce(op, context_name, inputs, outputs):
...
@@ -1246,25 +1264,27 @@ def local_gpua_careduce(op, context_name, inputs, outputs):
dtype
=
getattr
(
op
,
'dtype'
,
outputs
[
0
]
.
dtype
),
dtype
=
getattr
(
op
,
'dtype'
,
outputs
[
0
]
.
dtype
),
acc_dtype
=
getattr
(
op
,
'acc_dtype'
,
None
))
acc_dtype
=
getattr
(
op
,
'acc_dtype'
,
None
))
reshaped_x
=
x
.
reshape
(
tensor
.
stack
(
new_in_shp
))
with
inherit_stack_trace
(
outputs
):
gpu_reshaped_x
=
as_gpuarray_variable
(
reshaped_x
,
context_name
)
reshaped_x
=
x
.
reshape
(
tensor
.
stack
(
new_in_shp
))
gvar
=
greduce
(
gpu_reshaped_x
)
gpu_reshaped_x
=
as_gpuarray_variable
(
reshaped_x
,
context_name
)
# We need to have the make node called, otherwise the mask can
# We need to have the make node called, otherwise the mask can
# be None
# be None
reshaped_gpu_inputs
=
[
gpu_reshaped_x
]
gvar
=
greduce
(
gpu_reshaped_x
)
if
greduce
.
supports_c_code
(
reshaped_gpu_inputs
):
reshaped_gpu_inputs
=
[
gpu_reshaped_x
]
reduce_reshaped_x
=
greduce
(
gpu_reshaped_x
)
if
greduce
.
supports_c_code
(
reshaped_gpu_inputs
):
reduce_reshaped_x
=
greduce
(
gpu_reshaped_x
)
if
reduce_reshaped_x
.
ndim
!=
outputs
[
0
]
.
ndim
:
out_shp
=
[]
if
reduce_reshaped_x
.
ndim
!=
outputs
[
0
]
.
ndim
:
for
i
in
range
(
x
.
ndim
):
out_shp
=
[]
if
i
not
in
op
.
axis
:
for
i
in
range
(
x
.
ndim
):
out_shp
.
append
(
shape_i
(
x
,
i
))
if
i
not
in
op
.
axis
:
unreshaped_reduce
=
GpuReshape
(
len
(
out_shp
))(
reduce_reshaped_x
,
out_shp
.
append
(
shape_i
(
x
,
i
))
tensor
.
stack
(
out_shp
))
unreshaped_reduce
=
GpuReshape
(
len
(
out_shp
))(
else
:
reduce_reshaped_x
,
unreshaped_reduce
=
reduce_reshaped_x
tensor
.
stack
(
out_shp
))
return
[
unreshaped_reduce
]
else
:
unreshaped_reduce
=
reduce_reshaped_x
return
[
unreshaped_reduce
]
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
...
@@ -1305,33 +1325,34 @@ def local_gpua_gemm(op, context_name, inputs, outputs):
...
@@ -1305,33 +1325,34 @@ def local_gpua_gemm(op, context_name, inputs, outputs):
def
local_gpua_gemmbatch
(
op
,
context_name
,
inputs
,
outputs
):
def
local_gpua_gemmbatch
(
op
,
context_name
,
inputs
,
outputs
):
if
inputs
[
0
]
.
dtype
not
in
[
'float16'
,
'float32'
,
'float64'
]:
if
inputs
[
0
]
.
dtype
not
in
[
'float16'
,
'float32'
,
'float64'
]:
return
return
a
,
b
=
inputs
with
inherit_stack_trace
(
outputs
):
# Since GpuGemmBatch only supports 3D inputs and output,
a
,
b
=
inputs
# we need to add broadcastable dims to the inputs, and drop
# Since GpuGemmBatch only supports 3D inputs and output,
# them from outputs
# we need to add broadcastable dims to the inputs, and drop
output_dims
=
[
0
,
1
,
2
]
# them from outputs
if
a
.
ndim
==
2
:
output_dims
=
[
0
,
1
,
2
]
a
=
GpuDimShuffle
(
a
.
broadcastable
,
(
0
,
'x'
,
1
))(
a
)
if
a
.
ndim
==
2
:
del
output_dims
[
1
]
a
=
GpuDimShuffle
(
a
.
broadcastable
,
(
0
,
'x'
,
1
))(
a
)
if
b
.
ndim
==
2
:
del
output_dims
[
1
]
b
=
GpuDimShuffle
(
b
.
broadcastable
,
(
0
,
1
,
'x'
))(
b
)
if
b
.
ndim
==
2
:
del
output_dims
[
-
1
]
b
=
GpuDimShuffle
(
b
.
broadcastable
,
(
0
,
1
,
'x'
))(
b
)
# In case of mismatched dtypes, we also have to upcast
del
output_dims
[
-
1
]
out_dtype
=
outputs
[
0
]
.
dtype
# In case of mismatched dtypes, we also have to upcast
if
a
.
dtype
!=
out_dtype
or
b
.
dtype
!=
out_dtype
:
out_dtype
=
outputs
[
0
]
.
dtype
gpu_cast_op
=
GpuElemwise
(
Cast
(
Scalar
(
out_dtype
)))
if
a
.
dtype
!=
out_dtype
or
b
.
dtype
!=
out_dtype
:
if
a
.
dtype
!=
out_dtype
:
gpu_cast_op
=
GpuElemwise
(
Cast
(
Scalar
(
out_dtype
)))
a
=
gpu_cast_op
(
a
)
if
a
.
dtype
!=
out_dtype
:
if
b
.
dtype
!=
out_dtype
:
a
=
gpu_cast_op
(
a
)
b
=
gpu_cast_op
(
b
)
if
b
.
dtype
!=
out_dtype
:
b
=
gpu_cast_op
(
b
)
c
=
GpuAllocEmpty
(
out_dtype
,
context_name
)(
a
.
shape
[
0
],
a
.
shape
[
1
],
b
.
shape
[
2
])
c
=
GpuAllocEmpty
(
out_dtype
,
context_name
)(
out
=
gpugemmbatch_no_inplace
(
c
,
np
.
asarray
(
1.0
,
dtype
=
out_dtype
),
a
.
shape
[
0
],
a
.
shape
[
1
],
b
.
shape
[
2
])
a
,
b
,
np
.
asarray
(
0.0
,
dtype
=
out_dtype
))
out
=
gpugemmbatch_no_inplace
(
c
,
np
.
asarray
(
1.0
,
dtype
=
out_dtype
),
if
len
(
output_dims
)
!=
3
:
a
,
b
,
np
.
asarray
(
0.0
,
dtype
=
out_dtype
))
out
=
GpuDimShuffle
(
out
.
broadcastable
,
output_dims
)(
out
)
if
len
(
output_dims
)
!=
3
:
return
out
out
=
GpuDimShuffle
(
out
.
broadcastable
,
output_dims
)(
out
)
return
out
@register_opt
()
@register_opt
()
...
@@ -1378,11 +1399,12 @@ def local_gpua_dot22(op, context_name, inputs, outputs):
...
@@ -1378,11 +1399,12 @@ def local_gpua_dot22(op, context_name, inputs, outputs):
@op_lifter
([
tensor
.
blas
.
Dot22Scalar
])
@op_lifter
([
tensor
.
blas
.
Dot22Scalar
])
@register_opt2
([
tensor
.
blas
.
Dot22Scalar
],
'fast_compile'
)
@register_opt2
([
tensor
.
blas
.
Dot22Scalar
],
'fast_compile'
)
def
local_gpua_dot22scalar
(
op
,
context_name
,
inputs
,
outputs
):
def
local_gpua_dot22scalar
(
op
,
context_name
,
inputs
,
outputs
):
x
,
y
,
a
=
inputs
with
inherit_stack_trace
(
outputs
):
x
=
as_gpuarray_variable
(
x
,
context_name
)
x
,
y
,
a
=
inputs
y
=
as_gpuarray_variable
(
y
,
context_name
)
x
=
as_gpuarray_variable
(
x
,
context_name
)
z
=
GpuAllocEmpty
(
x
.
dtype
,
context_name
)(
x
.
shape
[
0
],
y
.
shape
[
1
])
y
=
as_gpuarray_variable
(
y
,
context_name
)
return
[
gpugemm_no_inplace
(
z
,
a
,
x
,
y
,
0
)]
z
=
GpuAllocEmpty
(
x
.
dtype
,
context_name
)(
x
.
shape
[
0
],
y
.
shape
[
1
])
return
[
gpugemm_no_inplace
(
z
,
a
,
x
,
y
,
0
)]
@register_opt
(
'fast_compile'
)
@register_opt
(
'fast_compile'
)
...
@@ -2392,7 +2414,8 @@ def local_gpu_elemwise_careduce(node):
...
@@ -2392,7 +2414,8 @@ def local_gpu_elemwise_careduce(node):
props
=
node
.
op
.
_props_dict
()
props
=
node
.
op
.
_props_dict
()
props
[
"pre_scalar_op"
]
=
scalar
.
basic
.
sqr
props
[
"pre_scalar_op"
]
=
scalar
.
basic
.
sqr
out
=
GpuCAReduceCuda
(
**
props
)(
inp
)
out
=
GpuCAReduceCuda
(
**
props
)(
inp
)
return
[
out
]
with
inherit_stack_trace
(
node
.
outputs
):
return
[
out
]
@local_optimizer
(
None
)
@local_optimizer
(
None
)
...
@@ -2583,8 +2606,9 @@ def local_gpu_solve(op, context_name, inputs, outputs):
...
@@ -2583,8 +2606,9 @@ def local_gpu_solve(op, context_name, inputs, outputs):
@local_optimizer
([
GpuCusolverSolve
],
inplace
=
True
)
@local_optimizer
([
GpuCusolverSolve
],
inplace
=
True
)
def
local_inplace_gpu_solve
(
node
):
def
local_inplace_gpu_solve
(
node
):
if
isinstance
(
node
.
op
,
GpuCusolverSolve
)
and
not
node
.
op
.
inplace
:
if
isinstance
(
node
.
op
,
GpuCusolverSolve
)
and
not
node
.
op
.
inplace
:
return
[
GpuCusolverSolve
(
A_structure
=
node
.
op
.
A_structure
,
trans
=
node
.
op
.
trans
,
with
inherit_stack_trace
(
node
.
outputs
):
inplace
=
True
)(
*
node
.
inputs
)]
return
[
GpuCusolverSolve
(
A_structure
=
node
.
op
.
A_structure
,
trans
=
node
.
op
.
trans
,
inplace
=
True
)(
*
node
.
inputs
)]
# Cholesky decomposition
# Cholesky decomposition
...
@@ -2622,7 +2646,8 @@ register_opt2([slinalg.Solve], 'fast_compile', name='matrix_ops_db2')(matrix_ops
...
@@ -2622,7 +2646,8 @@ register_opt2([slinalg.Solve], 'fast_compile', name='matrix_ops_db2')(matrix_ops
@local_optimizer
([
GpuCholesky
],
inplace
=
True
)
@local_optimizer
([
GpuCholesky
],
inplace
=
True
)
def
local_inplace_gpu_cholesky
(
node
):
def
local_inplace_gpu_cholesky
(
node
):
if
isinstance
(
node
.
op
,
GpuCholesky
)
and
not
node
.
op
.
inplace
:
if
isinstance
(
node
.
op
,
GpuCholesky
)
and
not
node
.
op
.
inplace
:
return
[
node
.
op
.
clone_inplace
()(
*
node
.
inputs
)]
with
inherit_stack_trace
(
node
.
outputs
):
return
[
node
.
op
.
clone_inplace
()(
*
node
.
inputs
)]
def
local_gpu_magma_cholesky
(
op
,
context_name
,
inputs
,
outputs
):
def
local_gpu_magma_cholesky
(
op
,
context_name
,
inputs
,
outputs
):
...
@@ -2705,7 +2730,8 @@ def local_gpu_magma_matrix_inverse(op, context_name, inputs, outputs):
...
@@ -2705,7 +2730,8 @@ def local_gpu_magma_matrix_inverse(op, context_name, inputs, outputs):
@local_optimizer
([
GpuMagmaMatrixInverse
])
@local_optimizer
([
GpuMagmaMatrixInverse
])
def
local_inplace_gpu_magma_matrix_inverse
(
node
):
def
local_inplace_gpu_magma_matrix_inverse
(
node
):
if
isinstance
(
node
.
op
,
GpuMagmaMatrixInverse
)
and
not
node
.
op
.
inplace
:
if
isinstance
(
node
.
op
,
GpuMagmaMatrixInverse
)
and
not
node
.
op
.
inplace
:
return
[
node
.
op
.
clone_inplace
()(
*
node
.
inputs
)]
with
inherit_stack_trace
(
node
.
outputs
):
return
[
node
.
op
.
clone_inplace
()(
*
node
.
inputs
)]
# Eigen decomposition of a symmetric matrix
# Eigen decomposition of a symmetric matrix
...
...
theano/gpuarray/opt_util.py
浏览文件 @
7a6d676f
...
@@ -5,6 +5,7 @@ import numpy as np
...
@@ -5,6 +5,7 @@ import numpy as np
from
theano
import
tensor
,
scalar
as
scal
,
Constant
from
theano
import
tensor
,
scalar
as
scal
,
Constant
from
theano.gof
import
local_optimizer
from
theano.gof
import
local_optimizer
from
theano.gof.opt
import
inherit_stack_trace
from
theano.tensor
import
(
DimShuffle
,
get_scalar_constant_value
,
from
theano.tensor
import
(
DimShuffle
,
get_scalar_constant_value
,
NotScalarConstantError
)
NotScalarConstantError
)
...
@@ -184,7 +185,8 @@ def alpha_merge(cls, alpha_in, beta_in):
...
@@ -184,7 +185,8 @@ def alpha_merge(cls, alpha_in, beta_in):
except
NotScalarConstantError
:
except
NotScalarConstantError
:
inputs
[
alpha_in
]
=
lr
*
targ
.
inputs
[
alpha_in
]
inputs
[
alpha_in
]
=
lr
*
targ
.
inputs
[
alpha_in
]
inputs
[
beta_in
]
=
lr
*
targ
.
inputs
[
beta_in
]
inputs
[
beta_in
]
=
lr
*
targ
.
inputs
[
beta_in
]
return
maker
(
targ
,
*
inputs
)
with
inherit_stack_trace
(
node
.
outputs
):
return
maker
(
targ
,
*
inputs
)
return
opt
return
opt
return
wrapper
return
wrapper
...
@@ -272,7 +274,8 @@ def output_merge(cls, alpha_in, beta_in, out_in):
...
@@ -272,7 +274,8 @@ def output_merge(cls, alpha_in, beta_in, out_in):
inputs
=
list
(
targ
.
inputs
)
inputs
=
list
(
targ
.
inputs
)
inputs
[
out_in
]
=
W
inputs
[
out_in
]
=
W
inputs
[
beta_in
]
=
_one
.
clone
()
inputs
[
beta_in
]
=
_one
.
clone
()
return
maker
(
targ
,
*
inputs
)
with
inherit_stack_trace
(
node
.
outputs
):
return
maker
(
targ
,
*
inputs
)
return
opt
return
opt
return
wrapper
return
wrapper
...
@@ -326,7 +329,8 @@ def inplace_allocempty(op, idx):
...
@@ -326,7 +329,8 @@ def inplace_allocempty(op, idx):
len
(
alloc
.
clients
)
>
1
):
len
(
alloc
.
clients
)
>
1
):
alloc_op
=
GpuAllocEmpty
(
alloc
.
owner
.
op
.
dtype
,
alloc
.
owner
.
op
.
context_name
)
alloc_op
=
GpuAllocEmpty
(
alloc
.
owner
.
op
.
dtype
,
alloc
.
owner
.
op
.
context_name
)
inputs
[
idx
]
=
alloc_op
(
*
alloc
.
owner
.
inputs
)
inputs
[
idx
]
=
alloc_op
(
*
alloc
.
owner
.
inputs
)
return
maker
(
node
,
inputs
)
with
inherit_stack_trace
(
node
.
outputs
):
return
maker
(
node
,
inputs
)
return
opt
return
opt
return
wrapper
return
wrapper
...
...
theano/gpuarray/tests/test_opt.py
浏览文件 @
7a6d676f
...
@@ -8,12 +8,13 @@ import theano.tensor.slinalg as slinalg
...
@@ -8,12 +8,13 @@ import theano.tensor.slinalg as slinalg
from
theano.tests.breakpoint
import
PdbBreakpoint
from
theano.tests.breakpoint
import
PdbBreakpoint
from
theano.tests
import
unittest_tools
as
utt
,
test_ifelse
from
theano.tests
import
unittest_tools
as
utt
,
test_ifelse
from
theano.tensor.tests
import
test_basic
from
theano.tensor.tests
import
test_basic
from
theano.gof.opt
import
check_stack_trace
import
theano.gpuarray
import
theano.gpuarray
from
..
import
basic_ops
from
..
import
basic_ops
from
..type
import
GpuArrayType
,
gpuarray_shared_constructor
,
get_context
from
..type
import
GpuArrayType
,
gpuarray_shared_constructor
,
get_context
from
..basic_ops
import
(
from
..basic_ops
import
(
GpuAlloc
,
GpuAllocEmpty
,
GpuReshape
,
GpuFromHost
,
host_from_gpu
)
GpuAlloc
,
GpuAllocEmpty
,
GpuReshape
,
GpuFromHost
,
HostFromGpu
,
host_from_gpu
)
from
..blas
import
GpuGemm
from
..blas
import
GpuGemm
from
..elemwise
import
(
from
..elemwise
import
(
GpuCAReduceCuda
,
GpuCAReduceCPY
,
GpuElemwise
,
Elemwise
,
max_inputs_to_GpuElemwise
)
GpuCAReduceCuda
,
GpuCAReduceCPY
,
GpuElemwise
,
Elemwise
,
max_inputs_to_GpuElemwise
)
...
@@ -27,6 +28,28 @@ from theano.tensor.nnet import abstract_conv
...
@@ -27,6 +28,28 @@ from theano.tensor.nnet import abstract_conv
from
theano.gpuarray
import
dnn
,
blas
from
theano.gpuarray
import
dnn
,
blas
def _check_stack_trace(thing):
    """
    Run `check_stack_trace` on `thing`, skipping a fixed set of op classes.

    Parameters
    ----------
    thing
        Passed through as the first argument of `check_stack_trace`
        (the tests in this file pass compiled functions).

    Returns
    -------
    The result of `check_stack_trace`, called with `bug_print="ignore"`.

    """
    # Ops of these classes are excluded from the check.
    exempt_op_types = (
        theano.compile.ops.Shape_i,
        theano.compile.ops.Shape,
        theano.compile.ops.DeepCopyOp,
        theano.tensor.opt.MakeVector,
        theano.tensor.subtensor.Subtensor,
        theano.tensor.elemwise.Elemwise,
        theano.ifelse.IfElse,
        GpuFromHost,
        HostFromGpu,
        GpuCAReduceCuda,
        basic_ops.GpuContiguous,
        GpuElemwise,
        theano.printing.Print,
        PdbBreakpoint,
    )

    def _ops_to_check(op):
        # Anything that is not an Op is assumed to be an apply node, whose
        # op is taken from its `.op` attribute.
        actual_op = op if isinstance(op, theano.gof.Op) else op.op
        return not isinstance(actual_op, exempt_op_types)

    return check_stack_trace(thing, ops_to_check=_ops_to_check,
                             bug_print="ignore")
def
test_local_assert
():
def
test_local_assert
():
x
=
theano
.
tensor
.
fmatrix
()
x
=
theano
.
tensor
.
fmatrix
()
a
=
theano
.
tensor
.
opt
.
assert_op
(
x
,
theano
.
tensor
.
eq
(
x
,
0
)
.
any
())
a
=
theano
.
tensor
.
opt
.
assert_op
(
x
,
theano
.
tensor
.
eq
(
x
,
0
)
.
any
())
...
@@ -70,6 +93,8 @@ def test_local_gpu_contiguous_gpu_contiguous():
...
@@ -70,6 +93,8 @@ def test_local_gpu_contiguous_gpu_contiguous():
if
isinstance
(
node
.
op
,
basic_ops
.
GpuContiguous
)])
if
isinstance
(
node
.
op
,
basic_ops
.
GpuContiguous
)])
assert
1
==
len
([
node
for
node
in
f2
.
maker
.
fgraph
.
toposort
()
assert
1
==
len
([
node
for
node
in
f2
.
maker
.
fgraph
.
toposort
()
if
isinstance
(
node
.
op
,
basic_ops
.
GpuContiguous
)])
if
isinstance
(
node
.
op
,
basic_ops
.
GpuContiguous
)])
assert
_check_stack_trace
(
f1
)
assert
_check_stack_trace
(
f2
)
def
test_local_gpu_contiguous
():
def
test_local_gpu_contiguous
():
...
@@ -79,6 +104,7 @@ def test_local_gpu_contiguous():
...
@@ -79,6 +104,7 @@ def test_local_gpu_contiguous():
assert
1
==
len
([
node
for
node
in
f
.
maker
.
fgraph
.
toposort
()
assert
1
==
len
([
node
for
node
in
f
.
maker
.
fgraph
.
toposort
()
if
isinstance
(
node
.
op
,
basic_ops
.
GpuContiguous
)])
if
isinstance
(
node
.
op
,
basic_ops
.
GpuContiguous
)])
f
([[
2.
]])
f
([[
2.
]])
assert
_check_stack_trace
(
f
)
def
test_flatten
():
def
test_flatten
():
...
@@ -96,6 +122,7 @@ def test_flatten():
...
@@ -96,6 +122,7 @@ def test_flatten():
assert
res
.
shape
==
val
.
flatten
()
.
shape
assert
res
.
shape
==
val
.
flatten
()
.
shape
assert
GpuReshape
in
[
type
(
node
.
op
)
assert
GpuReshape
in
[
type
(
node
.
op
)
for
node
in
f
.
maker
.
fgraph
.
toposort
()]
for
node
in
f
.
maker
.
fgraph
.
toposort
()]
assert
_check_stack_trace
(
f
)
f
=
theano
.
function
([
m
],
m
.
flatten
(
ndim
=
2
),
f
=
theano
.
function
([
m
],
m
.
flatten
(
ndim
=
2
),
mode
=
mode_with_gpu
.
excluding
(
"local_useless_reshape"
))
mode
=
mode_with_gpu
.
excluding
(
"local_useless_reshape"
))
...
@@ -105,6 +132,7 @@ def test_flatten():
...
@@ -105,6 +132,7 @@ def test_flatten():
assert
res
.
shape
==
val
.
shape
assert
res
.
shape
==
val
.
shape
assert
GpuReshape
in
[
type
(
node
.
op
)
assert
GpuReshape
in
[
type
(
node
.
op
)
for
node
in
f
.
maker
.
fgraph
.
toposort
()]
for
node
in
f
.
maker
.
fgraph
.
toposort
()]
assert
_check_stack_trace
(
f
)
m
=
theano
.
tensor
.
tensor3
()
m
=
theano
.
tensor
.
tensor3
()
f
=
theano
.
function
([
m
],
m
.
flatten
(
ndim
=
2
),
mode
=
mode_with_gpu
)
f
=
theano
.
function
([
m
],
m
.
flatten
(
ndim
=
2
),
mode
=
mode_with_gpu
)
...
@@ -114,6 +142,7 @@ def test_flatten():
...
@@ -114,6 +142,7 @@ def test_flatten():
assert
res
.
shape
==
val
.
reshape
(
10
,
-
1
)
.
shape
assert
res
.
shape
==
val
.
reshape
(
10
,
-
1
)
.
shape
assert
GpuReshape
in
[
type
(
node
.
op
)
assert
GpuReshape
in
[
type
(
node
.
op
)
for
node
in
f
.
maker
.
fgraph
.
toposort
()]
for
node
in
f
.
maker
.
fgraph
.
toposort
()]
assert
_check_stack_trace
(
f
)
def
test_reduce
():
def
test_reduce
():
...
@@ -126,6 +155,9 @@ def test_reduce():
...
@@ -126,6 +155,9 @@ def test_reduce():
f
=
theano
.
function
([
m
],
getattr
(
m
,
method
)(
axis
=
0
,
f
=
theano
.
function
([
m
],
getattr
(
m
,
method
)(
axis
=
0
,
**
param
),
**
param
),
mode
=
mode_with_gpu
)
mode
=
mode_with_gpu
)
# assert _check_stack_trace(f) this op is ok but since
# it is using GpuCAReduceCuda that has an empty stack
# trace, this assertion gives error.
val
=
np
.
random
.
rand
(
10
,
11
)
.
astype
(
"float32"
)
val
=
np
.
random
.
rand
(
10
,
11
)
.
astype
(
"float32"
)
res
=
f
(
val
)
res
=
f
(
val
)
utt
.
assert_allclose
(
res
,
getattr
(
val
,
method
)(
axis
=
0
))
utt
.
assert_allclose
(
res
,
getattr
(
val
,
method
)(
axis
=
0
))
...
@@ -157,6 +189,7 @@ def test_local_gpualloc_memset_0():
...
@@ -157,6 +189,7 @@ def test_local_gpualloc_memset_0():
assert
len
(
topo
)
==
1
assert
len
(
topo
)
==
1
assert
isinstance
(
topo
[
0
]
.
op
,
theano
.
tensor
.
Alloc
)
assert
isinstance
(
topo
[
0
]
.
op
,
theano
.
tensor
.
Alloc
)
assert
(
np
.
asarray
(
f
(
6
))
==
0
)
.
all
()
assert
(
np
.
asarray
(
f
(
6
))
==
0
)
.
all
()
assert
_check_stack_trace
(
f
)
# Test with 0 from CPU op.
# Test with 0 from CPU op.
# Should be transfered as it is used by another op.
# Should be transfered as it is used by another op.
...
@@ -166,6 +199,7 @@ def test_local_gpualloc_memset_0():
...
@@ -166,6 +199,7 @@ def test_local_gpualloc_memset_0():
assert
len
(
topo
)
==
3
assert
len
(
topo
)
==
3
assert
isinstance
(
topo
[
0
]
.
op
,
GpuAlloc
)
assert
isinstance
(
topo
[
0
]
.
op
,
GpuAlloc
)
assert
(
np
.
asarray
(
f
(
6
))
==
0
)
.
all
()
assert
(
np
.
asarray
(
f
(
6
))
==
0
)
.
all
()
assert
_check_stack_trace
(
f
)
# Test with 0
# Test with 0
a
=
GpuAlloc
(
test_ctx_name
)(
z
,
i
)
a
=
GpuAlloc
(
test_ctx_name
)(
z
,
i
)
...
@@ -174,6 +208,7 @@ def test_local_gpualloc_memset_0():
...
@@ -174,6 +208,7 @@ def test_local_gpualloc_memset_0():
assert
len
(
topo
)
==
1
assert
len
(
topo
)
==
1
assert
isinstance
(
topo
[
0
]
.
op
,
GpuAlloc
)
and
topo
[
0
]
.
op
.
memset_0
assert
isinstance
(
topo
[
0
]
.
op
,
GpuAlloc
)
and
topo
[
0
]
.
op
.
memset_0
assert
(
np
.
asarray
(
f
(
6
))
==
0
)
.
all
()
assert
(
np
.
asarray
(
f
(
6
))
==
0
)
.
all
()
assert
_check_stack_trace
(
f
)
# Test with 1
# Test with 1
a
=
GpuAlloc
(
test_ctx_name
)(
o
,
i
)
a
=
GpuAlloc
(
test_ctx_name
)(
o
,
i
)
...
@@ -183,6 +218,7 @@ def test_local_gpualloc_memset_0():
...
@@ -183,6 +218,7 @@ def test_local_gpualloc_memset_0():
assert
isinstance
(
topo
[
0
]
.
op
,
GpuAlloc
)
assert
isinstance
(
topo
[
0
]
.
op
,
GpuAlloc
)
assert
not
topo
[
0
]
.
op
.
memset_0
assert
not
topo
[
0
]
.
op
.
memset_0
assert
(
np
.
asarray
(
f
(
6
))
==
1
)
.
all
()
assert
(
np
.
asarray
(
f
(
6
))
==
1
)
.
all
()
assert
_check_stack_trace
(
f
)
# Test with 1, 1
# Test with 1, 1
a
=
GpuAlloc
(
test_ctx_name
)(
ones
,
i
)
a
=
GpuAlloc
(
test_ctx_name
)(
ones
,
i
)
...
@@ -192,6 +228,7 @@ def test_local_gpualloc_memset_0():
...
@@ -192,6 +228,7 @@ def test_local_gpualloc_memset_0():
assert
isinstance
(
topo
[
0
]
.
op
,
GpuAlloc
)
assert
isinstance
(
topo
[
0
]
.
op
,
GpuAlloc
)
assert
not
topo
[
0
]
.
op
.
memset_0
assert
not
topo
[
0
]
.
op
.
memset_0
assert
(
np
.
asarray
(
f
(
2
))
==
1
)
.
all
()
assert
(
np
.
asarray
(
f
(
2
))
==
1
)
.
all
()
assert
_check_stack_trace
(
f
)
def
test_local_gpualloc_empty
():
def
test_local_gpualloc_empty
():
...
@@ -207,6 +244,7 @@ def test_local_gpualloc_empty():
...
@@ -207,6 +244,7 @@ def test_local_gpualloc_empty():
assert
isinstance
(
topo
[
0
]
.
op
,
theano
.
tensor
.
AllocEmpty
)
assert
isinstance
(
topo
[
0
]
.
op
,
theano
.
tensor
.
AllocEmpty
)
# This return not initilized data, so we can only check the shape
# This return not initilized data, so we can only check the shape
assert
f
(
3
)
.
shape
==
(
3
,)
assert
f
(
3
)
.
shape
==
(
3
,)
assert
_check_stack_trace
(
f
)
# Test with vector
# Test with vector
# Should be moved
# Should be moved
...
@@ -217,6 +255,7 @@ def test_local_gpualloc_empty():
...
@@ -217,6 +255,7 @@ def test_local_gpualloc_empty():
assert
isinstance
(
topo
[
0
]
.
op
,
GpuAllocEmpty
)
assert
isinstance
(
topo
[
0
]
.
op
,
GpuAllocEmpty
)
# This return not initilized data, so we can only check the shape
# This return not initilized data, so we can only check the shape
assert
f
(
3
)
.
shape
==
(
3
,)
assert
f
(
3
)
.
shape
==
(
3
,)
assert
_check_stack_trace
(
f
)
# Test with matrix
# Test with matrix
a
=
tensor
.
AllocEmpty
(
'float32'
)(
i
,
ii
)
a
=
tensor
.
AllocEmpty
(
'float32'
)(
i
,
ii
)
...
@@ -226,6 +265,7 @@ def test_local_gpualloc_empty():
...
@@ -226,6 +265,7 @@ def test_local_gpualloc_empty():
assert
isinstance
(
topo
[
0
]
.
op
,
GpuAllocEmpty
)
assert
isinstance
(
topo
[
0
]
.
op
,
GpuAllocEmpty
)
# This return not initilized data, so we can only check the shape
# This return not initilized data, so we can only check the shape
assert
f
(
3
,
4
)
.
shape
==
(
3
,
4
)
assert
f
(
3
,
4
)
.
shape
==
(
3
,
4
)
assert
_check_stack_trace
(
f
)
def
test_rebroadcast
():
def
test_rebroadcast
():
...
@@ -243,6 +283,7 @@ def test_rebroadcast():
...
@@ -243,6 +283,7 @@ def test_rebroadcast():
assert
isinstance
(
rebr
.
inputs
[
0
]
.
type
,
GpuArrayType
)
assert
isinstance
(
rebr
.
inputs
[
0
]
.
type
,
GpuArrayType
)
assert
isinstance
(
rebr
.
outputs
[
0
]
.
type
,
GpuArrayType
)
assert
isinstance
(
rebr
.
outputs
[
0
]
.
type
,
GpuArrayType
)
assert
_check_stack_trace
(
f
)
class
TestSpecifyShape
(
test_basic
.
TestSpecifyShape
):
class
TestSpecifyShape
(
test_basic
.
TestSpecifyShape
):
...
@@ -268,6 +309,7 @@ class test_gpu_ifelse(test_ifelse.test_ifelse):
...
@@ -268,6 +309,7 @@ class test_gpu_ifelse(test_ifelse.test_ifelse):
theano
.
ifelse
.
ifelse
(
cond
,
x
.
mean
(),
x
.
sum
()),
theano
.
ifelse
.
ifelse
(
cond
,
x
.
mean
(),
x
.
sum
()),
mode
=
mode_with_gpu
)
mode
=
mode_with_gpu
)
assert
f
(
np
.
float32
([
1
,
2
,
3
]),
0
)
==
6
assert
f
(
np
.
float32
([
1
,
2
,
3
]),
0
)
==
6
assert
_check_stack_trace
(
f
)
x
=
tensor
.
vector
()
x
=
tensor
.
vector
()
cond
=
tensor
.
scalar
()
cond
=
tensor
.
scalar
()
...
@@ -275,6 +317,7 @@ class test_gpu_ifelse(test_ifelse.test_ifelse):
...
@@ -275,6 +317,7 @@ class test_gpu_ifelse(test_ifelse.test_ifelse):
theano
.
ifelse
.
ifelse
(
cond
,
x
.
mean
(),
x
.
sum
()),
theano
.
ifelse
.
ifelse
(
cond
,
x
.
mean
(),
x
.
sum
()),
mode
=
mode_with_gpu
)
mode
=
mode_with_gpu
)
assert
f
(
np
.
float32
([
1
,
2
,
3
]),
0
)
==
6
assert
f
(
np
.
float32
([
1
,
2
,
3
]),
0
)
==
6
assert
_check_stack_trace
(
f
)
def
test_lifter_with_shared_var
(
self
):
def
test_lifter_with_shared_var
(
self
):
x
=
tensor
.
lscalar
(
'x'
)
x
=
tensor
.
lscalar
(
'x'
)
...
@@ -297,6 +340,7 @@ def test_print_op():
...
@@ -297,6 +340,7 @@ def test_print_op():
assert
isinstance
(
topo
[
1
]
.
op
,
theano
.
printing
.
Print
)
assert
isinstance
(
topo
[
1
]
.
op
,
theano
.
printing
.
Print
)
assert
isinstance
(
topo
[
2
]
.
op
,
GpuElemwise
)
assert
isinstance
(
topo
[
2
]
.
op
,
GpuElemwise
)
assert
topo
[
3
]
.
op
==
host_from_gpu
assert
topo
[
3
]
.
op
==
host_from_gpu
assert
_check_stack_trace
(
f
)
f
(
np
.
random
.
random
((
5
,
5
))
.
astype
(
'float32'
))
f
(
np
.
random
.
random
((
5
,
5
))
.
astype
(
'float32'
))
...
@@ -317,6 +361,7 @@ def test_pdbbreakpoint_op():
...
@@ -317,6 +361,7 @@ def test_pdbbreakpoint_op():
topo
=
f
.
maker
.
fgraph
.
toposort
()
topo
=
f
.
maker
.
fgraph
.
toposort
()
assert
isinstance
(
topo
[
-
2
]
.
op
,
GpuElemwise
)
assert
isinstance
(
topo
[
-
2
]
.
op
,
GpuElemwise
)
assert
topo
[
-
1
]
.
op
==
host_from_gpu
assert
topo
[
-
1
]
.
op
==
host_from_gpu
assert
_check_stack_trace
(
f
)
def
test_local_gpu_elemwise_careduce
():
def
test_local_gpu_elemwise_careduce
():
...
@@ -326,6 +371,7 @@ def test_local_gpu_elemwise_careduce():
...
@@ -326,6 +371,7 @@ def test_local_gpu_elemwise_careduce():
topo
=
f
.
maker
.
fgraph
.
toposort
()
topo
=
f
.
maker
.
fgraph
.
toposort
()
assert
len
(
topo
)
==
3
assert
len
(
topo
)
==
3
assert
topo
[
1
]
.
op
.
pre_scalar_op
==
theano
.
scalar
.
sqr
assert
topo
[
1
]
.
op
.
pre_scalar_op
==
theano
.
scalar
.
sqr
assert
_check_stack_trace
(
f
)
data
=
np
.
random
.
rand
(
3
,
4
)
.
astype
(
theano
.
config
.
floatX
)
data
=
np
.
random
.
rand
(
3
,
4
)
.
astype
(
theano
.
config
.
floatX
)
utt
.
assert_allclose
(
f
(
data
),
(
data
*
data
)
.
sum
())
utt
.
assert_allclose
(
f
(
data
),
(
data
*
data
)
.
sum
())
...
@@ -334,6 +380,7 @@ def test_local_gpu_elemwise_careduce():
...
@@ -334,6 +380,7 @@ def test_local_gpu_elemwise_careduce():
topo
=
f
.
maker
.
fgraph
.
toposort
()
topo
=
f
.
maker
.
fgraph
.
toposort
()
assert
len
(
topo
)
==
3
assert
len
(
topo
)
==
3
assert
topo
[
1
]
.
op
.
pre_scalar_op
==
theano
.
scalar
.
sqr
assert
topo
[
1
]
.
op
.
pre_scalar_op
==
theano
.
scalar
.
sqr
assert
_check_stack_trace
(
f
)
utt
.
assert_allclose
(
f
(
data
),
(
data
*
data
)
.
sum
(
axis
=
1
))
utt
.
assert_allclose
(
f
(
data
),
(
data
*
data
)
.
sum
(
axis
=
1
))
...
@@ -352,6 +399,7 @@ def test_local_lift_dot22scalar():
...
@@ -352,6 +399,7 @@ def test_local_lift_dot22scalar():
y_val
=
np
.
random
.
random
((
3
,
4
))
.
astype
(
theano
.
config
.
floatX
)
y_val
=
np
.
random
.
random
((
3
,
4
))
.
astype
(
theano
.
config
.
floatX
)
a_val
=
0.5
a_val
=
0.5
utt
.
assert_allclose
(
f_cpu
(
x_val
,
y_val
,
a_val
),
f_gpu
(
x_val
,
y_val
,
a_val
))
utt
.
assert_allclose
(
f_cpu
(
x_val
,
y_val
,
a_val
),
f_gpu
(
x_val
,
y_val
,
a_val
))
assert
_check_stack_trace
(
f_gpu
)
def
test_local_gpu_subtensor
():
def
test_local_gpu_subtensor
():
...
@@ -361,6 +409,7 @@ def test_local_gpu_subtensor():
...
@@ -361,6 +409,7 @@ def test_local_gpu_subtensor():
topo
=
f
.
maker
.
fgraph
.
toposort
()
topo
=
f
.
maker
.
fgraph
.
toposort
()
assert
any
([
type
(
node
.
op
)
is
tensor
.
Subtensor
for
node
in
topo
])
assert
any
([
type
(
node
.
op
)
is
tensor
.
Subtensor
for
node
in
topo
])
assert
not
any
([
isinstance
(
node
.
op
,
GpuSubtensor
)
for
node
in
topo
])
assert
not
any
([
isinstance
(
node
.
op
,
GpuSubtensor
)
for
node
in
topo
])
assert
_check_stack_trace
(
f
)
# Test graph input.
# Test graph input.
t
=
tensor
.
fmatrix
()
t
=
tensor
.
fmatrix
()
...
@@ -368,6 +417,7 @@ def test_local_gpu_subtensor():
...
@@ -368,6 +417,7 @@ def test_local_gpu_subtensor():
topo
=
f
.
maker
.
fgraph
.
toposort
()
topo
=
f
.
maker
.
fgraph
.
toposort
()
assert
any
([
type
(
node
.
op
)
is
tensor
.
Subtensor
for
node
in
topo
])
assert
any
([
type
(
node
.
op
)
is
tensor
.
Subtensor
for
node
in
topo
])
assert
not
any
([
isinstance
(
node
.
op
,
GpuSubtensor
)
for
node
in
topo
])
assert
not
any
([
isinstance
(
node
.
op
,
GpuSubtensor
)
for
node
in
topo
])
assert
_check_stack_trace
(
f
)
# Test multiple use of the input
# Test multiple use of the input
# We want the subtensor to be on the GPU to prevent multiple transfer.
# We want the subtensor to be on the GPU to prevent multiple transfer.
...
@@ -376,6 +426,7 @@ def test_local_gpu_subtensor():
...
@@ -376,6 +426,7 @@ def test_local_gpu_subtensor():
topo
=
f
.
maker
.
fgraph
.
toposort
()
topo
=
f
.
maker
.
fgraph
.
toposort
()
assert
not
any
([
type
(
node
.
op
)
is
tensor
.
Subtensor
for
node
in
topo
])
assert
not
any
([
type
(
node
.
op
)
is
tensor
.
Subtensor
for
node
in
topo
])
assert
any
([
isinstance
(
node
.
op
,
GpuSubtensor
)
for
node
in
topo
])
assert
any
([
isinstance
(
node
.
op
,
GpuSubtensor
)
for
node
in
topo
])
assert
_check_stack_trace
(
f
)
# Test multiple use of the input + input as output
# Test multiple use of the input + input as output
# We want the subtensor to be on the GPU to prevent multiple transfer.
# We want the subtensor to be on the GPU to prevent multiple transfer.
...
@@ -384,6 +435,7 @@ def test_local_gpu_subtensor():
...
@@ -384,6 +435,7 @@ def test_local_gpu_subtensor():
topo
=
f
.
maker
.
fgraph
.
toposort
()
topo
=
f
.
maker
.
fgraph
.
toposort
()
assert
not
any
([
type
(
node
.
op
)
is
tensor
.
Subtensor
for
node
in
topo
])
assert
not
any
([
type
(
node
.
op
)
is
tensor
.
Subtensor
for
node
in
topo
])
assert
any
([
isinstance
(
node
.
op
,
GpuSubtensor
)
for
node
in
topo
])
assert
any
([
isinstance
(
node
.
op
,
GpuSubtensor
)
for
node
in
topo
])
assert
_check_stack_trace
(
f
)
# Test shared forced on CPU end we do computation on the output of
# Test shared forced on CPU end we do computation on the output of
# the subtensor.
# the subtensor.
...
@@ -396,6 +448,7 @@ def test_local_gpu_subtensor():
...
@@ -396,6 +448,7 @@ def test_local_gpu_subtensor():
# If it where just a little bit smarter, it could wrongly move it to the GPU.
# If it where just a little bit smarter, it could wrongly move it to the GPU.
# If it where super smart, it would know it should not move it to the GPU.
# If it where super smart, it would know it should not move it to the GPU.
assert
any
([
isinstance
(
node
.
op
,
tensor
.
Elemwise
)
for
node
in
topo
])
assert
any
([
isinstance
(
node
.
op
,
tensor
.
Elemwise
)
for
node
in
topo
])
assert
_check_stack_trace
(
f
)
def
test_local_gpu_elemwise
():
def
test_local_gpu_elemwise
():
...
@@ -417,6 +470,7 @@ def test_local_gpu_elemwise():
...
@@ -417,6 +470,7 @@ def test_local_gpu_elemwise():
assert
sum
(
isinstance
(
node
.
op
,
GpuElemwise
)
for
node
in
topo
)
==
1
assert
sum
(
isinstance
(
node
.
op
,
GpuElemwise
)
for
node
in
topo
)
==
1
assert
sum
(
type
(
node
.
op
)
==
tensor
.
Elemwise
for
node
in
topo
)
==
0
assert
sum
(
type
(
node
.
op
)
==
tensor
.
Elemwise
for
node
in
topo
)
==
0
utt
.
assert_allclose
(
f
(
a_v
,
b_v
,
c_v
),
a_v
+
b_v
+
c_v
)
utt
.
assert_allclose
(
f
(
a_v
,
b_v
,
c_v
),
a_v
+
b_v
+
c_v
)
assert
_check_stack_trace
(
f
)
# Now test with the composite already on the cpu before we move it
# Now test with the composite already on the cpu before we move it
# to the gpu
# to the gpu
...
@@ -430,6 +484,7 @@ def test_local_gpu_elemwise():
...
@@ -430,6 +484,7 @@ def test_local_gpu_elemwise():
assert
sum
(
isinstance
(
node
.
op
,
GpuElemwise
)
for
node
in
topo
)
==
1
assert
sum
(
isinstance
(
node
.
op
,
GpuElemwise
)
for
node
in
topo
)
==
1
assert
sum
(
type
(
node
.
op
)
==
tensor
.
Elemwise
for
node
in
topo
)
==
0
assert
sum
(
type
(
node
.
op
)
==
tensor
.
Elemwise
for
node
in
topo
)
==
0
utt
.
assert_allclose
(
f
(
a_v
,
b_v
,
c_v
),
a_v
+
b_v
+
c_v
)
utt
.
assert_allclose
(
f
(
a_v
,
b_v
,
c_v
),
a_v
+
b_v
+
c_v
)
assert
_check_stack_trace
(
f
)
return
# Not yet implemeted
return
# Not yet implemeted
# Test multiple output
# Test multiple output
...
@@ -447,6 +502,7 @@ def test_local_gpu_elemwise():
...
@@ -447,6 +502,7 @@ def test_local_gpu_elemwise():
utt
.
assert_allclose
(
out
[
0
],
a_v
)
utt
.
assert_allclose
(
out
[
0
],
a_v
)
utt
.
assert_allclose
(
out
[
1
],
c_v
)
utt
.
assert_allclose
(
out
[
1
],
c_v
)
utt
.
assert_allclose
(
out
[
2
],
b_v
)
utt
.
assert_allclose
(
out
[
2
],
b_v
)
assert
_check_stack_trace
(
f
)
# Test multiple output
# Test multiple output
out_s
=
theano
.
scalar
.
Composite
([
a_s
,
b_s
,
c_s
],
[
a_s
+
b_s
,
a_s
*
b_s
])
out_s
=
theano
.
scalar
.
Composite
([
a_s
,
b_s
,
c_s
],
[
a_s
+
b_s
,
a_s
*
b_s
])
...
@@ -458,6 +514,7 @@ def test_local_gpu_elemwise():
...
@@ -458,6 +514,7 @@ def test_local_gpu_elemwise():
out
=
f
(
a_v
,
b_v
,
c_v
)
out
=
f
(
a_v
,
b_v
,
c_v
)
utt
.
assert_allclose
(
out
[
0
],
a_v
+
b_v
)
utt
.
assert_allclose
(
out
[
0
],
a_v
+
b_v
)
utt
.
assert_allclose
(
out
[
1
],
a_v
*
c_v
)
utt
.
assert_allclose
(
out
[
1
],
a_v
*
c_v
)
assert
_check_stack_trace
(
f
)
# Test non-contiguous input
# Test non-contiguous input
c
=
gpuarray_shared_constructor
(
np
.
asarray
(
c_v
,
dtype
=
'float32'
))
c
=
gpuarray_shared_constructor
(
np
.
asarray
(
c_v
,
dtype
=
'float32'
))
...
@@ -466,6 +523,7 @@ def test_local_gpu_elemwise():
...
@@ -466,6 +523,7 @@ def test_local_gpu_elemwise():
out
=
f
(
a_v
,
b_v
)
out
=
f
(
a_v
,
b_v
)
utt
.
assert_allclose
(
out
[
0
],
a_v
[::
2
]
+
b_v
[::
2
])
utt
.
assert_allclose
(
out
[
0
],
a_v
[::
2
]
+
b_v
[::
2
])
utt
.
assert_allclose
(
out
[
1
],
a_v
[::
2
]
*
c_v
[::
2
])
utt
.
assert_allclose
(
out
[
1
],
a_v
[::
2
]
*
c_v
[::
2
])
assert
_check_stack_trace
(
f
)
def
test_many_arg_elemwise
():
def
test_many_arg_elemwise
():
...
@@ -541,7 +599,8 @@ def test_local_lift_abstractconv_gpu_shape():
...
@@ -541,7 +599,8 @@ def test_local_lift_abstractconv_gpu_shape():
a
=
tensor
.
ftensor4
()
a
=
tensor
.
ftensor4
()
b
=
tensor
.
ftensor4
()
b
=
tensor
.
ftensor4
()
c
=
tensor
.
nnet
.
abstract_conv
.
AbstractConv2d_gradWeights
()(
a
,
b
,
s
)
c
=
tensor
.
nnet
.
abstract_conv
.
AbstractConv2d_gradWeights
()(
a
,
b
,
s
)
theano
.
function
([
s
,
a
,
b
],
c
,
mode
=
mode_with_gpu
)
f
=
theano
.
function
([
s
,
a
,
b
],
c
,
mode
=
mode_with_gpu
)
assert
_check_stack_trace
(
f
)
finally
:
finally
:
theano
.
config
.
on_opt_error
=
prev
theano
.
config
.
on_opt_error
=
prev
...
@@ -571,7 +630,8 @@ def test_local_assert_no_cpu_op():
...
@@ -571,7 +630,8 @@ def test_local_assert_no_cpu_op():
# If the flag is ignore
# If the flag is ignore
try
:
try
:
theano
.
config
.
assert_no_cpu_op
=
'ignore'
theano
.
config
.
assert_no_cpu_op
=
'ignore'
theano
.
function
([],
out
,
mode
=
mode_local_assert
)
f
=
theano
.
function
([],
out
,
mode
=
mode_local_assert
)
assert
_check_stack_trace
(
f
)
finally
:
finally
:
theano
.
config
.
assert_no_cpu_op
=
old
theano
.
config
.
assert_no_cpu_op
=
old
...
@@ -581,8 +641,9 @@ def test_no_complex():
...
@@ -581,8 +641,9 @@ def test_no_complex():
freq_var
=
tensor
.
fscalar
()
freq_var
=
tensor
.
fscalar
()
signal_var
=
tensor
.
fscalar
()
signal_var
=
tensor
.
fscalar
()
stft_out
=
tensor
.
exp
(
width_var
*
freq_var
)
*
signal_var
stft_out
=
tensor
.
exp
(
width_var
*
freq_var
)
*
signal_var
theano
.
function
([
width_var
,
freq_var
,
signal_var
],
stft_out
,
f
=
theano
.
function
([
width_var
,
freq_var
,
signal_var
],
stft_out
,
mode
=
mode_with_gpu
)
mode
=
mode_with_gpu
)
assert
_check_stack_trace
(
f
)
@utt.assertFailure_fast
@utt.assertFailure_fast
...
@@ -601,6 +662,7 @@ def test_local_lift_solve():
...
@@ -601,6 +662,7 @@ def test_local_lift_solve():
A_val
=
np
.
random
.
uniform
(
-
0.4
,
0.4
,
(
5
,
5
))
.
astype
(
"float32"
)
A_val
=
np
.
random
.
uniform
(
-
0.4
,
0.4
,
(
5
,
5
))
.
astype
(
"float32"
)
b_val
=
np
.
random
.
uniform
(
-
0.4
,
0.4
,
(
5
,
3
))
.
astype
(
"float32"
)
b_val
=
np
.
random
.
uniform
(
-
0.4
,
0.4
,
(
5
,
3
))
.
astype
(
"float32"
)
utt
.
assert_allclose
(
f_cpu
(
A_val
,
b_val
),
f_gpu
(
A_val
,
b_val
))
utt
.
assert_allclose
(
f_cpu
(
A_val
,
b_val
),
f_gpu
(
A_val
,
b_val
))
assert
_check_stack_trace
(
f_gpu
)
def
test_gpu_solve_not_inplace
():
def
test_gpu_solve_not_inplace
():
...
@@ -665,7 +727,8 @@ def test_local_gpua_advanced_incsubtensor():
...
@@ -665,7 +727,8 @@ def test_local_gpua_advanced_incsubtensor():
w
=
tensor
.
ones_like
(
y
)
w
=
tensor
.
ones_like
(
y
)
w
=
tensor
.
set_subtensor
(
w
[
tensor
.
eq
(
y
,
1.0
)
.
nonzero
()],
100
)
w
=
tensor
.
set_subtensor
(
w
[
tensor
.
eq
(
y
,
1.0
)
.
nonzero
()],
100
)
w
=
tensor
.
set_subtensor
(
w
[
tensor
.
eq
(
y
,
-
1.0
)
.
nonzero
()],
0
)
w
=
tensor
.
set_subtensor
(
w
[
tensor
.
eq
(
y
,
-
1.0
)
.
nonzero
()],
0
)
theano
.
function
([
target
],
w
)
f
=
theano
.
function
([
target
],
w
)
assert
_check_stack_trace
(
f
)
def
test_batched_dot_lifter
():
def
test_batched_dot_lifter
():
...
@@ -690,6 +753,7 @@ def test_batched_dot_lifter():
...
@@ -690,6 +753,7 @@ def test_batched_dot_lifter():
z
=
tensor
.
batched_dot
(
x
,
y
)
z
=
tensor
.
batched_dot
(
x
,
y
)
f
=
theano
.
function
([
x
,
y
],
z
,
mode
=
mode_with_gpu
)
f
=
theano
.
function
([
x
,
y
],
z
,
mode
=
mode_with_gpu
)
f
(
x_val
,
y_val
)
f
(
x_val
,
y_val
)
assert
check_stack_trace
(
f
,
ops_to_check
=
'all'
)
def
test_crossentropycategorical1hot_lifter
():
def
test_crossentropycategorical1hot_lifter
():
...
...
theano/tensor/blas.py
浏览文件 @
7a6d676f
...
@@ -146,6 +146,7 @@ from theano.gof import (utils, Op, view_roots,
...
@@ -146,6 +146,7 @@ from theano.gof import (utils, Op, view_roots,
EquilibriumOptimizer
,
Apply
,
EquilibriumOptimizer
,
Apply
,
ReplacementDidntRemovedError
)
ReplacementDidntRemovedError
)
from
theano.gof.params_type
import
ParamsType
from
theano.gof.params_type
import
ParamsType
from
theano.gof.opt
import
inherit_stack_trace
from
theano.printing
import
pprint
,
FunctionPrinter
,
debugprint
from
theano.printing
import
pprint
,
FunctionPrinter
,
debugprint
from
theano.compile.mode
import
optdb
from
theano.compile.mode
import
optdb
import
theano.scalar
import
theano.scalar
...
@@ -1625,19 +1626,16 @@ def local_dot_to_dot22(node):
...
@@ -1625,19 +1626,16 @@ def local_dot_to_dot22(node):
return
return
if
y
.
type
.
dtype
in
[
'float16'
,
'float32'
,
'float64'
,
'complex64'
,
'complex128'
]:
if
y
.
type
.
dtype
in
[
'float16'
,
'float32'
,
'float64'
,
'complex64'
,
'complex128'
]:
if
x
.
ndim
==
2
and
y
.
ndim
==
2
:
with
inherit_stack_trace
(
node
.
outputs
):
# print "local_dot_to_dot22: MM"
if
x
.
ndim
==
2
and
y
.
ndim
==
2
:
return
[
_dot22
(
*
node
.
inputs
)]
return
[
_dot22
(
*
node
.
inputs
)]
if
x
.
ndim
==
2
and
y
.
ndim
==
1
:
if
x
.
ndim
==
2
and
y
.
ndim
==
1
:
# print "local_dot_to_dot22: MV"
return
[
_dot22
(
x
,
y
.
dimshuffle
(
0
,
'x'
))
.
dimshuffle
(
0
)]
return
[
_dot22
(
x
,
y
.
dimshuffle
(
0
,
'x'
))
.
dimshuffle
(
0
)]
if
x
.
ndim
==
1
and
y
.
ndim
==
2
:
if
x
.
ndim
==
1
and
y
.
ndim
==
2
:
return
[
_dot22
(
x
.
dimshuffle
(
'x'
,
0
),
y
)
.
dimshuffle
(
1
)]
# print "local_dot_to_dot22: VM"
if
x
.
ndim
==
1
and
y
.
ndim
==
1
:
return
[
_dot22
(
x
.
dimshuffle
(
'x'
,
0
),
y
)
.
dimshuffle
(
1
)]
return
[
_dot22
(
x
.
dimshuffle
(
'x'
,
0
),
if
x
.
ndim
==
1
and
y
.
ndim
==
1
:
y
.
dimshuffle
(
0
,
'x'
))
.
dimshuffle
()]
# print "local_dot_to_dot22: VV"
return
[
_dot22
(
x
.
dimshuffle
(
'x'
,
0
),
y
.
dimshuffle
(
0
,
'x'
))
.
dimshuffle
()]
_logger
.
info
(
'Not optimizing dot with inputs
%
s
%
s
%
s
%
s'
,
_logger
.
info
(
'Not optimizing dot with inputs
%
s
%
s
%
s
%
s'
,
x
,
y
,
x
.
type
,
y
.
type
)
x
,
y
,
x
.
type
,
y
.
type
)
...
@@ -1646,19 +1644,22 @@ def local_dot_to_dot22(node):
...
@@ -1646,19 +1644,22 @@ def local_dot_to_dot22(node):
@local_optimizer
([
gemm_no_inplace
],
inplace
=
True
)
@local_optimizer
([
gemm_no_inplace
],
inplace
=
True
)
def
local_inplace_gemm
(
node
):
def
local_inplace_gemm
(
node
):
if
node
.
op
==
gemm_no_inplace
:
if
node
.
op
==
gemm_no_inplace
:
return
[
gemm_inplace
(
*
node
.
inputs
)]
with
inherit_stack_trace
(
node
.
outputs
):
return
[
gemm_inplace
(
*
node
.
inputs
)]
@local_optimizer
([
gemv_no_inplace
],
inplace
=
True
)
@local_optimizer
([
gemv_no_inplace
],
inplace
=
True
)
def
local_inplace_gemv
(
node
):
def
local_inplace_gemv
(
node
):
if
node
.
op
==
gemv_no_inplace
:
if
node
.
op
==
gemv_no_inplace
:
return
[
gemv_inplace
(
*
node
.
inputs
)]
with
inherit_stack_trace
(
node
.
outputs
):
return
[
gemv_inplace
(
*
node
.
inputs
)]
@local_optimizer
([
ger
],
inplace
=
True
)
@local_optimizer
([
ger
],
inplace
=
True
)
def
local_inplace_ger
(
node
):
def
local_inplace_ger
(
node
):
if
node
.
op
==
ger
:
if
node
.
op
==
ger
:
return
[
ger_destructive
(
*
node
.
inputs
)]
with
inherit_stack_trace
(
node
.
outputs
):
return
[
ger_destructive
(
*
node
.
inputs
)]
@local_optimizer
([
gemm_no_inplace
])
@local_optimizer
([
gemm_no_inplace
])
...
@@ -1666,12 +1667,13 @@ def local_gemm_to_gemv(node):
...
@@ -1666,12 +1667,13 @@ def local_gemm_to_gemv(node):
"""GEMM acting on row or column matrices -> GEMV."""
"""GEMM acting on row or column matrices -> GEMV."""
if
node
.
op
==
gemm_no_inplace
:
if
node
.
op
==
gemm_no_inplace
:
z
,
a
,
x
,
y
,
b
=
node
.
inputs
z
,
a
,
x
,
y
,
b
=
node
.
inputs
if
z
.
broadcastable
==
x
.
broadcastable
==
(
True
,
False
):
with
inherit_stack_trace
(
node
.
outputs
):
r
=
gemv_no_inplace
(
z
.
dimshuffle
(
1
),
a
,
y
.
T
,
x
.
dimshuffle
(
1
),
b
)
if
z
.
broadcastable
==
x
.
broadcastable
==
(
True
,
False
):
return
[
r
.
dimshuffle
(
'x'
,
0
)]
r
=
gemv_no_inplace
(
z
.
dimshuffle
(
1
),
a
,
y
.
T
,
x
.
dimshuffle
(
1
),
b
)
if
z
.
broadcastable
==
y
.
broadcastable
==
(
False
,
True
):
return
[
r
.
dimshuffle
(
'x'
,
0
)]
r
=
gemv_no_inplace
(
z
.
dimshuffle
(
0
),
a
,
x
,
y
.
dimshuffle
(
0
),
b
)
if
z
.
broadcastable
==
y
.
broadcastable
==
(
False
,
True
):
return
[
r
.
dimshuffle
(
0
,
'x'
)]
r
=
gemv_no_inplace
(
z
.
dimshuffle
(
0
),
a
,
x
,
y
.
dimshuffle
(
0
),
b
)
return
[
r
.
dimshuffle
(
0
,
'x'
)]
@local_optimizer
([
gemm_no_inplace
])
@local_optimizer
([
gemm_no_inplace
])
...
@@ -1680,26 +1682,27 @@ def local_gemm_to_ger(node):
...
@@ -1680,26 +1682,27 @@ def local_gemm_to_ger(node):
if
node
.
op
==
gemm_no_inplace
:
if
node
.
op
==
gemm_no_inplace
:
z
,
a
,
x
,
y
,
b
=
node
.
inputs
z
,
a
,
x
,
y
,
b
=
node
.
inputs
if
x
.
broadcastable
[
1
]
and
y
.
broadcastable
[
0
]:
if
x
.
broadcastable
[
1
]
and
y
.
broadcastable
[
0
]:
# x and y are both vectors so this might qualifies for a GER
with
inherit_stack_trace
(
node
.
outputs
):
xv
=
x
.
dimshuffle
(
0
)
# x and y are both vectors so this might qualifies for a GER
yv
=
y
.
dimshuffle
(
1
)
xv
=
x
.
dimshuffle
(
0
)
try
:
yv
=
y
.
dimshuffle
(
1
)
bval
=
T
.
get_scalar_constant_value
(
b
)
try
:
except
T
.
NotScalarConstantError
:
bval
=
T
.
get_scalar_constant_value
(
b
)
# b isn't a constant, GEMM is doing useful pre-scaling
except
T
.
NotScalarConstantError
:
return
# b isn't a constant, GEMM is doing useful pre-scaling
return
if
bval
==
1
:
# best case a natural GER
rval
=
ger
(
z
,
a
,
xv
,
yv
)
if
bval
==
1
:
# best case a natural GER
return
[
rval
]
rval
=
ger
(
z
,
a
,
xv
,
yv
)
elif
bval
==
0
:
# GER on zeros_like should be faster than GEMM
return
[
rval
]
zeros
=
T
.
zeros
([
x
.
shape
[
0
],
y
.
shape
[
1
]],
x
.
dtype
)
elif
bval
==
0
:
# GER on zeros_like should be faster than GEMM
rval
=
ger
(
zeros
,
a
,
xv
,
yv
)
zeros
=
T
.
zeros
([
x
.
shape
[
0
],
y
.
shape
[
1
]],
x
.
dtype
)
return
[
rval
]
rval
=
ger
(
zeros
,
a
,
xv
,
yv
)
else
:
return
[
rval
]
# if bval is another constant, then z is being usefully
else
:
# pre-scaled and GER isn't really the right tool for the job.
# if bval is another constant, then z is being usefully
return
# pre-scaled and GER isn't really the right tool for the job.
return
# TODO: delete this optimization when we have the proper dot->gemm->ger pipeline
# TODO: delete this optimization when we have the proper dot->gemm->ger pipeline
...
@@ -1708,37 +1711,38 @@ def local_gemm_to_ger(node):
...
@@ -1708,37 +1711,38 @@ def local_gemm_to_ger(node):
def
local_dot22_to_ger_or_gemv
(
node
):
def
local_dot22_to_ger_or_gemv
(
node
):
"""dot22 computing an outer-product -> GER."""
"""dot22 computing an outer-product -> GER."""
if
node
.
op
==
_dot22
:
if
node
.
op
==
_dot22
:
x
,
y
=
node
.
inputs
with
inherit_stack_trace
(
node
.
outputs
):
xb
=
x
.
broadcastable
x
,
y
=
node
.
inputs
yb
=
y
.
broadcastable
xb
=
x
.
broadcastable
one
=
T
.
as_tensor_variable
(
np
.
asarray
(
1
,
dtype
=
x
.
dtype
))
yb
=
y
.
broadcastable
zero
=
T
.
as_tensor_variable
(
np
.
asarray
(
0
,
dtype
=
x
.
dtype
))
one
=
T
.
as_tensor_variable
(
np
.
asarray
(
1
,
dtype
=
x
.
dtype
))
if
xb
[
1
]
and
yb
[
0
]:
zero
=
T
.
as_tensor_variable
(
np
.
asarray
(
0
,
dtype
=
x
.
dtype
))
# x and y are both vectors so this might qualifies for a GER
if
xb
[
1
]
and
yb
[
0
]:
xv
=
x
.
dimshuffle
(
0
)
# x and y are both vectors so this might qualifies for a GER
yv
=
y
.
dimshuffle
(
1
)
xv
=
x
.
dimshuffle
(
0
)
zeros
=
T
.
zeros
([
x
.
shape
[
0
],
y
.
shape
[
1
]],
dtype
=
x
.
dtype
)
yv
=
y
.
dimshuffle
(
1
)
rval
=
ger
(
zeros
,
one
,
xv
,
yv
)
zeros
=
T
.
zeros
([
x
.
shape
[
0
],
y
.
shape
[
1
]],
dtype
=
x
.
dtype
)
return
[
rval
]
rval
=
ger
(
zeros
,
one
,
xv
,
yv
)
if
xb
[
0
]
and
yb
[
1
]:
return
[
rval
]
# x and y are both vectors so this qualifies for a sdot / ddot
if
xb
[
0
]
and
yb
[
1
]:
# TODO: Theano doesn't have a sdot, but gemv is better than _dot22
# x and y are both vectors so this qualifies for a sdot / ddot
xv
=
x
.
dimshuffle
(
1
)
# TODO: Theano doesn't have a sdot, but gemv is better than _dot22
zeros
=
T
.
AllocEmpty
(
x
.
dtype
)(
1
)
xv
=
x
.
dimshuffle
(
1
)
rval
=
gemv_no_inplace
(
zeros
,
one
,
y
.
T
,
xv
,
zero
)
zeros
=
T
.
AllocEmpty
(
x
.
dtype
)(
1
)
return
[
rval
.
dimshuffle
(
'x'
,
0
)]
rval
=
gemv_no_inplace
(
zeros
,
one
,
y
.
T
,
xv
,
zero
)
if
xb
[
0
]
and
not
yb
[
0
]
and
not
yb
[
1
]:
return
[
rval
.
dimshuffle
(
'x'
,
0
)]
# x is vector, y is matrix so try gemv
if
xb
[
0
]
and
not
yb
[
0
]
and
not
yb
[
1
]:
xv
=
x
.
dimshuffle
(
1
)
# x is vector, y is matrix so try gemv
zeros
=
T
.
AllocEmpty
(
x
.
dtype
)(
y
.
shape
[
1
])
xv
=
x
.
dimshuffle
(
1
)
rval
=
gemv_no_inplace
(
zeros
,
one
,
y
.
T
,
xv
,
zero
)
zeros
=
T
.
AllocEmpty
(
x
.
dtype
)(
y
.
shape
[
1
])
return
[
rval
.
dimshuffle
(
'x'
,
0
)]
rval
=
gemv_no_inplace
(
zeros
,
one
,
y
.
T
,
xv
,
zero
)
if
not
xb
[
0
]
and
not
xb
[
1
]
and
yb
[
1
]:
return
[
rval
.
dimshuffle
(
'x'
,
0
)]
# x is matrix, y is vector, try gemv
if
not
xb
[
0
]
and
not
xb
[
1
]
and
yb
[
1
]:
yv
=
y
.
dimshuffle
(
0
)
# x is matrix, y is vector, try gemv
zeros
=
T
.
AllocEmpty
(
x
.
dtype
)(
x
.
shape
[
0
])
yv
=
y
.
dimshuffle
(
0
)
rval
=
gemv_no_inplace
(
zeros
,
one
,
x
,
yv
,
zero
)
zeros
=
T
.
AllocEmpty
(
x
.
dtype
)(
x
.
shape
[
0
])
return
[
rval
.
dimshuffle
(
0
,
'x'
)]
rval
=
gemv_no_inplace
(
zeros
,
one
,
x
,
yv
,
zero
)
return
[
rval
.
dimshuffle
(
0
,
'x'
)]
#################################
#################################
...
...
theano/tensor/opt_uncanonicalize.py
浏览文件 @
7a6d676f
...
@@ -43,6 +43,7 @@ from theano.tensor import DimShuffle, Subtensor
...
@@ -43,6 +43,7 @@ from theano.tensor import DimShuffle, Subtensor
from
theano.tensor.opt
import
register_uncanonicalize
from
theano.tensor.opt
import
register_uncanonicalize
from
theano
import
scalar
as
scal
from
theano
import
scalar
as
scal
from
theano.gof.opt
import
copy_stack_trace
_logger
=
logging
.
getLogger
(
'theano.tensor.opt'
)
_logger
=
logging
.
getLogger
(
'theano.tensor.opt'
)
...
@@ -57,10 +58,13 @@ def local_max_and_argmax(node):
...
@@ -57,10 +58,13 @@ def local_max_and_argmax(node):
axis
=
node
.
op
.
get_params
(
node
)
axis
=
node
.
op
.
get_params
(
node
)
if
len
(
node
.
outputs
[
1
]
.
clients
)
==
0
:
if
len
(
node
.
outputs
[
1
]
.
clients
)
==
0
:
new
=
CAReduce
(
scal
.
maximum
,
axis
)(
node
.
inputs
[
0
])
new
=
CAReduce
(
scal
.
maximum
,
axis
)(
node
.
inputs
[
0
])
copy_stack_trace
(
node
.
outputs
[
0
],
new
)
return
[
new
,
None
]
return
[
new
,
None
]
if
len
(
node
.
outputs
[
0
]
.
clients
)
==
0
:
if
len
(
node
.
outputs
[
0
]
.
clients
)
==
0
:
return
[
None
,
T
.
Argmax
(
axis
)(
node
.
inputs
[
0
])]
new
=
T
.
Argmax
(
axis
)(
node
.
inputs
[
0
])
copy_stack_trace
(
node
.
outputs
[
0
],
new
)
return
[
None
,
new
]
@register_uncanonicalize
@register_uncanonicalize
...
@@ -84,8 +88,8 @@ def local_max_to_min(node):
...
@@ -84,8 +88,8 @@ def local_max_to_min(node):
max
.
owner
.
op
.
scalar_op
==
scal
.
maximum
):
max
.
owner
.
op
.
scalar_op
==
scal
.
maximum
):
neg
=
max
.
owner
.
inputs
[
0
]
neg
=
max
.
owner
.
inputs
[
0
]
if
neg
.
owner
and
neg
.
owner
.
op
==
T
.
neg
:
if
neg
.
owner
and
neg
.
owner
.
op
==
T
.
neg
:
return
[
CAReduce
(
scal
.
minimum
,
new
=
CAReduce
(
scal
.
minimum
,
max
.
owner
.
op
.
axis
)(
neg
.
owner
.
inputs
[
0
])
max
.
owner
.
op
.
axis
)(
neg
.
owner
.
inputs
[
0
]
)]
return
[
copy_stack_trace
(
node
.
outputs
[
0
],
new
)]
return
False
return
False
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论