Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
907d1868
提交
907d1868
authored
5月 19, 2015
作者:
Frédéric Bastien
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #2901 from abergeron/f16_lstm
Work to make the DLT LSTM work with float16.
上级
e06617bd
6cee82db
全部展开
隐藏空白字符变更
内嵌
并排
正在显示
12 个修改的文件
包含
229 行增加
和
173 行删除
+229
-173
debugmode.py
theano/compile/debugmode.py
+13
-34
ops.py
theano/compile/ops.py
+1
-0
op.py
theano/gof/op.py
+74
-64
kernel_codegen.py
theano/sandbox/gpuarray/kernel_codegen.py
+37
-25
nnet.py
theano/sandbox/gpuarray/nnet.py
+0
-0
subtensor.py
theano/sandbox/gpuarray/subtensor.py
+33
-7
type.py
theano/sandbox/gpuarray/type.py
+13
-0
rng_mrg.py
theano/sandbox/rng_mrg.py
+25
-25
basic.py
theano/tensor/basic.py
+1
-0
sigm.py
theano/tensor/nnet/sigm.py
+26
-15
opt.py
theano/tensor/opt.py
+3
-2
subtensor.py
theano/tensor/subtensor.py
+3
-1
没有找到文件。
theano/compile/debugmode.py
浏览文件 @
907d1868
...
@@ -1685,16 +1685,18 @@ class _Linker(gof.link.LocalLinker):
...
@@ -1685,16 +1685,18 @@ class _Linker(gof.link.LocalLinker):
node_input_storage
=
[
storage_map
[
r
]
for
r
in
node
.
inputs
]
node_input_storage
=
[
storage_map
[
r
]
for
r
in
node
.
inputs
]
node_output_storage
=
[
storage_map
[
r
]
for
r
in
node
.
outputs
]
node_output_storage
=
[
storage_map
[
r
]
for
r
in
node
.
outputs
]
compute_map
=
{}
for
k
in
node
.
inputs
:
compute_map
[
k
]
=
[
True
]
for
k
in
node
.
outputs
:
compute_map
[
k
]
=
[
False
]
# Some Ops define a make_thunk with the expectation that
# Some Ops define a make_thunk with the expectation that
# it will be called before the C code is compiled, because
# it will be called before the C code is compiled, because
# the compilation of some dependency is triggered there.
# the compilation of some dependency is triggered there.
thunk_other
=
None
thunk_other
=
None
if
get_unbound_function
(
node
.
op
.
make_thunk
)
not
in
default_make_thunk
:
if
get_unbound_function
(
node
.
op
.
make_thunk
)
not
in
default_make_thunk
:
compute_map
=
{}
for
k
in
node
.
inputs
:
compute_map
[
k
]
=
[
True
]
for
k
in
node
.
outputs
:
compute_map
[
k
]
=
[
False
]
thunk
=
node
.
op
.
make_thunk
(
node
,
thunk
=
node
.
op
.
make_thunk
(
node
,
storage_map
,
storage_map
,
compute_map
,
compute_map
,
...
@@ -1708,24 +1710,13 @@ class _Linker(gof.link.LocalLinker):
...
@@ -1708,24 +1710,13 @@ class _Linker(gof.link.LocalLinker):
raise
utils
.
MethodNotDefined
()
raise
utils
.
MethodNotDefined
()
# Ops that do not inherit from gof.op.Op don't have certain
# Ops that do not inherit from gof.op.Op don't have certain
# methods defined that the CLinker expects (Scan is an
# methods defined that the CLinker expects (Scan is an
# ex
ma
ple, ifelse is another of such classes that inherit
# ex
am
ple, ifelse is another of such classes that inherit
# directly from PureOp)
# directly from PureOp)
if
not
isinstance
(
node
.
op
,
gof
.
op
.
Op
):
if
not
isinstance
(
node
.
op
,
gof
.
op
.
Op
):
raise
utils
.
MethodNotDefined
()
raise
utils
.
MethodNotDefined
()
e
=
FunctionGraph
(
node
.
inputs
,
node
.
outputs
)
# The toposort isn't a stochastic order as it contain only one node.
thunk
=
node
.
op
.
make_c_thunk
(
node
,
storage_map
,
compute_map
,
e
.
toposort
=
lambda
:
list
(
e
.
apply_nodes
)
no_recycling
)
# Specifically... e.nodes is a set, but of only 1 element
cl
=
CLinker
()
.
accept
(
e
,
[
r
for
r
,
r2
in
zip
(
e
.
outputs
,
node
.
outputs
)
if
r2
in
no_recycling
])
thunk
,
node_input_filters
,
node_output_filters
=
cl
.
make_thunk
(
input_storage
=
node_input_storage
,
output_storage
=
node_output_storage
)
thunk
.
inputs
=
node_input_storage
thunk
.
outputs
=
node_output_storage
thunks_c
.
append
(
thunk
)
thunks_c
.
append
(
thunk
)
except
(
NotImplementedError
,
utils
.
MethodNotDefined
):
except
(
NotImplementedError
,
utils
.
MethodNotDefined
):
thunks_c
.
append
(
None
)
thunks_c
.
append
(
None
)
...
@@ -1735,20 +1726,8 @@ class _Linker(gof.link.LocalLinker):
...
@@ -1735,20 +1726,8 @@ class _Linker(gof.link.LocalLinker):
# consider that we don't have a python implementation
# consider that we don't have a python implementation
if
((
self
.
maker
.
mode
.
check_py_code
or
thunks_c
[
-
1
]
is
None
)
and
if
((
self
.
maker
.
mode
.
check_py_code
or
thunks_c
[
-
1
]
is
None
)
and
node
.
op
.
perform
.
func_code
!=
gof
.
op
.
PureOp
.
perform
.
func_code
):
node
.
op
.
perform
.
func_code
!=
gof
.
op
.
PureOp
.
perform
.
func_code
):
p
=
node
.
op
.
perform
thunk
=
node
.
op
.
make_py_thunk
(
node
,
storage_map
,
compute_map
,
ctx
=
node
.
run_context
()
no_recycling
)
if
ctx
is
graph
.
NoContext
:
thunk
=
(
lambda
p
=
p
,
i
=
node_input_storage
,
o
=
node_output_storage
,
n
=
node
:
p
(
n
,
[
x
[
0
]
for
x
in
i
],
o
))
else
:
ctx_val
=
node
.
context_type
.
filter
(
ctx
)
thunk
=
(
lambda
p
=
p
,
i
=
node_input_storage
,
o
=
node_output_storage
,
ctx
=
ctx_val
,
n
=
node
:
p
(
n
,
[
x
[
0
]
for
x
in
i
],
o
,
ctx
))
thunk
.
inputs
=
node_input_storage
thunk
.
outputs
=
node_output_storage
thunk
.
perform
=
p
thunks_py
.
append
(
thunk
)
thunks_py
.
append
(
thunk
)
else
:
else
:
thunks_py
.
append
(
None
)
thunks_py
.
append
(
None
)
...
...
theano/compile/ops.py
浏览文件 @
907d1868
...
@@ -602,6 +602,7 @@ class Rebroadcast(gof.Op):
...
@@ -602,6 +602,7 @@ class Rebroadcast(gof.Op):
..note: works inplace and works for CudaNdarrayType
..note: works inplace and works for CudaNdarrayType
"""
"""
view_map
=
{
0
:
[
0
]}
view_map
=
{
0
:
[
0
]}
_f16_ok
=
True
# Mapping from Type to C code (and version) to use.
# Mapping from Type to C code (and version) to use.
# In the C code, the name of the input variable is %(iname)s,
# In the C code, the name of the input variable is %(iname)s,
# the output variable is %(oname)s.
# the output variable is %(oname)s.
...
...
theano/gof/op.py
浏览文件 @
907d1868
...
@@ -699,78 +699,55 @@ class Op(utils.object2, PureOp, CLinkerOp):
...
@@ -699,78 +699,55 @@ class Op(utils.object2, PureOp, CLinkerOp):
else
:
else
:
return
NotImplemented
return
NotImplemented
def
make_thunk
(
self
,
node
,
storage_map
,
compute_map
,
no_recycling
):
def
make_
c_
thunk
(
self
,
node
,
storage_map
,
compute_map
,
no_recycling
):
"""
"""
:param node: something previously returned by self.make_node
Like make_thunk, but will only try to make a C thunk.
:param storage_map: dict variable -> one-element-list where a computed
value for this variable may be found.
:param compute_map: dict variable -> one-element-list where a boolean
value will be found. The boolean indicates whether the
variable's storage_map container contains a valid value (True)
or if it has not been computed yet (False).
:param no_recycling: list of variables for which it is forbidden to
reuse memory allocated by a previous call.
:note: If the thunk consults the storage_map on every call, it is safe
for it to ignore the no_recycling argument, because elements of the
no_recycling list will have a value of None in the storage map. If
the thunk can potentially cache return values (like CLinker does),
then it must not do so for variables in the no_recycling list.
"""
"""
logger
=
logging
.
getLogger
(
'theano.gof.op.Op'
)
logger
=
logging
.
getLogger
(
'theano.gof.op.Op'
)
node_input_storage
=
[
storage_map
[
r
]
for
r
in
node
.
inputs
]
node_input_storage
=
[
storage_map
[
r
]
for
r
in
node
.
inputs
]
node_output_storage
=
[
storage_map
[
r
]
for
r
in
node
.
outputs
]
node_output_storage
=
[
storage_map
[
r
]
for
r
in
node
.
outputs
]
node_input_compute
=
[
compute_map
[
r
]
for
r
in
node
.
inputs
]
node_output_compute
=
[
compute_map
[
r
]
for
r
in
node
.
outputs
]
if
self
.
_op_use_c_code
:
# float16 gets special treatment since running
try
:
# unprepared C code will get bad results.
# float16 get special treatment since running
if
not
getattr
(
self
,
'_f16_ok'
,
False
):
# unprepared C code will get bad results.
def
is_f16
(
t
):
if
not
getattr
(
self
,
'_f16_ok'
,
False
):
return
getattr
(
t
,
'dtype'
,
''
)
==
'float16'
def
is_f16
(
t
):
return
getattr
(
t
,
'dtype'
,
''
)
==
'float16'
if
(
any
(
is_f16
(
i
.
type
)
for
i
in
node
.
inputs
)
or
any
(
is_f16
(
o
.
type
)
for
o
in
node
.
outputs
)):
if
(
any
(
is_f16
(
i
.
type
)
for
i
in
node
.
inputs
)
or
print
(
"Disabling C code for
%
s due to unsupported "
any
(
is_f16
(
o
.
type
)
for
o
in
node
.
outputs
)):
"float16"
%
(
self
,))
print
(
"Disabling C code for
%
s due to unsupported "
raise
NotImplementedError
(
"float16"
)
"float16"
%
(
self
,))
e
=
FunctionGraph
(
node
.
inputs
,
node
.
outputs
)
raise
NotImplementedError
(
"float16"
)
e_no_recycling
=
[
new_o
e
=
FunctionGraph
(
node
.
inputs
,
node
.
outputs
)
for
(
new_o
,
old_o
)
in
zip
(
e
.
outputs
,
node
.
outputs
)
if
old_o
in
no_recycling
]
e_no_recycling
=
[
new_o
cl
=
theano
.
gof
.
cc
.
CLinker
()
.
accept
(
e
,
for
(
new_o
,
old_o
)
in
zip
(
e
.
outputs
,
node
.
outputs
)
no_recycling
=
e_no_recycling
)
if
old_o
in
no_recycling
]
cl
=
theano
.
gof
.
cc
.
CLinker
()
.
accept
(
e
,
logger
.
debug
(
'Trying CLinker.make_thunk'
)
no_recycling
=
e_no_recycling
)
outputs
=
cl
.
make_thunk
(
input_storage
=
node_input_storage
,
output_storage
=
node_output_storage
)
logger
.
debug
(
'Trying CLinker.make_thunk'
)
fill_storage
,
node_input_filters
,
node_output_filters
=
outputs
outputs
=
cl
.
make_thunk
(
input_storage
=
node_input_storage
,
output_storage
=
node_output_storage
)
def
rval
():
fill_storage
,
node_input_filters
,
node_output_filters
=
outputs
fill_storage
()
for
o
in
node
.
outputs
:
def
rval
():
compute_map
[
o
][
0
]
=
True
fill_storage
()
for
o
in
node
.
outputs
:
rval
.
cthunk
=
fill_storage
.
cthunk
compute_map
[
o
][
0
]
=
True
rval
.
inputs
=
node_input_storage
rval
.
outputs
=
node_output_storage
rval
.
cthunk
=
fill_storage
.
cthunk
rval
.
lazy
=
False
rval
.
inputs
=
node_input_storage
return
rval
rval
.
outputs
=
node_output_storage
rval
.
lazy
=
False
return
rval
# the next line does nothing, but pyflakes is too
# stupid to realize the def rval below is not a
# redefinition unless I include this
del
rval
except
(
NotImplementedError
,
utils
.
MethodNotDefined
):
logger
.
debug
(
'Falling back on perform'
)
# condition: either there was no c_code, or it failed
def
make_py_thunk
(
self
,
node
,
storage_map
,
compute_map
,
no_recycling
):
"""
Like make_thunk() but only makes python thunks.
"""
node_input_storage
=
[
storage_map
[
r
]
for
r
in
node
.
inputs
]
node_output_storage
=
[
storage_map
[
r
]
for
r
in
node
.
outputs
]
p
=
node
.
op
.
perform
p
=
node
.
op
.
perform
...
@@ -798,6 +775,39 @@ class Op(utils.object2, PureOp, CLinkerOp):
...
@@ -798,6 +775,39 @@ class Op(utils.object2, PureOp, CLinkerOp):
rval
.
lazy
=
False
rval
.
lazy
=
False
return
rval
return
rval
def
make_thunk
(
self
,
node
,
storage_map
,
compute_map
,
no_recycling
):
"""
:param node: something previously returned by self.make_node
:param storage_map: dict variable -> one-element-list where a computed
value for this variable may be found.
:param compute_map: dict variable -> one-element-list where a boolean
value will be found. The boolean indicates whether the
variable's storage_map container contains a valid value (True)
or if it has not been computed yet (False).
:param no_recycling: list of variables for which it is forbidden to
reuse memory allocated by a previous call.
:note: If the thunk consults the storage_map on every call, it is safe
for it to ignore the no_recycling argument, because elements of the
no_recycling list will have a value of None in the storage map. If
the thunk can potentially cache return values (like CLinker does),
then it must not do so for variables in the no_recycling list.
"""
logger
=
logging
.
getLogger
(
'theano.gof.op.Op'
)
if
self
.
_op_use_c_code
:
try
:
return
self
.
make_c_thunk
(
node
,
storage_map
,
compute_map
,
no_recycling
)
except
(
NotImplementedError
,
utils
.
MethodNotDefined
):
logger
.
debug
(
'Falling back on perform'
)
# condition: either there was no c_code, or it failed
return
self
.
make_py_thunk
(
node
,
storage_map
,
compute_map
,
no_recycling
)
def
get_test_value
(
v
):
def
get_test_value
(
v
):
"""
"""
...
...
theano/sandbox/gpuarray/kernel_codegen.py
浏览文件 @
907d1868
...
@@ -165,18 +165,22 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
...
@@ -165,18 +165,22 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
]
]
@code_version
((
1
,))
@code_version
((
2
,))
def
inline_reduce_fixed_shared
(
N
,
buf
,
x
,
stride_x
,
pos
,
count
,
def
inline_reduce_fixed_shared
(
N
,
buf
,
x
,
stride_x
,
load_x
,
pos
,
count
,
manner_fn
,
manner_init
,
manner_fn
,
manner_init
,
b
=
''
,
stride_b
=
''
,
dtype
=
'float32'
):
b
=
''
,
stride_b
=
''
,
load_b
=
''
,
dtype
=
'float32'
):
"""Return C++ code for a function that reduces a contiguous buffer.
"""Return C++ code for a function that reduces a contiguous buffer.
:param N: length of the buffer
:param N: length of the buffer
:param buf: buffer pointer of size warpSize * sizeof(dtype)
:param buf: buffer pointer of size warpSize * sizeof(dtype)
:param x: input data
:param stride_x: input data stride
:param load_x: wrapper to read from x
:param pos: index of executing thread
:param pos: index of executing thread
:param count: number of executing threads
:param count: number of executing threads
:param b: Optional, pointer to the bias
:param b: Optional, pointer to the bias
:param stride_b: Optional, the stride of b if b is provided
:param stride_b: Optional, the stride of b if b is provided
:param load_b: Optional, wrapper to read from b if b is provided
:param dtype: Optional, the dtype of the output
:param dtype: Optional, the dtype of the output
:param manner_fn: a function that accepts strings of arguments a
:param manner_fn: a function that accepts strings of arguments a
...
@@ -193,15 +197,15 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
...
@@ -193,15 +197,15 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
"""
"""
if
b
:
if
b
:
init
=
manner_init
(
"
%(
x)
s[
%(pos)
s *
%(stride_x)
s]
+"
init
=
manner_init
(
"
%(
load_x)
s(
%(x)
s[
%(pos)
s *
%(stride_x)
s])
+"
"
%(
b)
s[
%(pos)
s *
%(stride_b)
s]
"
%
locals
())
"
%(
load_b)
s(
%(b)
s[
%(pos)
s *
%(stride_b)
s])
"
%
locals
())
loop_line
=
manner_fn
(
"red"
,
loop_line
=
manner_fn
(
"red"
,
manner_init
(
"
%(
x)
s[i *
%(stride_x)
s]
+ "
manner_init
(
"
%(
load_x)
s(
%(x)
s[i *
%(stride_x)
s])
+ "
"
%(
b)
s[i *
%(stride_b)
s]
"
%
"
%(
load_b)
s(
%(b)
s[i *
%(stride_b)
s])
"
%
locals
()))
locals
()))
else
:
else
:
init
=
manner_init
(
"
%(
x)
s[
%(pos)
s *
%(stride_x)
s]
"
%
locals
())
init
=
manner_init
(
"
%(
load_x)
s(
%(x)
s[
%(pos)
s *
%(stride_x)
s])
"
%
locals
())
loop_line
=
manner_fn
(
"red"
,
manner_init
(
"
%(
x)
s[i *
%(stride_x)
s]
"
%
loop_line
=
manner_fn
(
"red"
,
manner_init
(
"
%(
load_x)
s(
%(x)
s[i *
%(stride_x)
s])
"
%
locals
()))
locals
()))
loop_line2
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
loop_line2
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[i]"
%
buf
)
"
%
s[i]"
%
buf
)
...
@@ -248,32 +252,37 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
...
@@ -248,32 +252,37 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
@code_version
(
inline_reduce_fixed_shared
.
code_version
)
@code_version
(
inline_reduce_fixed_shared
.
code_version
)
def
inline_reduce_fixed_shared_max
(
N
,
buf
,
x
,
stride_x
,
pos
,
count
,
def
inline_reduce_fixed_shared_max
(
N
,
buf
,
x
,
stride_x
,
load_x
,
pos
,
count
,
b
=
''
,
stride_b
=
''
,
dtype
=
'float32'
):
b
=
''
,
stride_b
=
''
,
load_b
=
''
,
return
inline_reduce_fixed_shared
(
N
,
buf
,
x
,
stride_x
,
pos
,
count
,
dtype
=
'float32'
):
return
inline_reduce_fixed_shared
(
N
,
buf
,
x
,
stride_x
,
load_x
,
pos
,
count
,
lambda
a
,
b
:
"max(
%
s,
%
s)"
%
(
a
,
b
),
lambda
a
,
b
:
"max(
%
s,
%
s)"
%
(
a
,
b
),
lambda
a
:
a
,
lambda
a
:
a
,
b
,
stride_b
,
dtype
)
b
,
stride_b
,
load_b
,
dtype
)
@code_version
((
1
,)
+
inline_reduce_max
.
code_version
+
@code_version
((
2
,)
+
inline_reduce_max
.
code_version
+
inline_reduce_sum
.
code_version
)
inline_reduce_sum
.
code_version
)
def
inline_softmax_fixed_shared
(
N
,
buf
,
x
,
stride_x
,
def
inline_softmax_fixed_shared
(
N
,
buf
,
x
,
stride_x
,
load_x
,
sm
,
sm_stride
,
sm
,
sm_stride
,
write_sm
,
threadPos
,
threadCount
,
threadPos
,
threadCount
,
b
=
''
,
stride_b
=
''
,
dtype
=
"float32"
):
b
=
''
,
stride_b
=
''
,
load_b
=
''
,
dtype
=
"float32"
):
"""
"""
:param N: length of the buffer, atleast waprSize(32).
:param N: length of the buffer, atleast waprSize(32).
:param buf: a shared memory buffer of size warpSize * sizeof(dtype)
:param buf: a shared memory buffer of size warpSize * sizeof(dtype)
:param x: a ptr to the gpu memory where the row is stored
:param x: a ptr to the gpu memory where the row is stored
:param stride_x: the stride between each element in x
:param stride_x: the stride between each element in x
:param load_x: wrapper to read from x
:param sm: a ptr to the gpu memory to store the result
:param sm: a ptr to the gpu memory to store the result
:param sm_stride: the stride between eash sm element
:param sm_stride: the stride between eash sm element
:param write_sm: wrapper before writing to sm
:param threadPos: index of executing thread
:param threadPos: index of executing thread
:param threadCount: number of executing threads
:param threadCount: number of executing threads
:param b: Optional, pointer to the bias
:param b: Optional, pointer to the bias
:param stride_b: Optional, the stride of b if b is provided
:param stride_b: Optional, the stride of b if b is provided
:param load_b: Optional, wrapper to read from b if b is provided
:param dtype: Optional, the dtype of the softmax's output if not float32
:param dtype: Optional, the dtype of the softmax's output if not float32
:Precondition: buf is empty
:Precondition: buf is empty
...
@@ -286,16 +295,18 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x,
...
@@ -286,16 +295,18 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x,
"""
"""
ret
=
[
ret
=
[
# get max of buf (trashing all but buf[0])
# get max of buf (trashing all but buf[0])
inline_reduce_fixed_shared_max
(
N
,
buf
,
x
,
stride_x
,
inline_reduce_fixed_shared_max
(
N
,
buf
,
x
,
stride_x
,
load_x
,
threadPos
,
threadCount
,
b
,
stride_b
,
threadPos
,
threadCount
,
b
,
stride_b
,
load_b
,
dtype
),
dtype
),
'__syncthreads()'
,
'__syncthreads()'
,
(
'npy_
%
s row_max = '
+
buf
+
'[0]'
)
%
dtype
,
(
'npy_
%
s row_max = '
+
buf
+
'[0]'
)
%
dtype
,
'__syncthreads()'
,
'__syncthreads()'
,
inline_reduce_fixed_shared
(
N
,
buf
,
x
,
stride_x
,
threadPos
,
threadCount
,
inline_reduce_fixed_shared
(
N
,
buf
,
x
,
stride_x
,
load_x
,
threadPos
,
threadCount
,
lambda
a
,
b
:
"
%
s +
%
s"
%
(
a
,
b
),
lambda
a
,
b
:
"
%
s +
%
s"
%
(
a
,
b
),
lambda
a
:
"exp(
%
s - row_max)"
%
a
,
lambda
a
:
"exp(
%
s - row_max)"
%
a
,
b
,
stride_b
,
dtype
),
b
,
stride_b
,
load_b
,
dtype
),
'__syncthreads()'
,
'__syncthreads()'
,
(
'npy_
%
s row_sum = '
+
buf
+
'[0]'
)
%
dtype
,
(
'npy_
%
s row_sum = '
+
buf
+
'[0]'
)
%
dtype
,
'__syncthreads()'
,
'__syncthreads()'
,
...
@@ -305,13 +316,14 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x,
...
@@ -305,13 +316,14 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x,
if
b
:
if
b
:
ret
+=
[
ret
+=
[
"
%(sm)
s[tx *
%(sm_stride)
s] = "
"
%(sm)
s[tx *
%(sm_stride)
s] = "
"
exp(
%(x)
s[tx *
%(stride_x)
s]
+"
"
%(write_sm)
s(exp(
%(load_x)
s(
%(x)
s[tx *
%(stride_x)
s])
+"
"
%(
b)
s[tx *
%(stride_b)
s]
- row_max)"
"
%(
load_b)
s(
%(b)
s[tx *
%(stride_b)
s])
- row_max)"
" / row_sum"
%
locals
()]
" / row_sum
)
"
%
locals
()]
else
:
else
:
ret
+=
[
ret
+=
[
"
%(sm)
s[tx *
%(sm_stride)
s] = "
"
%(sm)
s[tx *
%(sm_stride)
s] = "
"exp(
%(x)
s[tx *
%(stride_x)
s] - row_max) / row_sum"
%
locals
()]
"
%(write_sm)
s(exp(
%(load_x)
s(
%(x)
s[tx *
%(stride_x)
s]) - row_max)"
" / row_sum)"
%
locals
()]
ret
+=
[
ret
+=
[
"}"
,
"}"
,
'__syncthreads()'
,
'__syncthreads()'
,
...
...
theano/sandbox/gpuarray/nnet.py
浏览文件 @
907d1868
差异被折叠。
点击展开。
theano/sandbox/gpuarray/subtensor.py
浏览文件 @
907d1868
...
@@ -169,6 +169,10 @@ class GpuIncSubtensor(IncSubtensor):
...
@@ -169,6 +169,10 @@ class GpuIncSubtensor(IncSubtensor):
The helper methods like do_type_checking, copy_of_x, etc. specialize
The helper methods like do_type_checking, copy_of_x, etc. specialize
the c_code for this Op.
the c_code for this Op.
"""
"""
@property
def
_f16_ok
(
self
):
return
self
.
iadd_node
.
op
.
_f16_ok
def
c_headers
(
self
):
def
c_headers
(
self
):
return
self
.
iadd_node
.
op
.
c_headers
()
return
self
.
iadd_node
.
op
.
c_headers
()
...
@@ -325,7 +329,6 @@ class GpuIncSubtensor(IncSubtensor):
...
@@ -325,7 +329,6 @@ class GpuIncSubtensor(IncSubtensor):
PyGpuArrayObject* src){
PyGpuArrayObject* src){
PyGpuArrayObject* ret = NULL;
PyGpuArrayObject* ret = NULL;
"""
%
locals
()
"""
%
locals
()
# def c_code(self, node, name, inputs, outputs, sub):
inputs
=
[
"dst"
,
"src"
]
inputs
=
[
"dst"
,
"src"
]
outputs
=
[
"ret"
]
outputs
=
[
"ret"
]
sub
=
{
"fail"
:
"return NULL;"
}
sub
=
{
"fail"
:
"return NULL;"
}
...
@@ -337,7 +340,6 @@ class GpuIncSubtensor(IncSubtensor):
...
@@ -337,7 +340,6 @@ class GpuIncSubtensor(IncSubtensor):
return
ret
return
ret
def
add_to_zview
(
self
,
nodename
,
x
,
fail
):
def
add_to_zview
(
self
,
nodename
,
x
,
fail
):
# TODO
return
"""
return
"""
PyGpuArrayObject * add_result = inc_sub_iadd_
%(nodename)
s(zview,
%(x)
s);
PyGpuArrayObject * add_result = inc_sub_iadd_
%(nodename)
s(zview,
%(x)
s);
...
@@ -357,7 +359,7 @@ class GpuIncSubtensor(IncSubtensor):
...
@@ -357,7 +359,7 @@ class GpuIncSubtensor(IncSubtensor):
elemwise_version
=
self
.
iadd_node
.
c_code_cache_version
()
elemwise_version
=
self
.
iadd_node
.
c_code_cache_version
()
if
not
parent_version
or
not
elemwise_version
:
if
not
parent_version
or
not
elemwise_version
:
return
return
return
parent_version
+
elemwise_version
+
(
1
,)
return
parent_version
+
elemwise_version
+
(
2
,)
class
GpuAdvancedIncSubtensor1
(
HideC
,
tensor
.
AdvancedIncSubtensor1
):
class
GpuAdvancedIncSubtensor1
(
HideC
,
tensor
.
AdvancedIncSubtensor1
):
...
@@ -391,6 +393,9 @@ class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1):
...
@@ -391,6 +393,9 @@ class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1):
return
gof
.
Apply
(
self
,
[
x_
,
y_
,
ilist_
],
[
x_
.
type
()])
return
gof
.
Apply
(
self
,
[
x_
,
y_
,
ilist_
],
[
x_
.
type
()])
def
getInplElemwiseAdditionKernel
(
self
,
a
,
b
):
def
getInplElemwiseAdditionKernel
(
self
,
a
,
b
):
if
a
.
dtype
==
'float16'
or
b
.
dtype
==
'float16'
:
raise
NotImplementedError
(
'float16 is not supported by pygpu '
'elemwise'
)
a_arg
=
pygpu
.
tools
.
as_argument
(
a
,
'a'
)
a_arg
=
pygpu
.
tools
.
as_argument
(
a
,
'a'
)
b_arg
=
pygpu
.
tools
.
as_argument
(
b
,
'b'
)
b_arg
=
pygpu
.
tools
.
as_argument
(
b
,
'b'
)
args
=
[
a_arg
,
b_arg
]
args
=
[
a_arg
,
b_arg
]
...
@@ -452,10 +457,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
...
@@ -452,10 +457,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
"""Implement AdvancedIncSubtensor1 on the gpu, but use function
"""Implement AdvancedIncSubtensor1 on the gpu, but use function
only avail on compute capability 2.0 and more recent.
only avail on compute capability 2.0 and more recent.
"""
"""
_f16_ok
=
True
def
__init__
(
self
,
inplace
=
False
,
set_instead_of_inc
=
False
):
# The python implementation in the parent class is not applicable here
GpuAdvancedIncSubtensor1
.
__init__
(
self
,
inplace
,
set_instead_of_inc
)
def
make_node
(
self
,
x
,
y
,
ilist
):
def
make_node
(
self
,
x
,
y
,
ilist
):
"""It defer from GpuAdvancedIncSubtensor1 in that it make sure
"""It defer from GpuAdvancedIncSubtensor1 in that it make sure
...
@@ -542,6 +544,30 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
...
@@ -542,6 +544,30 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
itemsize_out
=
numpy
.
dtype
(
dtype_out
)
.
itemsize
itemsize_out
=
numpy
.
dtype
(
dtype_out
)
.
itemsize
return
"""
return
"""
/*
* This is a version of atomicAdd that works for half-floats. It may
* read and write 2 bytes more than the size of the array if the array
* has an uneven number of elements. The actual value at that spot
* will not be modified.
*/
__device__ npy_float16 atomicAdd(npy_float16 *addr, npy_float16 val) {
npy_uint32 *base = (npy_uint32 *)((size_t)addr & ~2);
npy_uint32 old, assumed, sum, new_;
old = *base;
do {
assumed = old;
sum = __float2half_rn(
__half2float(val) +
__half2float((npy_float16)__byte_perm(old, 0,
((size_t)addr & 2) ? 0x4432 : 0x4410)));
new_ = __byte_perm(old, sum, ((size_t)addr & 2) ? 0x5410 : 0x3254);
old = atomicCAS(base, assumed, new_);
} while (assumed != old);
return (npy_float16)__byte_perm(old, 0,
((size_t)addr & 2) ? 0x4432 : 0x4410);
}
__global__ void k_vector_add_fast(int numRowsX,
__global__ void k_vector_add_fast(int numRowsX,
int numColsX,
int numColsX,
int stridesX0,
int stridesX0,
...
...
theano/sandbox/gpuarray/type.py
浏览文件 @
907d1868
...
@@ -136,6 +136,12 @@ class GpuArrayType(Type):
...
@@ -136,6 +136,12 @@ class GpuArrayType(Type):
raise
NotImplementedError
(
raise
NotImplementedError
(
"GpuArrayType.values_eq_approx() don't implemented the"
"GpuArrayType.values_eq_approx() don't implemented the"
" allow_remove_inf and allow_remove_nan parameter"
)
" allow_remove_inf and allow_remove_nan parameter"
)
if
a
.
dtype
==
'float16'
or
b
.
dtype
==
'float16'
:
an
=
numpy
.
asarray
(
a
)
bn
=
numpy
.
asarray
(
b
)
return
tensor
.
TensorType
.
values_eq_approx
(
an
,
bn
,
allow_remove_inf
=
allow_remove_inf
,
allow_remove_nan
=
allow_remove_nan
,
rtol
=
rtol
,
atol
=
atol
)
narrow
=
'float32'
,
'complex64'
narrow
=
'float32'
,
'complex64'
if
(
str
(
a
.
dtype
)
in
narrow
)
or
(
str
(
b
.
dtype
)
in
narrow
):
if
(
str
(
a
.
dtype
)
in
narrow
)
or
(
str
(
b
.
dtype
)
in
narrow
):
atol_
=
theano
.
tensor
.
basic
.
float32_atol
atol_
=
theano
.
tensor
.
basic
.
float32_atol
...
@@ -153,6 +159,13 @@ class GpuArrayType(Type):
...
@@ -153,6 +159,13 @@ class GpuArrayType(Type):
locals
())
locals
())
return
numpy
.
asarray
(
res
)
.
all
()
return
numpy
.
asarray
(
res
)
.
all
()
@staticmethod
def
may_share_memory
(
a
,
b
):
if
(
not
isinstance
(
a
,
gpuarray
.
GpuArray
)
or
not
isinstance
(
b
,
gpuarray
.
GpuArray
)):
return
False
return
pygpu
.
gpuarray
.
may_share_memory
(
a
,
b
)
def
value_zeros
(
self
,
shape
):
def
value_zeros
(
self
,
shape
):
return
pygpu
.
gpuarray
.
zeros
(
shape
,
dtype
=
self
.
typecode
)
return
pygpu
.
gpuarray
.
zeros
(
shape
,
dtype
=
self
.
typecode
)
...
...
theano/sandbox/rng_mrg.py
浏览文件 @
907d1868
...
@@ -28,6 +28,7 @@ if cuda_available:
...
@@ -28,6 +28,7 @@ if cuda_available:
from
theano.sandbox.gpuarray.basic_ops
import
GpuKernelBase
,
Kernel
from
theano.sandbox.gpuarray.basic_ops
import
GpuKernelBase
,
Kernel
from
theano.sandbox.gpuarray.type
import
GpuArrayType
from
theano.sandbox.gpuarray.type
import
GpuArrayType
from
theano.sandbox.gpuarray.fp16_help
import
write_w
def
matVecModM
(
A
,
s
,
m
):
def
matVecModM
(
A
,
s
,
m
):
...
@@ -340,15 +341,6 @@ class mrg_uniform(mrg_uniform_base):
...
@@ -340,15 +341,6 @@ class mrg_uniform(mrg_uniform_base):
def
perform
(
self
,
node
,
inp
,
out
):
def
perform
(
self
,
node
,
inp
,
out
):
rstate
,
size
=
inp
rstate
,
size
=
inp
o_rstate
,
o_sample
=
out
o_rstate
,
o_sample
=
out
numpy_version
=
numpy
.
__version__
.
split
(
'.'
)
if
(
not
self
.
warned_numpy_version
and
int
(
numpy_version
[
0
])
<=
1
and
int
(
numpy_version
[
1
])
<
3
):
print
(
"Warning: you must use numpy version 1.3.0 or higher with the python version of this op. Otherwise numpy leak memory. and numpy"
)
self
.
warned_numpy_version
=
True
n_elements
=
1
n_elements
=
1
rstate
=
numpy
.
asarray
(
rstate
)
# bring state from GPU if necessary
rstate
=
numpy
.
asarray
(
rstate
)
# bring state from GPU if necessary
...
@@ -377,6 +369,10 @@ class mrg_uniform(mrg_uniform_base):
...
@@ -377,6 +369,10 @@ class mrg_uniform(mrg_uniform_base):
def
c_code
(
self
,
node
,
name
,
inp
,
out
,
sub
):
def
c_code
(
self
,
node
,
name
,
inp
,
out
,
sub
):
rstate
,
size
=
inp
rstate
,
size
=
inp
# If we try to use the C code here with something else than a
# TensorType, something is wrong (likely one of the GPU ops
# not defining C code correctly).
assert
isinstance
(
node
.
inputs
[
0
]
.
type
,
TensorType
)
o_rstate
,
o_sample
=
out
o_rstate
,
o_sample
=
out
if
self
.
inplace
:
if
self
.
inplace
:
o_rstate_requirement
=
'NPY_ARRAY_C_CONTIGUOUS|NPY_ARRAY_ALIGNED'
o_rstate_requirement
=
'NPY_ARRAY_C_CONTIGUOUS|NPY_ARRAY_ALIGNED'
...
@@ -777,6 +773,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
...
@@ -777,6 +773,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
class
GPUA_mrg_uniform
(
GpuKernelBase
,
mrg_uniform_base
):
class
GPUA_mrg_uniform
(
GpuKernelBase
,
mrg_uniform_base
):
# GpuArray version
# GpuArray version
_f16_ok
=
True
@classmethod
@classmethod
def
new
(
cls
,
rstate
,
ndim
,
dtype
,
size
):
def
new
(
cls
,
rstate
,
ndim
,
dtype
,
size
):
...
@@ -790,14 +787,27 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
...
@@ -790,14 +787,27 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
return
super
(
GPUA_mrg_uniform
,
self
)
.
c_headers
()
+
[
'numpy_compat.h'
]
return
super
(
GPUA_mrg_uniform
,
self
)
.
c_headers
()
+
[
'numpy_compat.h'
]
def
gpu_kernels
(
self
,
node
,
name
):
def
gpu_kernels
(
self
,
node
,
name
):
if
self
.
output_type
.
dtype
==
'float32'
:
write
=
write_w
(
self
.
output_type
.
dtype
)
if
self
.
output_type
.
dtype
==
'float16'
:
otype
=
'ga_half'
# limit the values of the state that we use.
mask
=
'& 0x7fff'
NORM
=
'3.0518e-05f'
# numpy.float16(1.0/(2**15+8))
# this was determined by finding the biggest number such that
# numpy.float16(number * (M1 & 0x7fff)) < 1.0
elif
self
.
output_type
.
dtype
==
'float32'
:
otype
=
'float'
otype
=
'float'
mask
=
''
NORM
=
'4.6566126e-10f'
# numpy.float32(1.0/(2**31+65))
NORM
=
'4.6566126e-10f'
# numpy.float32(1.0/(2**31+65))
# this was determined by finding the biggest number such that
# this was determined by finding the biggest number such that
# numpy.float32(number * M1) < 1.0
# numpy.float32(number * M1) < 1.0
el
se
:
el
if
self
.
output_type
.
dtype
==
'float64'
:
otype
=
'double'
otype
=
'double'
mask
=
''
NORM
=
'4.656612873077392578125e-10'
NORM
=
'4.656612873077392578125e-10'
else
:
raise
ValueError
(
'Unsupported data type for output'
,
self
.
output_type
.
dtype
)
code
=
"""
code
=
"""
KERNEL void mrg_uniform(
KERNEL void mrg_uniform(
GLOBAL_MEM
%(otype)
s *sample_data,
GLOBAL_MEM
%(otype)
s *sample_data,
...
@@ -860,11 +870,11 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
...
@@ -860,11 +870,11 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
x21 = y2;
x21 = y2;
if (x11 <= x21) {
if (x11 <= x21) {
sample_data[i] =
(x11 - x21 + M1) *
%(NORM)
s
;
sample_data[i] =
%(write)
s(((x11 - x21 + M1)
%(mask)
s) *
%(NORM)
s)
;
}
}
else
else
{
{
sample_data[i] =
(x11 - x21) *
%(NORM)
s
;
sample_data[i] =
%(write)
s(((x11 - x21)
%(mask)
s) *
%(NORM)
s)
;
}
}
}
}
...
@@ -896,17 +906,9 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
...
@@ -896,17 +906,9 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
o_type_num
=
numpy
.
asarray
(
0
,
dtype
=
self
.
output_type
.
dtype
)
.
dtype
.
num
o_type_num
=
numpy
.
asarray
(
0
,
dtype
=
self
.
output_type
.
dtype
)
.
dtype
.
num
fail
=
sub
[
'fail'
]
fail
=
sub
[
'fail'
]
kname
=
self
.
gpu_kernels
(
node
,
nodename
)[
0
]
.
objvar
kname
=
self
.
gpu_kernels
(
node
,
nodename
)[
0
]
.
objvar
otypecode
=
str
(
self
.
output_type
.
typecode
)
if
self
.
output_type
.
dtype
==
'float32'
:
otype
=
'float'
otypecode
=
'GA_FLOAT'
else
:
otype
=
'double'
otypecode
=
'GA_DOUBLE'
return
"""
return
"""
//////// <code generated by mrg_uniform>
size_t odims[
%(ndim)
s];
size_t odims[
%(ndim)
s];
unsigned int n_elements = 1;
unsigned int n_elements = 1;
unsigned int n_streams;
unsigned int n_streams;
...
@@ -1003,12 +1005,10 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
...
@@ -1003,12 +1005,10 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
%(fail)
s
%(fail)
s
}
}
}
}
//////// </ code generated by mrg_uniform>
"""
%
locals
()
"""
%
locals
()
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
return
(
3
,
self
.
GpuKernelBase_version
)
return
(
6
,
self
.
GpuKernelBase_version
)
def
guess_n_streams
(
size
,
warn
=
False
):
def
guess_n_streams
(
size
,
warn
=
False
):
...
...
theano/tensor/basic.py
浏览文件 @
907d1868
...
@@ -3842,6 +3842,7 @@ class Reshape(Op):
...
@@ -3842,6 +3842,7 @@ class Reshape(Op):
The number of dimensions to which to reshape to (ndim) must be
The number of dimensions to which to reshape to (ndim) must be
known at graph build time."""
known at graph build time."""
view_map
=
{
0
:
[
0
]}
# output 0 is potentially aliased to inputs [0]
view_map
=
{
0
:
[
0
]}
# output 0 is potentially aliased to inputs [0]
_f16_ok
=
True
check_input
=
False
check_input
=
False
...
...
theano/tensor/nnet/sigm.py
浏览文件 @
907d1868
...
@@ -58,16 +58,21 @@ class ScalarSigmoid(scalar.UnaryScalarOp):
...
@@ -58,16 +58,21 @@ class ScalarSigmoid(scalar.UnaryScalarOp):
# We add boundary checks prevent exp from generating inf or
# We add boundary checks prevent exp from generating inf or
# 0. The reset of the logic always generate 0 or 1 in those
# 0. The reset of the logic always generate 0 or 1 in those
# cases. This is a speed optimization.
# cases. This is a speed optimization.
# The constants were obtained by looking at the output of python commands like:
# The constants were obtained by looking at the output of
"""
# python commands like:
import numpy, theano
#
dt='float32' # or float64
# import numpy, theano
for i in xrange(750):
# dt='float32' # or float64
print i, repr(theano._asarray(1.0, dtype=dt) /
# for i in xrange(750):
(theano._asarray(1.0, dtype=dt) +
# print i, repr(theano._asarray(1.0, dtype=dt) /
numpy.exp(-theano._asarray([i,-i], dtype=dt))))
# (theano._asarray(1.0, dtype=dt) +
"""
# numpy.exp(-theano._asarray([i,-i], dtype=dt))))
if
node
.
inputs
[
0
]
.
type
==
scalar
.
float32
:
# float16 limits: -11.0, 7.0f
# We use the float32 limits for float16 for now as the
# computation will happend in float32 anyway.
if
(
node
.
inputs
[
0
]
.
type
==
scalar
.
float32
or
node
.
inputs
[
0
]
.
type
==
scalar
.
float16
):
return
"""
%(z)
s =
%(x)
s < -88.0f ? 0.0 :
%(x)
s > 15.0f ? 1.0f : 1.0f /(1.0f + exp(-
%(x)
s));"""
%
locals
()
return
"""
%(z)
s =
%(x)
s < -88.0f ? 0.0 :
%(x)
s > 15.0f ? 1.0f : 1.0f /(1.0f + exp(-
%(x)
s));"""
%
locals
()
elif
node
.
inputs
[
0
]
.
type
==
scalar
.
float64
:
elif
node
.
inputs
[
0
]
.
type
==
scalar
.
float64
:
return
"""
%(z)
s =
%(x)
s < -709.0 ? 0.0 :
%(x)
s > 19.0 ? 1.0 : 1.0 /(1.0+exp(-
%(x)
s));"""
%
locals
()
return
"""
%(z)
s =
%(x)
s < -709.0 ? 0.0 :
%(x)
s > 19.0 ? 1.0 : 1.0 /(1.0+exp(-
%(x)
s));"""
%
locals
()
...
@@ -327,11 +332,17 @@ class ScalarSoftplus(scalar.UnaryScalarOp):
...
@@ -327,11 +332,17 @@ class ScalarSoftplus(scalar.UnaryScalarOp):
def
c_code
(
self
,
node
,
name
,
inp
,
out
,
sub
):
def
c_code
(
self
,
node
,
name
,
inp
,
out
,
sub
):
x
,
=
inp
x
,
=
inp
z
,
=
out
z
,
=
out
if
node
.
inputs
[
0
]
.
type
==
scalar
.
float32
:
# These constants were obtained by looking at the output of
# These constants were obtained by looking at the output of python commands like:
# python commands like:
# for i in xrange(750):
# for i in xrange(750):
# print i, repr( numpy.log1p(numpy.exp(theano._asarray([i,-i], dtype=dt))))
# print i, repr(numpy.log1p(numpy.exp(theano._asarray([i,-i], dtype=dt))))
# the boundary checks prevent us from generating inf
# the boundary checks prevent us from generating inf
# float16 limits: -17.0, 6.0
# We use the float32 limits for float16 for now as the
# computation will happend in float32 anyway.
if
(
node
.
inputs
[
0
]
.
type
==
scalar
.
float32
or
node
.
inputs
[
0
]
.
type
==
scalar
.
float16
):
return
"""
%(z)
s =
%(x)
s < -103.0f ? 0.0 :
%(x)
s > 14.0f ?
%(x)
s : log1p(exp(
%(x)
s));"""
%
locals
()
return
"""
%(z)
s =
%(x)
s < -103.0f ? 0.0 :
%(x)
s > 14.0f ?
%(x)
s : log1p(exp(
%(x)
s));"""
%
locals
()
elif
node
.
inputs
[
0
]
.
type
==
scalar
.
float64
:
elif
node
.
inputs
[
0
]
.
type
==
scalar
.
float64
:
return
"""
%(z)
s =
%(x)
s < -745.0 ? 0.0 :
%(x)
s > 16.0 ?
%(x)
s : log1p(exp(
%(x)
s));"""
%
locals
()
return
"""
%(z)
s =
%(x)
s < -745.0 ? 0.0 :
%(x)
s > 16.0 ?
%(x)
s : log1p(exp(
%(x)
s));"""
%
locals
()
...
...
theano/tensor/opt.py
浏览文件 @
907d1868
...
@@ -5151,7 +5151,8 @@ def local_log_erfc(node):
...
@@ -5151,7 +5151,8 @@ def local_log_erfc(node):
T
.
log
(
1
-
1
/
(
2
*
x
**
2
)
+
3
/
(
4
*
x
**
4
)
T
.
log
(
1
-
1
/
(
2
*
x
**
2
)
+
3
/
(
4
*
x
**
4
)
-
15
/
(
8
*
x
**
6
)))
-
15
/
(
8
*
x
**
6
)))
if
node
.
outputs
[
0
]
.
dtype
==
'float32'
:
if
(
node
.
outputs
[
0
]
.
dtype
==
'float32'
or
node
.
outputs
[
0
]
.
dtype
==
'float16'
):
threshold
=
10.0541949
threshold
=
10.0541949
elif
node
.
outputs
[
0
]
.
dtype
==
'float64'
:
elif
node
.
outputs
[
0
]
.
dtype
==
'float64'
:
threshold
=
26.641747557
threshold
=
26.641747557
...
@@ -5298,7 +5299,7 @@ def local_grad_log_erfc_neg(node):
...
@@ -5298,7 +5299,7 @@ def local_grad_log_erfc_neg(node):
3
/
(
4
*
(
x
**
4
))
-
15
/
(
8
*
(
x
**
6
)),
-
1
)
3
/
(
4
*
(
x
**
4
))
-
15
/
(
8
*
(
x
**
6
)),
-
1
)
*
T
.
cast
(
T
.
sqrt
(
numpy
.
pi
),
dtype
=
x
.
dtype
))
*
T
.
cast
(
T
.
sqrt
(
numpy
.
pi
),
dtype
=
x
.
dtype
))
if
x
.
dtype
==
'float32'
:
if
x
.
dtype
==
'float32'
or
x
.
dtype
==
'float16'
:
threshold
=
9.3
threshold
=
9.3
#threshold = 10.1
#threshold = 10.1
elif
x
.
dtype
==
'float64'
:
elif
x
.
dtype
==
'float64'
:
...
...
theano/tensor/subtensor.py
浏览文件 @
907d1868
...
@@ -291,6 +291,7 @@ class Subtensor(Op):
...
@@ -291,6 +291,7 @@ class Subtensor(Op):
debug
=
0
debug
=
0
check_input
=
False
check_input
=
False
view_map
=
{
0
:
[
0
]}
view_map
=
{
0
:
[
0
]}
_f16_ok
=
True
@staticmethod
@staticmethod
def
collapse
(
idxs
,
cond
):
def
collapse
(
idxs
,
cond
):
...
@@ -328,7 +329,7 @@ class Subtensor(Op):
...
@@ -328,7 +329,7 @@ class Subtensor(Op):
TODO: WRITEME: This method also accepts "entry" already being a Type;
TODO: WRITEME: This method also accepts "entry" already being a Type;
when would that happen?
when would that happen?
"""
"""
invalid_scal_types
=
[
scal
.
float64
,
scal
.
float32
]
invalid_scal_types
=
[
scal
.
float64
,
scal
.
float32
,
scal
.
float16
]
scal_types
=
[
scal
.
int64
,
scal
.
int32
,
scal
.
int16
,
scal
.
int8
]
scal_types
=
[
scal
.
int64
,
scal
.
int32
,
scal
.
int16
,
scal
.
int8
]
tensor_types
=
[
theano
.
tensor
.
lscalar
,
theano
.
tensor
.
iscalar
,
tensor_types
=
[
theano
.
tensor
.
lscalar
,
theano
.
tensor
.
iscalar
,
theano
.
tensor
.
wscalar
,
theano
.
tensor
.
bscalar
]
theano
.
tensor
.
wscalar
,
theano
.
tensor
.
bscalar
]
...
@@ -1603,6 +1604,7 @@ class AdvancedSubtensor1(Op):
...
@@ -1603,6 +1604,7 @@ class AdvancedSubtensor1(Op):
# sparse_grad doesn't go in here since it only affects the output
# sparse_grad doesn't go in here since it only affects the output
# of the grad() method.
# of the grad() method.
__props__
=
()
__props__
=
()
_f16_ok
=
True
def
__init__
(
self
,
sparse_grad
=
False
):
def
__init__
(
self
,
sparse_grad
=
False
):
self
.
sparse_grad
=
sparse_grad
self
.
sparse_grad
=
sparse_grad
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论