Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
907d1868
提交
907d1868
authored
5月 19, 2015
作者:
Frédéric Bastien
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #2901 from abergeron/f16_lstm
Work to make the DLT LSTM work with float16.
上级
e06617bd
6cee82db
全部展开
隐藏空白字符变更
内嵌
并排
正在显示
12 个修改的文件
包含
229 行增加
和
173 行删除
+229
-173
debugmode.py
theano/compile/debugmode.py
+13
-34
ops.py
theano/compile/ops.py
+1
-0
op.py
theano/gof/op.py
+74
-64
kernel_codegen.py
theano/sandbox/gpuarray/kernel_codegen.py
+37
-25
nnet.py
theano/sandbox/gpuarray/nnet.py
+0
-0
subtensor.py
theano/sandbox/gpuarray/subtensor.py
+33
-7
type.py
theano/sandbox/gpuarray/type.py
+13
-0
rng_mrg.py
theano/sandbox/rng_mrg.py
+25
-25
basic.py
theano/tensor/basic.py
+1
-0
sigm.py
theano/tensor/nnet/sigm.py
+26
-15
opt.py
theano/tensor/opt.py
+3
-2
subtensor.py
theano/tensor/subtensor.py
+3
-1
没有找到文件。
theano/compile/debugmode.py
浏览文件 @
907d1868
...
...
@@ -1685,16 +1685,18 @@ class _Linker(gof.link.LocalLinker):
node_input_storage
=
[
storage_map
[
r
]
for
r
in
node
.
inputs
]
node_output_storage
=
[
storage_map
[
r
]
for
r
in
node
.
outputs
]
compute_map
=
{}
for
k
in
node
.
inputs
:
compute_map
[
k
]
=
[
True
]
for
k
in
node
.
outputs
:
compute_map
[
k
]
=
[
False
]
# Some Ops define a make_thunk with the expectation that
# it will be called before the C code is compiled, because
# the compilation of some dependency is triggered there.
thunk_other
=
None
if
get_unbound_function
(
node
.
op
.
make_thunk
)
not
in
default_make_thunk
:
compute_map
=
{}
for
k
in
node
.
inputs
:
compute_map
[
k
]
=
[
True
]
for
k
in
node
.
outputs
:
compute_map
[
k
]
=
[
False
]
thunk
=
node
.
op
.
make_thunk
(
node
,
storage_map
,
compute_map
,
...
...
@@ -1708,24 +1710,13 @@ class _Linker(gof.link.LocalLinker):
raise
utils
.
MethodNotDefined
()
# Ops that do not inherit from gof.op.Op don't have certain
# methods defined that the CLinker expects (Scan is an
# ex
ma
ple, ifelse is another of such classes that inherit
# ex
am
ple, ifelse is another of such classes that inherit
# directly from PureOp)
if
not
isinstance
(
node
.
op
,
gof
.
op
.
Op
):
raise
utils
.
MethodNotDefined
()
e
=
FunctionGraph
(
node
.
inputs
,
node
.
outputs
)
# The toposort isn't a stochastic order as it contain only one node.
e
.
toposort
=
lambda
:
list
(
e
.
apply_nodes
)
# Specifically... e.nodes is a set, but of only 1 element
cl
=
CLinker
()
.
accept
(
e
,
[
r
for
r
,
r2
in
zip
(
e
.
outputs
,
node
.
outputs
)
if
r2
in
no_recycling
])
thunk
,
node_input_filters
,
node_output_filters
=
cl
.
make_thunk
(
input_storage
=
node_input_storage
,
output_storage
=
node_output_storage
)
thunk
.
inputs
=
node_input_storage
thunk
.
outputs
=
node_output_storage
thunk
=
node
.
op
.
make_c_thunk
(
node
,
storage_map
,
compute_map
,
no_recycling
)
thunks_c
.
append
(
thunk
)
except
(
NotImplementedError
,
utils
.
MethodNotDefined
):
thunks_c
.
append
(
None
)
...
...
@@ -1735,20 +1726,8 @@ class _Linker(gof.link.LocalLinker):
# consider that we don't have a python implementation
if
((
self
.
maker
.
mode
.
check_py_code
or
thunks_c
[
-
1
]
is
None
)
and
node
.
op
.
perform
.
func_code
!=
gof
.
op
.
PureOp
.
perform
.
func_code
):
p
=
node
.
op
.
perform
ctx
=
node
.
run_context
()
if
ctx
is
graph
.
NoContext
:
thunk
=
(
lambda
p
=
p
,
i
=
node_input_storage
,
o
=
node_output_storage
,
n
=
node
:
p
(
n
,
[
x
[
0
]
for
x
in
i
],
o
))
else
:
ctx_val
=
node
.
context_type
.
filter
(
ctx
)
thunk
=
(
lambda
p
=
p
,
i
=
node_input_storage
,
o
=
node_output_storage
,
ctx
=
ctx_val
,
n
=
node
:
p
(
n
,
[
x
[
0
]
for
x
in
i
],
o
,
ctx
))
thunk
.
inputs
=
node_input_storage
thunk
.
outputs
=
node_output_storage
thunk
.
perform
=
p
thunk
=
node
.
op
.
make_py_thunk
(
node
,
storage_map
,
compute_map
,
no_recycling
)
thunks_py
.
append
(
thunk
)
else
:
thunks_py
.
append
(
None
)
...
...
theano/compile/ops.py
浏览文件 @
907d1868
...
...
@@ -602,6 +602,7 @@ class Rebroadcast(gof.Op):
..note: works inplace and works for CudaNdarrayType
"""
view_map
=
{
0
:
[
0
]}
_f16_ok
=
True
# Mapping from Type to C code (and version) to use.
# In the C code, the name of the input variable is %(iname)s,
# the output variable is %(oname)s.
...
...
theano/gof/op.py
浏览文件 @
907d1868
...
...
@@ -699,78 +699,55 @@ class Op(utils.object2, PureOp, CLinkerOp):
else
:
return
NotImplemented
def
make_thunk
(
self
,
node
,
storage_map
,
compute_map
,
no_recycling
):
def
make_
c_
thunk
(
self
,
node
,
storage_map
,
compute_map
,
no_recycling
):
"""
:param node: something previously returned by self.make_node
:param storage_map: dict variable -> one-element-list where a computed
value for this variable may be found.
:param compute_map: dict variable -> one-element-list where a boolean
value will be found. The boolean indicates whether the
variable's storage_map container contains a valid value (True)
or if it has not been computed yet (False).
:param no_recycling: list of variables for which it is forbidden to
reuse memory allocated by a previous call.
:note: If the thunk consults the storage_map on every call, it is safe
for it to ignore the no_recycling argument, because elements of the
no_recycling list will have a value of None in the storage map. If
the thunk can potentially cache return values (like CLinker does),
then it must not do so for variables in the no_recycling list.
Like make_thunk, but will only try to make a C thunk.
"""
logger
=
logging
.
getLogger
(
'theano.gof.op.Op'
)
node_input_storage
=
[
storage_map
[
r
]
for
r
in
node
.
inputs
]
node_output_storage
=
[
storage_map
[
r
]
for
r
in
node
.
outputs
]
node_input_compute
=
[
compute_map
[
r
]
for
r
in
node
.
inputs
]
node_output_compute
=
[
compute_map
[
r
]
for
r
in
node
.
outputs
]
if
self
.
_op_use_c_code
:
try
:
# float16 get special treatment since running
# unprepared C code will get bad results.
if
not
getattr
(
self
,
'_f16_ok'
,
False
):
def
is_f16
(
t
):
return
getattr
(
t
,
'dtype'
,
''
)
==
'float16'
if
(
any
(
is_f16
(
i
.
type
)
for
i
in
node
.
inputs
)
or
any
(
is_f16
(
o
.
type
)
for
o
in
node
.
outputs
)):
print
(
"Disabling C code for
%
s due to unsupported "
"float16"
%
(
self
,))
raise
NotImplementedError
(
"float16"
)
e
=
FunctionGraph
(
node
.
inputs
,
node
.
outputs
)
e_no_recycling
=
[
new_o
for
(
new_o
,
old_o
)
in
zip
(
e
.
outputs
,
node
.
outputs
)
if
old_o
in
no_recycling
]
cl
=
theano
.
gof
.
cc
.
CLinker
()
.
accept
(
e
,
no_recycling
=
e_no_recycling
)
logger
.
debug
(
'Trying CLinker.make_thunk'
)
outputs
=
cl
.
make_thunk
(
input_storage
=
node_input_storage
,
output_storage
=
node_output_storage
)
fill_storage
,
node_input_filters
,
node_output_filters
=
outputs
def
rval
():
fill_storage
()
for
o
in
node
.
outputs
:
compute_map
[
o
][
0
]
=
True
rval
.
cthunk
=
fill_storage
.
cthunk
rval
.
inputs
=
node_input_storage
rval
.
outputs
=
node_output_storage
rval
.
lazy
=
False
return
rval
# the next line does nothing, but pyflakes is too
# stupid to realize the def rval below is not a
# redefinition unless I include this
del
rval
except
(
NotImplementedError
,
utils
.
MethodNotDefined
):
logger
.
debug
(
'Falling back on perform'
)
# float16 gets special treatment since running
# unprepared C code will get bad results.
if
not
getattr
(
self
,
'_f16_ok'
,
False
):
def
is_f16
(
t
):
return
getattr
(
t
,
'dtype'
,
''
)
==
'float16'
if
(
any
(
is_f16
(
i
.
type
)
for
i
in
node
.
inputs
)
or
any
(
is_f16
(
o
.
type
)
for
o
in
node
.
outputs
)):
print
(
"Disabling C code for
%
s due to unsupported "
"float16"
%
(
self
,))
raise
NotImplementedError
(
"float16"
)
e
=
FunctionGraph
(
node
.
inputs
,
node
.
outputs
)
e_no_recycling
=
[
new_o
for
(
new_o
,
old_o
)
in
zip
(
e
.
outputs
,
node
.
outputs
)
if
old_o
in
no_recycling
]
cl
=
theano
.
gof
.
cc
.
CLinker
()
.
accept
(
e
,
no_recycling
=
e_no_recycling
)
logger
.
debug
(
'Trying CLinker.make_thunk'
)
outputs
=
cl
.
make_thunk
(
input_storage
=
node_input_storage
,
output_storage
=
node_output_storage
)
fill_storage
,
node_input_filters
,
node_output_filters
=
outputs
def
rval
():
fill_storage
()
for
o
in
node
.
outputs
:
compute_map
[
o
][
0
]
=
True
rval
.
cthunk
=
fill_storage
.
cthunk
rval
.
inputs
=
node_input_storage
rval
.
outputs
=
node_output_storage
rval
.
lazy
=
False
return
rval
# condition: either there was no c_code, or it failed
def
make_py_thunk
(
self
,
node
,
storage_map
,
compute_map
,
no_recycling
):
"""
Like make_thunk() but only makes python thunks.
"""
node_input_storage
=
[
storage_map
[
r
]
for
r
in
node
.
inputs
]
node_output_storage
=
[
storage_map
[
r
]
for
r
in
node
.
outputs
]
p
=
node
.
op
.
perform
...
...
@@ -798,6 +775,39 @@ class Op(utils.object2, PureOp, CLinkerOp):
rval
.
lazy
=
False
return
rval
def
make_thunk
(
self
,
node
,
storage_map
,
compute_map
,
no_recycling
):
"""
:param node: something previously returned by self.make_node
:param storage_map: dict variable -> one-element-list where a computed
value for this variable may be found.
:param compute_map: dict variable -> one-element-list where a boolean
value will be found. The boolean indicates whether the
variable's storage_map container contains a valid value (True)
or if it has not been computed yet (False).
:param no_recycling: list of variables for which it is forbidden to
reuse memory allocated by a previous call.
:note: If the thunk consults the storage_map on every call, it is safe
for it to ignore the no_recycling argument, because elements of the
no_recycling list will have a value of None in the storage map. If
the thunk can potentially cache return values (like CLinker does),
then it must not do so for variables in the no_recycling list.
"""
logger
=
logging
.
getLogger
(
'theano.gof.op.Op'
)
if
self
.
_op_use_c_code
:
try
:
return
self
.
make_c_thunk
(
node
,
storage_map
,
compute_map
,
no_recycling
)
except
(
NotImplementedError
,
utils
.
MethodNotDefined
):
logger
.
debug
(
'Falling back on perform'
)
# condition: either there was no c_code, or it failed
return
self
.
make_py_thunk
(
node
,
storage_map
,
compute_map
,
no_recycling
)
def
get_test_value
(
v
):
"""
...
...
theano/sandbox/gpuarray/kernel_codegen.py
浏览文件 @
907d1868
...
...
@@ -165,18 +165,22 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
]
@code_version
((
1
,))
def
inline_reduce_fixed_shared
(
N
,
buf
,
x
,
stride_x
,
pos
,
count
,
@code_version
((
2
,))
def
inline_reduce_fixed_shared
(
N
,
buf
,
x
,
stride_x
,
load_x
,
pos
,
count
,
manner_fn
,
manner_init
,
b
=
''
,
stride_b
=
''
,
dtype
=
'float32'
):
b
=
''
,
stride_b
=
''
,
load_b
=
''
,
dtype
=
'float32'
):
"""Return C++ code for a function that reduces a contiguous buffer.
:param N: length of the buffer
:param buf: buffer pointer of size warpSize * sizeof(dtype)
:param x: input data
:param stride_x: input data stride
:param load_x: wrapper to read from x
:param pos: index of executing thread
:param count: number of executing threads
:param b: Optional, pointer to the bias
:param stride_b: Optional, the stride of b if b is provided
:param load_b: Optional, wrapper to read from b if b is provided
:param dtype: Optional, the dtype of the output
:param manner_fn: a function that accepts strings of arguments a
...
...
@@ -193,15 +197,15 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
"""
if
b
:
init
=
manner_init
(
"
%(
x)
s[
%(pos)
s *
%(stride_x)
s]
+"
"
%(
b)
s[
%(pos)
s *
%(stride_b)
s]
"
%
locals
())
init
=
manner_init
(
"
%(
load_x)
s(
%(x)
s[
%(pos)
s *
%(stride_x)
s])
+"
"
%(
load_b)
s(
%(b)
s[
%(pos)
s *
%(stride_b)
s])
"
%
locals
())
loop_line
=
manner_fn
(
"red"
,
manner_init
(
"
%(
x)
s[i *
%(stride_x)
s]
+ "
"
%(
b)
s[i *
%(stride_b)
s]
"
%
manner_init
(
"
%(
load_x)
s(
%(x)
s[i *
%(stride_x)
s])
+ "
"
%(
load_b)
s(
%(b)
s[i *
%(stride_b)
s])
"
%
locals
()))
else
:
init
=
manner_init
(
"
%(
x)
s[
%(pos)
s *
%(stride_x)
s]
"
%
locals
())
loop_line
=
manner_fn
(
"red"
,
manner_init
(
"
%(
x)
s[i *
%(stride_x)
s]
"
%
init
=
manner_init
(
"
%(
load_x)
s(
%(x)
s[
%(pos)
s *
%(stride_x)
s])
"
%
locals
())
loop_line
=
manner_fn
(
"red"
,
manner_init
(
"
%(
load_x)
s(
%(x)
s[i *
%(stride_x)
s])
"
%
locals
()))
loop_line2
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[i]"
%
buf
)
...
...
@@ -248,32 +252,37 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
@code_version
(
inline_reduce_fixed_shared
.
code_version
)
def
inline_reduce_fixed_shared_max
(
N
,
buf
,
x
,
stride_x
,
pos
,
count
,
b
=
''
,
stride_b
=
''
,
dtype
=
'float32'
):
return
inline_reduce_fixed_shared
(
N
,
buf
,
x
,
stride_x
,
pos
,
count
,
def
inline_reduce_fixed_shared_max
(
N
,
buf
,
x
,
stride_x
,
load_x
,
pos
,
count
,
b
=
''
,
stride_b
=
''
,
load_b
=
''
,
dtype
=
'float32'
):
return
inline_reduce_fixed_shared
(
N
,
buf
,
x
,
stride_x
,
load_x
,
pos
,
count
,
lambda
a
,
b
:
"max(
%
s,
%
s)"
%
(
a
,
b
),
lambda
a
:
a
,
b
,
stride_b
,
dtype
)
b
,
stride_b
,
load_b
,
dtype
)
@code_version
((
1
,)
+
inline_reduce_max
.
code_version
+
@code_version
((
2
,)
+
inline_reduce_max
.
code_version
+
inline_reduce_sum
.
code_version
)
def
inline_softmax_fixed_shared
(
N
,
buf
,
x
,
stride_x
,
sm
,
sm_stride
,
def
inline_softmax_fixed_shared
(
N
,
buf
,
x
,
stride_x
,
load_x
,
sm
,
sm_stride
,
write_sm
,
threadPos
,
threadCount
,
b
=
''
,
stride_b
=
''
,
dtype
=
"float32"
):
b
=
''
,
stride_b
=
''
,
load_b
=
''
,
dtype
=
"float32"
):
"""
:param N: length of the buffer, atleast waprSize(32).
:param buf: a shared memory buffer of size warpSize * sizeof(dtype)
:param x: a ptr to the gpu memory where the row is stored
:param stride_x: the stride between each element in x
:param load_x: wrapper to read from x
:param sm: a ptr to the gpu memory to store the result
:param sm_stride: the stride between eash sm element
:param write_sm: wrapper before writing to sm
:param threadPos: index of executing thread
:param threadCount: number of executing threads
:param b: Optional, pointer to the bias
:param stride_b: Optional, the stride of b if b is provided
:param load_b: Optional, wrapper to read from b if b is provided
:param dtype: Optional, the dtype of the softmax's output if not float32
:Precondition: buf is empty
...
...
@@ -286,16 +295,18 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x,
"""
ret
=
[
# get max of buf (trashing all but buf[0])
inline_reduce_fixed_shared_max
(
N
,
buf
,
x
,
stride_x
,
threadPos
,
threadCount
,
b
,
stride_b
,
inline_reduce_fixed_shared_max
(
N
,
buf
,
x
,
stride_x
,
load_x
,
threadPos
,
threadCount
,
b
,
stride_b
,
load_b
,
dtype
),
'__syncthreads()'
,
(
'npy_
%
s row_max = '
+
buf
+
'[0]'
)
%
dtype
,
'__syncthreads()'
,
inline_reduce_fixed_shared
(
N
,
buf
,
x
,
stride_x
,
threadPos
,
threadCount
,
inline_reduce_fixed_shared
(
N
,
buf
,
x
,
stride_x
,
load_x
,
threadPos
,
threadCount
,
lambda
a
,
b
:
"
%
s +
%
s"
%
(
a
,
b
),
lambda
a
:
"exp(
%
s - row_max)"
%
a
,
b
,
stride_b
,
dtype
),
b
,
stride_b
,
load_b
,
dtype
),
'__syncthreads()'
,
(
'npy_
%
s row_sum = '
+
buf
+
'[0]'
)
%
dtype
,
'__syncthreads()'
,
...
...
@@ -305,13 +316,14 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x,
if
b
:
ret
+=
[
"
%(sm)
s[tx *
%(sm_stride)
s] = "
"
exp(
%(x)
s[tx *
%(stride_x)
s]
+"
"
%(
b)
s[tx *
%(stride_b)
s]
- row_max)"
" / row_sum"
%
locals
()]
"
%(write_sm)
s(exp(
%(load_x)
s(
%(x)
s[tx *
%(stride_x)
s])
+"
"
%(
load_b)
s(
%(b)
s[tx *
%(stride_b)
s])
- row_max)"
" / row_sum
)
"
%
locals
()]
else
:
ret
+=
[
"
%(sm)
s[tx *
%(sm_stride)
s] = "
"exp(
%(x)
s[tx *
%(stride_x)
s] - row_max) / row_sum"
%
locals
()]
"
%(write_sm)
s(exp(
%(load_x)
s(
%(x)
s[tx *
%(stride_x)
s]) - row_max)"
" / row_sum)"
%
locals
()]
ret
+=
[
"}"
,
'__syncthreads()'
,
...
...
theano/sandbox/gpuarray/nnet.py
浏览文件 @
907d1868
差异被折叠。
点击展开。
theano/sandbox/gpuarray/subtensor.py
浏览文件 @
907d1868
...
...
@@ -169,6 +169,10 @@ class GpuIncSubtensor(IncSubtensor):
The helper methods like do_type_checking, copy_of_x, etc. specialize
the c_code for this Op.
"""
@property
def
_f16_ok
(
self
):
return
self
.
iadd_node
.
op
.
_f16_ok
def
c_headers
(
self
):
return
self
.
iadd_node
.
op
.
c_headers
()
...
...
@@ -325,7 +329,6 @@ class GpuIncSubtensor(IncSubtensor):
PyGpuArrayObject* src){
PyGpuArrayObject* ret = NULL;
"""
%
locals
()
# def c_code(self, node, name, inputs, outputs, sub):
inputs
=
[
"dst"
,
"src"
]
outputs
=
[
"ret"
]
sub
=
{
"fail"
:
"return NULL;"
}
...
...
@@ -337,7 +340,6 @@ class GpuIncSubtensor(IncSubtensor):
return
ret
def
add_to_zview
(
self
,
nodename
,
x
,
fail
):
# TODO
return
"""
PyGpuArrayObject * add_result = inc_sub_iadd_
%(nodename)
s(zview,
%(x)
s);
...
...
@@ -357,7 +359,7 @@ class GpuIncSubtensor(IncSubtensor):
elemwise_version
=
self
.
iadd_node
.
c_code_cache_version
()
if
not
parent_version
or
not
elemwise_version
:
return
return
parent_version
+
elemwise_version
+
(
1
,)
return
parent_version
+
elemwise_version
+
(
2
,)
class
GpuAdvancedIncSubtensor1
(
HideC
,
tensor
.
AdvancedIncSubtensor1
):
...
...
@@ -391,6 +393,9 @@ class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1):
return
gof
.
Apply
(
self
,
[
x_
,
y_
,
ilist_
],
[
x_
.
type
()])
def
getInplElemwiseAdditionKernel
(
self
,
a
,
b
):
if
a
.
dtype
==
'float16'
or
b
.
dtype
==
'float16'
:
raise
NotImplementedError
(
'float16 is not supported by pygpu '
'elemwise'
)
a_arg
=
pygpu
.
tools
.
as_argument
(
a
,
'a'
)
b_arg
=
pygpu
.
tools
.
as_argument
(
b
,
'b'
)
args
=
[
a_arg
,
b_arg
]
...
...
@@ -452,10 +457,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
"""Implement AdvancedIncSubtensor1 on the gpu, but use function
only avail on compute capability 2.0 and more recent.
"""
def
__init__
(
self
,
inplace
=
False
,
set_instead_of_inc
=
False
):
# The python implementation in the parent class is not applicable here
GpuAdvancedIncSubtensor1
.
__init__
(
self
,
inplace
,
set_instead_of_inc
)
_f16_ok
=
True
def
make_node
(
self
,
x
,
y
,
ilist
):
"""It defer from GpuAdvancedIncSubtensor1 in that it make sure
...
...
@@ -542,6 +544,30 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
itemsize_out
=
numpy
.
dtype
(
dtype_out
)
.
itemsize
return
"""
/*
* This is a version of atomicAdd that works for half-floats. It may
* read and write 2 bytes more than the size of the array if the array
* has an uneven number of elements. The actual value at that spot
* will not be modified.
*/
__device__ npy_float16 atomicAdd(npy_float16 *addr, npy_float16 val) {
npy_uint32 *base = (npy_uint32 *)((size_t)addr & ~2);
npy_uint32 old, assumed, sum, new_;
old = *base;
do {
assumed = old;
sum = __float2half_rn(
__half2float(val) +
__half2float((npy_float16)__byte_perm(old, 0,
((size_t)addr & 2) ? 0x4432 : 0x4410)));
new_ = __byte_perm(old, sum, ((size_t)addr & 2) ? 0x5410 : 0x3254);
old = atomicCAS(base, assumed, new_);
} while (assumed != old);
return (npy_float16)__byte_perm(old, 0,
((size_t)addr & 2) ? 0x4432 : 0x4410);
}
__global__ void k_vector_add_fast(int numRowsX,
int numColsX,
int stridesX0,
...
...
theano/sandbox/gpuarray/type.py
浏览文件 @
907d1868
...
...
@@ -136,6 +136,12 @@ class GpuArrayType(Type):
raise
NotImplementedError
(
"GpuArrayType.values_eq_approx() don't implemented the"
" allow_remove_inf and allow_remove_nan parameter"
)
if
a
.
dtype
==
'float16'
or
b
.
dtype
==
'float16'
:
an
=
numpy
.
asarray
(
a
)
bn
=
numpy
.
asarray
(
b
)
return
tensor
.
TensorType
.
values_eq_approx
(
an
,
bn
,
allow_remove_inf
=
allow_remove_inf
,
allow_remove_nan
=
allow_remove_nan
,
rtol
=
rtol
,
atol
=
atol
)
narrow
=
'float32'
,
'complex64'
if
(
str
(
a
.
dtype
)
in
narrow
)
or
(
str
(
b
.
dtype
)
in
narrow
):
atol_
=
theano
.
tensor
.
basic
.
float32_atol
...
...
@@ -153,6 +159,13 @@ class GpuArrayType(Type):
locals
())
return
numpy
.
asarray
(
res
)
.
all
()
@staticmethod
def
may_share_memory
(
a
,
b
):
if
(
not
isinstance
(
a
,
gpuarray
.
GpuArray
)
or
not
isinstance
(
b
,
gpuarray
.
GpuArray
)):
return
False
return
pygpu
.
gpuarray
.
may_share_memory
(
a
,
b
)
def
value_zeros
(
self
,
shape
):
return
pygpu
.
gpuarray
.
zeros
(
shape
,
dtype
=
self
.
typecode
)
...
...
theano/sandbox/rng_mrg.py
浏览文件 @
907d1868
...
...
@@ -28,6 +28,7 @@ if cuda_available:
from
theano.sandbox.gpuarray.basic_ops
import
GpuKernelBase
,
Kernel
from
theano.sandbox.gpuarray.type
import
GpuArrayType
from
theano.sandbox.gpuarray.fp16_help
import
write_w
def
matVecModM
(
A
,
s
,
m
):
...
...
@@ -340,15 +341,6 @@ class mrg_uniform(mrg_uniform_base):
def
perform
(
self
,
node
,
inp
,
out
):
rstate
,
size
=
inp
o_rstate
,
o_sample
=
out
numpy_version
=
numpy
.
__version__
.
split
(
'.'
)
if
(
not
self
.
warned_numpy_version
and
int
(
numpy_version
[
0
])
<=
1
and
int
(
numpy_version
[
1
])
<
3
):
print
(
"Warning: you must use numpy version 1.3.0 or higher with the python version of this op. Otherwise numpy leak memory. and numpy"
)
self
.
warned_numpy_version
=
True
n_elements
=
1
rstate
=
numpy
.
asarray
(
rstate
)
# bring state from GPU if necessary
...
...
@@ -377,6 +369,10 @@ class mrg_uniform(mrg_uniform_base):
def
c_code
(
self
,
node
,
name
,
inp
,
out
,
sub
):
rstate
,
size
=
inp
# If we try to use the C code here with something else than a
# TensorType, something is wrong (likely one of the GPU ops
# not defining C code correctly).
assert
isinstance
(
node
.
inputs
[
0
]
.
type
,
TensorType
)
o_rstate
,
o_sample
=
out
if
self
.
inplace
:
o_rstate_requirement
=
'NPY_ARRAY_C_CONTIGUOUS|NPY_ARRAY_ALIGNED'
...
...
@@ -777,6 +773,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
class
GPUA_mrg_uniform
(
GpuKernelBase
,
mrg_uniform_base
):
# GpuArray version
_f16_ok
=
True
@classmethod
def
new
(
cls
,
rstate
,
ndim
,
dtype
,
size
):
...
...
@@ -790,14 +787,27 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
return
super
(
GPUA_mrg_uniform
,
self
)
.
c_headers
()
+
[
'numpy_compat.h'
]
def
gpu_kernels
(
self
,
node
,
name
):
if
self
.
output_type
.
dtype
==
'float32'
:
write
=
write_w
(
self
.
output_type
.
dtype
)
if
self
.
output_type
.
dtype
==
'float16'
:
otype
=
'ga_half'
# limit the values of the state that we use.
mask
=
'& 0x7fff'
NORM
=
'3.0518e-05f'
# numpy.float16(1.0/(2**15+8))
# this was determined by finding the biggest number such that
# numpy.float16(number * (M1 & 0x7fff)) < 1.0
elif
self
.
output_type
.
dtype
==
'float32'
:
otype
=
'float'
mask
=
''
NORM
=
'4.6566126e-10f'
# numpy.float32(1.0/(2**31+65))
# this was determined by finding the biggest number such that
# numpy.float32(number * M1) < 1.0
el
se
:
el
if
self
.
output_type
.
dtype
==
'float64'
:
otype
=
'double'
mask
=
''
NORM
=
'4.656612873077392578125e-10'
else
:
raise
ValueError
(
'Unsupported data type for output'
,
self
.
output_type
.
dtype
)
code
=
"""
KERNEL void mrg_uniform(
GLOBAL_MEM
%(otype)
s *sample_data,
...
...
@@ -860,11 +870,11 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
x21 = y2;
if (x11 <= x21) {
sample_data[i] =
(x11 - x21 + M1) *
%(NORM)
s
;
sample_data[i] =
%(write)
s(((x11 - x21 + M1)
%(mask)
s) *
%(NORM)
s)
;
}
else
{
sample_data[i] =
(x11 - x21) *
%(NORM)
s
;
sample_data[i] =
%(write)
s(((x11 - x21)
%(mask)
s) *
%(NORM)
s)
;
}
}
...
...
@@ -896,17 +906,9 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
o_type_num
=
numpy
.
asarray
(
0
,
dtype
=
self
.
output_type
.
dtype
)
.
dtype
.
num
fail
=
sub
[
'fail'
]
kname
=
self
.
gpu_kernels
(
node
,
nodename
)[
0
]
.
objvar
if
self
.
output_type
.
dtype
==
'float32'
:
otype
=
'float'
otypecode
=
'GA_FLOAT'
else
:
otype
=
'double'
otypecode
=
'GA_DOUBLE'
otypecode
=
str
(
self
.
output_type
.
typecode
)
return
"""
//////// <code generated by mrg_uniform>
size_t odims[
%(ndim)
s];
unsigned int n_elements = 1;
unsigned int n_streams;
...
...
@@ -1003,12 +1005,10 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
%(fail)
s
}
}
//////// </ code generated by mrg_uniform>
"""
%
locals
()
def
c_code_cache_version
(
self
):
return
(
3
,
self
.
GpuKernelBase_version
)
return
(
6
,
self
.
GpuKernelBase_version
)
def
guess_n_streams
(
size
,
warn
=
False
):
...
...
theano/tensor/basic.py
浏览文件 @
907d1868
...
...
@@ -3842,6 +3842,7 @@ class Reshape(Op):
The number of dimensions to which to reshape to (ndim) must be
known at graph build time."""
view_map
=
{
0
:
[
0
]}
# output 0 is potentially aliased to inputs [0]
_f16_ok
=
True
check_input
=
False
...
...
theano/tensor/nnet/sigm.py
浏览文件 @
907d1868
...
...
@@ -58,16 +58,21 @@ class ScalarSigmoid(scalar.UnaryScalarOp):
# We add boundary checks prevent exp from generating inf or
# 0. The reset of the logic always generate 0 or 1 in those
# cases. This is a speed optimization.
# The constants were obtained by looking at the output of python commands like:
"""
import numpy, theano
dt='float32' # or float64
for i in xrange(750):
print i, repr(theano._asarray(1.0, dtype=dt) /
(theano._asarray(1.0, dtype=dt) +
numpy.exp(-theano._asarray([i,-i], dtype=dt))))
"""
if
node
.
inputs
[
0
]
.
type
==
scalar
.
float32
:
# The constants were obtained by looking at the output of
# python commands like:
#
# import numpy, theano
# dt='float32' # or float64
# for i in xrange(750):
# print i, repr(theano._asarray(1.0, dtype=dt) /
# (theano._asarray(1.0, dtype=dt) +
# numpy.exp(-theano._asarray([i,-i], dtype=dt))))
# float16 limits: -11.0, 7.0f
# We use the float32 limits for float16 for now as the
# computation will happend in float32 anyway.
if
(
node
.
inputs
[
0
]
.
type
==
scalar
.
float32
or
node
.
inputs
[
0
]
.
type
==
scalar
.
float16
):
return
"""
%(z)
s =
%(x)
s < -88.0f ? 0.0 :
%(x)
s > 15.0f ? 1.0f : 1.0f /(1.0f + exp(-
%(x)
s));"""
%
locals
()
elif
node
.
inputs
[
0
]
.
type
==
scalar
.
float64
:
return
"""
%(z)
s =
%(x)
s < -709.0 ? 0.0 :
%(x)
s > 19.0 ? 1.0 : 1.0 /(1.0+exp(-
%(x)
s));"""
%
locals
()
...
...
@@ -327,11 +332,17 @@ class ScalarSoftplus(scalar.UnaryScalarOp):
def
c_code
(
self
,
node
,
name
,
inp
,
out
,
sub
):
x
,
=
inp
z
,
=
out
if
node
.
inputs
[
0
]
.
type
==
scalar
.
float32
:
# These constants were obtained by looking at the output of python commands like:
# for i in xrange(750):
# print i, repr( numpy.log1p(numpy.exp(theano._asarray([i,-i], dtype=dt))))
# the boundary checks prevent us from generating inf
# These constants were obtained by looking at the output of
# python commands like:
# for i in xrange(750):
# print i, repr(numpy.log1p(numpy.exp(theano._asarray([i,-i], dtype=dt))))
# the boundary checks prevent us from generating inf
# float16 limits: -17.0, 6.0
# We use the float32 limits for float16 for now as the
# computation will happend in float32 anyway.
if
(
node
.
inputs
[
0
]
.
type
==
scalar
.
float32
or
node
.
inputs
[
0
]
.
type
==
scalar
.
float16
):
return
"""
%(z)
s =
%(x)
s < -103.0f ? 0.0 :
%(x)
s > 14.0f ?
%(x)
s : log1p(exp(
%(x)
s));"""
%
locals
()
elif
node
.
inputs
[
0
]
.
type
==
scalar
.
float64
:
return
"""
%(z)
s =
%(x)
s < -745.0 ? 0.0 :
%(x)
s > 16.0 ?
%(x)
s : log1p(exp(
%(x)
s));"""
%
locals
()
...
...
theano/tensor/opt.py
浏览文件 @
907d1868
...
...
@@ -5151,7 +5151,8 @@ def local_log_erfc(node):
T
.
log
(
1
-
1
/
(
2
*
x
**
2
)
+
3
/
(
4
*
x
**
4
)
-
15
/
(
8
*
x
**
6
)))
if
node
.
outputs
[
0
]
.
dtype
==
'float32'
:
if
(
node
.
outputs
[
0
]
.
dtype
==
'float32'
or
node
.
outputs
[
0
]
.
dtype
==
'float16'
):
threshold
=
10.0541949
elif
node
.
outputs
[
0
]
.
dtype
==
'float64'
:
threshold
=
26.641747557
...
...
@@ -5298,7 +5299,7 @@ def local_grad_log_erfc_neg(node):
3
/
(
4
*
(
x
**
4
))
-
15
/
(
8
*
(
x
**
6
)),
-
1
)
*
T
.
cast
(
T
.
sqrt
(
numpy
.
pi
),
dtype
=
x
.
dtype
))
if
x
.
dtype
==
'float32'
:
if
x
.
dtype
==
'float32'
or
x
.
dtype
==
'float16'
:
threshold
=
9.3
#threshold = 10.1
elif
x
.
dtype
==
'float64'
:
...
...
theano/tensor/subtensor.py
浏览文件 @
907d1868
...
...
@@ -291,6 +291,7 @@ class Subtensor(Op):
debug
=
0
check_input
=
False
view_map
=
{
0
:
[
0
]}
_f16_ok
=
True
@staticmethod
def
collapse
(
idxs
,
cond
):
...
...
@@ -328,7 +329,7 @@ class Subtensor(Op):
TODO: WRITEME: This method also accepts "entry" already being a Type;
when would that happen?
"""
invalid_scal_types
=
[
scal
.
float64
,
scal
.
float32
]
invalid_scal_types
=
[
scal
.
float64
,
scal
.
float32
,
scal
.
float16
]
scal_types
=
[
scal
.
int64
,
scal
.
int32
,
scal
.
int16
,
scal
.
int8
]
tensor_types
=
[
theano
.
tensor
.
lscalar
,
theano
.
tensor
.
iscalar
,
theano
.
tensor
.
wscalar
,
theano
.
tensor
.
bscalar
]
...
...
@@ -1603,6 +1604,7 @@ class AdvancedSubtensor1(Op):
# sparse_grad doesn't go in here since it only affects the output
# of the grad() method.
__props__
=
()
_f16_ok
=
True
def
__init__
(
self
,
sparse_grad
=
False
):
self
.
sparse_grad
=
sparse_grad
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论