Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
37d5f777
提交
37d5f777
authored
10月 07, 2016
作者:
Frédéric Bastien
提交者:
GitHub
10月 07, 2016
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #5049 from abergeron/fix_dlt_f16
Collection of fixes to make the DLT work in float16
上级
1dabf854
22b5da98
隐藏空白字符变更
内嵌
并排
正在显示
13 个修改的文件
包含
157 行增加
和
184 行删除
+157
-184
nanguardmode.py
theano/compile/nanguardmode.py
+41
-1
ops.py
theano/compile/ops.py
+1
-1
configdefaults.py
theano/configdefaults.py
+1
-1
__init__.py
theano/gpuarray/__init__.py
+3
-1
dnn.py
theano/gpuarray/dnn.py
+2
-2
elemwise.py
theano/gpuarray/elemwise.py
+38
-58
nnet.py
theano/gpuarray/nnet.py
+27
-98
test_dnn.py
theano/gpuarray/tests/test_dnn.py
+19
-0
gradient.py
theano/gradient.py
+2
-2
basic.py
theano/scalar/basic.py
+8
-10
basic.py
theano/tensor/basic.py
+1
-0
sigm.py
theano/tensor/nnet/sigm.py
+1
-1
opt.py
theano/tensor/opt.py
+13
-9
没有找到文件。
theano/compile/nanguardmode.py
浏览文件 @
37d5f777
...
...
@@ -10,6 +10,15 @@ from theano.configparser import config
import
theano.tensor
as
T
import
theano.sandbox.cuda
as
cuda
from
theano.compile
import
Mode
from
.mode
import
get_mode
try
:
from
theano.gpuarray.type
import
GpuArrayType
,
_name_for_ctx
from
pygpu.gpuarray
import
GpuArray
pygpu_available
=
True
except
ImportError
:
pygpu_available
=
False
logger
=
logging
.
getLogger
(
"theano.compile.nanguardmode"
)
...
...
@@ -86,6 +95,8 @@ def contains_nan(arr, node=None, var=None):
else
:
compile_gpu_func
(
True
,
False
,
False
)
return
np
.
isnan
(
f_gpumin
(
arr
.
reshape
(
arr
.
size
)))
elif
pygpu_available
and
isinstance
(
arr
,
GpuArray
):
return
np
.
isnan
(
f_gpua_min
(
arr
.
reshape
(
arr
.
size
)))
return
np
.
isnan
(
np
.
min
(
arr
))
...
...
@@ -136,6 +147,9 @@ def contains_inf(arr, node=None, var=None):
compile_gpu_func
(
False
,
True
,
False
)
return
(
np
.
isinf
(
f_gpumin
(
arr
.
reshape
(
arr
.
size
)))
or
np
.
isinf
(
f_gpumax
(
arr
.
reshape
(
arr
.
size
))))
elif
pygpu_available
and
isinstance
(
arr
,
GpuArray
):
return
(
np
.
isinf
(
f_gpua_min
(
arr
.
reshape
(
arr
.
size
)))
or
np
.
isinf
(
f_gpua_max
(
arr
.
reshape
(
arr
.
size
))))
return
np
.
isinf
(
np
.
nanmax
(
arr
))
or
np
.
isinf
(
np
.
nanmin
(
arr
))
...
...
@@ -187,6 +201,27 @@ def compile_gpu_func(nan_is_error, inf_is_error, big_is_error):
cuda_compile_failed
=
True
def
f_compute
(
op
):
def
result
(
inp
):
dtype
=
inp
.
dtype
ctx_name
=
_name_for_ctx
(
inp
.
context
)
key
=
(
dtype
,
ctx_name
)
f
=
result
.
cache
.
get
(
key
,
None
)
if
f
is
None
:
guard_in
=
GpuArrayType
(
str
(
dtype
),
(
False
,),
context_name
=
ctx_name
)()
mode
=
get_mode
(
'FAST_RUN'
)
.
including
(
'gpuarray'
)
f
=
theano
.
function
([
guard_in
],
op
(
guard_in
),
mode
=
mode
,
profile
=
False
)
result
.
cache
[
key
]
=
f
return
f
(
inp
)
result
.
cache
=
dict
()
return
result
f_gpua_min
=
f_compute
(
T
.
min
)
f_gpua_max
=
f_compute
(
T
.
max
)
f_gpua_absmax
=
f_compute
(
lambda
x
:
T
.
max
(
T
.
abs_
(
x
)))
class
NanGuardMode
(
Mode
):
"""
A Theano compilation Mode that makes the compiled function automatically
...
...
@@ -220,7 +255,9 @@ class NanGuardMode(Mode):
big_is_error
=
config
.
NanGuardMode
.
big_is_error
assert
nan_is_error
or
inf_is_error
or
big_is_error
compile_gpu_func
(
nan_is_error
,
inf_is_error
,
big_is_error
)
if
cuda
.
cuda_enabled
:
compile_gpu_func
(
nan_is_error
,
inf_is_error
,
big_is_error
)
def
do_check_on
(
value
,
nd
,
var
=
None
):
"""
...
...
@@ -260,7 +297,10 @@ class NanGuardMode(Mode):
elif
value
.
size
==
0
:
err
=
False
elif
cuda
.
cuda_available
and
isinstance
(
value
,
cuda
.
CudaNdarray
):
compile_gpu_func
(
False
,
False
,
True
)
err
=
(
f_gpuabsmax
(
value
.
reshape
(
value
.
size
))
>
1e10
)
elif
pygpu_available
and
isinstance
(
value
,
GpuArray
):
err
=
(
f_gpua_absmax
(
value
.
reshape
(
value
.
size
))
>
1e10
)
else
:
err
=
(
np
.
abs
(
value
)
.
max
()
>
1e10
)
if
err
:
...
...
theano/compile/ops.py
浏览文件 @
37d5f777
...
...
@@ -445,7 +445,7 @@ def shape_i(var, i, fgraph=None):
shape_of
=
shape_feature
.
shape_of
def
recur
(
node
):
if
not
hasattr
(
node
.
outputs
[
0
],
'fgraph'
)
:
if
not
node
.
outputs
[
0
]
in
shape_of
:
for
inp
in
node
.
inputs
:
if
inp
.
owner
:
recur
(
inp
.
owner
)
...
...
theano/configdefaults.py
浏览文件 @
37d5f777
...
...
@@ -446,7 +446,7 @@ if param and os.name == 'nt':
def
warn_cxx
(
val
):
"""We only support clang++ as otherwise we hit strange g++/OSX bugs."""
if
sys
.
platform
==
'darwin'
and
val
!=
'clang++'
:
if
sys
.
platform
==
'darwin'
and
'clang++'
not
in
val
:
_logger
.
warning
(
"Only clang++ is supported. With g++,"
" we end up with strange g++/OSX bugs."
)
return
True
...
...
theano/gpuarray/__init__.py
浏览文件 @
37d5f777
...
...
@@ -66,7 +66,9 @@ def init_dev(dev, name=None):
single_stream
=
config
.
gpuarray
.
single_stream
,
sched
=
config
.
gpuarray
.
sched
)
init_dev
.
devmap
[
dev
]
=
ctx
if
config
.
gpuarray
.
preallocate
>
0
:
if
config
.
gpuarray
.
preallocate
<
0
:
print
(
"Disabling allocation cache on
%
s"
%
(
dev
,))
elif
config
.
gpuarray
.
preallocate
>
0
:
MB
=
(
1024
*
1024
)
if
config
.
gpuarray
.
preallocate
<=
1
:
gmem
=
min
(
config
.
gpuarray
.
preallocate
,
0.95
)
*
ctx
.
total_gmem
...
...
theano/gpuarray/dnn.py
浏览文件 @
37d5f777
...
...
@@ -1319,8 +1319,6 @@ class GpuDnnSoftmaxBase(DnnBase):
DnnBase
.
__init__
(
self
,
[
self
.
file
],
self
.
c_func
)
assert
(
algo
in
(
'fast'
,
'accurate'
,
'log'
))
if
algo
==
'log'
and
version
(
raises
=
False
)
<
3000
:
raise
RuntimeError
(
"Need cuDNN v3 for log-softmax"
)
self
.
algo
=
algo
assert
(
mode
in
(
'instance'
,
'channel'
))
...
...
@@ -1361,6 +1359,7 @@ class GpuDnnSoftmax(GpuDnnSoftmaxBase):
or per spatial location '01' per image across 'c'.
"""
_f16_ok
=
True
direction
=
"forward"
file
=
"dnn_softmax.c"
c_func
=
"APPLY_SPECIFIC(softmax)"
...
...
@@ -1397,6 +1396,7 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
image across 'c'.
"""
_f16_ok
=
True
direction
=
'backward'
file
=
"dnn_softmax_grad.c"
c_func
=
"APPLY_SPECIFIC(softmax_grad)"
...
...
theano/gpuarray/elemwise.py
浏览文件 @
37d5f777
...
...
@@ -33,6 +33,12 @@ def as_C_string_const(s):
for
l
in
s
.
split
(
'
\n
'
))
def
get_scal
(
dt
):
if
dt
==
'float16'
:
dt
=
'float32'
return
scalar
.
get_scalar_type
(
dt
)
class
GpuElemwise
(
HideC
,
Elemwise
):
"""
Elemwise on the GPU.
...
...
@@ -60,23 +66,18 @@ class GpuElemwise(HideC, Elemwise):
zip
(
out_info
[
0
],
out_info
[
1
])]
if
len
(
outputs
)
>
1
:
raise
NotImplementedError
()
node
=
Apply
(
self
,
inputs
,
outputs
)
# Try to generate the kernel to catch SupportCodeErrors
scal_ins
=
[
get_scal
(
i
.
dtype
)
for
i
in
inputs
]
fake_node
=
self
.
scalar_op
.
make_node
(
*
[
i
()
for
i
in
scal_ins
])
try
:
scal_ins
=
[
scalar
.
get_scalar_type
(
i
.
dtype
)
for
i
in
node
.
inputs
]
scal_out
=
[
scalar
.
get_scalar_type
(
o
.
dtype
)
for
o
in
node
.
outputs
]
fake_node
=
Apply
(
self
.
scalar_op
,
[
i
()
for
i
in
scal_ins
],
[
o
()
for
o
in
scal_out
])
code
=
self
.
scalar_op
.
c_support_code_apply
(
fake_node
,
"test"
)
code
=
fake_node
.
op
.
c_support_code_apply
(
fake_node
,
"test"
)
if
code
:
raise
SupportCodeError
(
code
)
except
MethodNotDefined
:
pass
try
:
support_code
=
self
.
scalar_
op
.
c_support_code
()
support_code
=
fake_node
.
op
.
c_support_code
()
if
"struct"
in
support_code
:
# The macro is fine, the C++ struct is not.
raise
SupportCodeError
(
...
...
@@ -85,6 +86,15 @@ class GpuElemwise(HideC, Elemwise):
except
MethodNotDefined
:
pass
if
fake_node
.
op
!=
self
.
scalar_op
:
# If the new op is different due to type changes, we make a new
# op for it.
elem
=
GpuElemwise
(
fake_node
.
op
,
self
.
inplace_pattern
,
self
.
name
,
self
.
nfunc_spec
,
self
.
openmp
)
else
:
elem
=
self
node
=
Apply
(
elem
,
inputs
,
outputs
)
return
node
def
get_params
(
self
,
node
):
...
...
@@ -92,59 +102,31 @@ class GpuElemwise(HideC, Elemwise):
def
_get_vnames
(
self
,
node
):
inps
=
[
'i
%
d'
%
(
n
,)
for
n
,
_
in
enumerate
(
node
.
inputs
)]
outs
=
[
'o
%
d'
%
(
n
,)
for
n
,
_
in
enumerate
(
node
.
outputs
)
if
n
not
in
self
.
inplace_pattern
]
outs
=
[
'o
%
d'
%
(
n
,)
if
n
not
in
self
.
inplace_pattern
else
inps
[
self
.
inplace_pattern
[
n
]]
for
n
,
_
in
enumerate
(
node
.
outputs
)]
return
inps
,
outs
def
_generate_op_string
(
self
,
node
):
scal_v_ins
=
[
scalar
.
get_scalar_type
(
i
.
dtype
)
for
i
in
node
.
inputs
]
scal_v_outs
=
[
scalar
.
get_scalar_type
(
o
.
dtype
)
for
o
in
node
.
outputs
]
inps
,
outs
=
self
.
_get_vnames
(
node
)
scal_v_ins
=
[
get_scal
(
i
.
dtype
)()
for
i
in
node
.
inputs
]
fake_node
=
Apply
(
self
.
scalar_op
,
[
i
()
for
i
in
scal_v_ins
],
[
o
()
for
o
in
scal_v_outs
])
fake_node
=
self
.
scalar_op
.
make_node
(
*
scal_v_ins
)
scal_v_out
=
fake_node
.
outputs
assert
len
(
scal_v_out
)
==
len
(
node
.
outputs
)
scal_in
=
[
i
if
si
.
dtype
!=
'float16'
else
'load_half(&'
+
i
+
')'
for
i
,
si
in
zip
(
inps
,
scal_v_ins
)]
kop
=
fake_node
.
op
.
c_code
(
fake_node
,
'elem_scalar'
,
inps
,
outs
,
dict
(
fail
=
'return;'
))
scal_out
=
[]
oi
=
0
scal_f16
=
[]
for
n
in
range
(
len
(
node
.
outputs
)):
if
n
in
self
.
inplace_pattern
:
arg
=
inps
[
self
.
inplace_pattern
[
n
]]
else
:
arg
=
outs
[
oi
]
oi
+=
1
if
node
.
outputs
[
n
]
.
dtype
==
'float16'
:
scal_f16
.
append
((
'tmpf16
%
i'
%
(
len
(
scal_f16
),),
arg
))
scal_out
.
append
(
scal_f16
[
-
1
][
0
])
else
:
scal_out
.
append
(
arg
)
kop
=
self
.
scalar_op
.
c_code
(
fake_node
,
'elem_scalar'
,
scal_in
,
scal_out
,
dict
(
fail
=
'return;'
))
if
scal_f16
:
# if we have float16 scalars on output we have to wrap
# them and insert a stand-in float32 variable since
# float16 arithemtic is not available
code
=
[
"{"
]
for
f
in
scal_f16
:
code
.
append
(
'ga_float
%
s;'
%
(
f
[
0
],))
# XXX: The replace is an ugly hack to make sure temp
# variables inthe middle are float32
code
.
append
(
kop
.
replace
(
'npy_float16'
,
'ga_float'
))
for
f
in
scal_f16
:
code
.
append
(
'store_half(&
%
s,
%
s);'
%
(
f
[
1
],
f
[
0
]))
code
.
append
(
'}'
)
kop
=
'
\n
'
.
join
(
code
)
# Some ops like cast will reintroduce float16 in the internal graph.
kop
=
kop
.
replace
(
'npy_float16'
,
'ga_float'
)
support_code
=
""
try
:
# We accept only some c_support_code().
# This filter is done in the make_node()
support_code
+=
self
.
scalar_
op
.
c_support_code
()
support_code
+=
fake_node
.
op
.
c_support_code
()
except
MethodNotDefined
:
pass
for
npy
,
ga
in
[(
"npy_uint8"
,
"ga_ubyte"
),
...
...
@@ -171,7 +153,7 @@ class GpuElemwise(HideC, Elemwise):
def
c_init_code_struct
(
self
,
node
,
name
,
sub
):
inps
,
outs
=
self
.
_get_vnames
(
node
)
nargs
=
len
(
inps
)
+
len
(
outs
)
nargs
=
len
(
inps
)
+
len
(
outs
)
-
len
(
self
.
inplace_pattern
)
support_code
,
kop
=
self
.
_generate_op_string
(
node
)
res
=
"""
gpuelemwise_arg args[
%(nargs)
s] = {{0}};
...
...
@@ -185,24 +167,22 @@ class GpuElemwise(HideC, Elemwise):
"""
%
dict
(
n
=
n
,
name
=
'"
%
s"'
%
(
name
,),
typecode
=
i
.
type
.
typecode
)
p
=
0
p
=
len
(
inps
)
for
n
,
o
in
enumerate
(
node
.
outputs
):
if
n
in
self
.
inplace_pattern
:
assert
(
len
(
node
.
outputs
)
==
1
)
res
+=
"
\n
args[
%(n)
s].flags |= GE_WRITE;
\n
"
%
dict
(
n
=
self
.
inplace_pattern
[
n
])
else
:
nn
=
len
(
inps
)
+
p
name
=
outs
[
p
]
p
+=
1
res
+=
"""
args[
%(n)
s].name =
%(name)
s;
args[
%(n)
s].typecode =
%(typecode)
s;
args[
%(n)
s].flags = GE_WRITE;
"""
%
dict
(
n
=
nn
,
name
=
'"
%
s"'
%
(
name
,),
"""
%
dict
(
n
=
p
,
name
=
'"
%
s"'
%
(
outs
[
n
]
,),
typecode
=
o
.
type
.
typecode
)
p
+=
1
res
+=
"""
ge = GpuElemwise_new(
%(ctx)
s->ctx,
%(support)
s,
%(kop)
s,
%(nargs)
s, args,
%(nd)
s,
0
);
ge = GpuElemwise_new(
%(ctx)
s->ctx,
%(support)
s,
%(kop)
s,
%(nargs)
s, args,
%(nd)
s,
GE_CONVERT_F16
);
if (ge == NULL) {
PyErr_SetString(PyExc_RuntimeError, "Could not initialize elemwise support");
%(fail)
s
...
...
@@ -363,7 +343,7 @@ class GpuElemwise(HideC, Elemwise):
def
c_code_cache_version
(
self
):
ver
=
self
.
scalar_op
.
c_code_cache_version
()
if
ver
:
return
(
7
,
ver
)
return
(
8
,
ver
)
else
:
return
ver
...
...
theano/gpuarray/nnet.py
浏览文件 @
37d5f777
from
__future__
import
absolute_import
,
print_function
,
division
import
os
import
numpy
from
theano
import
Op
,
Apply
,
config
...
...
@@ -45,7 +46,10 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
return
node
.
inputs
[
0
]
.
type
.
context
def
c_headers
(
self
):
return
[
'<numpy_compat.h>'
,
'<gpuarray/types.h>'
]
return
[
'<numpy_compat.h>'
,
'<gpuarray/types.h>'
,
'gpuarray_helper.h'
]
def
c_header_dirs
(
self
):
return
[
os
.
path
.
dirname
(
__file__
)]
def
gpu_kernels
(
self
,
node
,
nodename
):
dtype_x
=
node
.
inputs
[
0
]
.
dtype
...
...
@@ -191,9 +195,6 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
def
c_code
(
self
,
node
,
nodename
,
inp
,
out
,
sub
):
if
node
.
inputs
[
0
]
.
type
.
context
.
kind
!=
b
'cuda'
:
raise
NotImplementedError
(
'cuda only'
)
typecode_x
=
pygpu
.
gpuarray
.
dtype_to_typecode
(
node
.
inputs
[
0
]
.
dtype
)
typecode_b
=
pygpu
.
gpuarray
.
dtype_to_typecode
(
node
.
inputs
[
1
]
.
dtype
)
typecode_y_idx
=
pygpu
.
gpuarray
.
dtype_to_typecode
(
node
.
inputs
[
2
]
.
dtype
)
itemsize_x
=
numpy
.
dtype
(
node
.
inputs
[
0
]
.
dtype
)
.
itemsize
worksize_x
=
numpy
.
dtype
(
work_dtype
(
node
.
inputs
[
0
]
.
dtype
))
.
itemsize
itemsize_b
=
numpy
.
dtype
(
node
.
inputs
[
1
]
.
dtype
)
.
itemsize
...
...
@@ -203,13 +204,6 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
itemsize_am
=
numpy
.
dtype
(
node
.
outputs
[
2
]
.
dtype
)
.
itemsize
x
,
b
,
y_idx
=
inp
nll
,
sm
,
am
=
out
dtype_x
=
node
.
inputs
[
0
]
.
dtype
dtype_b
=
node
.
inputs
[
1
]
.
dtype
dtype_y_idx
=
node
.
inputs
[
2
]
.
dtype
dtype_nll
=
node
.
outputs
[
0
]
.
dtype
dtype_sm
=
node
.
outputs
[
1
]
.
dtype
dtype_am
=
node
.
outputs
[
2
]
.
dtype
classname
=
self
.
__class__
.
__name__
fail
=
sub
[
'fail'
]
ctx
=
sub
[
'params'
]
k_var
=
"k_xent_sm_1hot_bias_
%(nodename)
s"
%
locals
()
...
...
@@ -229,21 +223,6 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
"""
%
locals
()
sio
=
StringIO
()
print
(
"""
if (PyGpuArray_NDIM(
%(y_idx)
s) != 1)
{
PyErr_SetString(PyExc_ValueError, "y_idx not 1d tensor");
%(fail)
s;
}
if (PyGpuArray_NDIM(
%(x)
s) != 2)
{
PyErr_SetString(PyExc_ValueError, "x not 2d tensor");
%(fail)
s;
}
if (PyGpuArray_NDIM(
%(b)
s) != 1)
{
PyErr_SetString(PyExc_ValueError, "b not 1d tensor");
%(fail)
s;
}
if (PyGpuArray_DIMS(
%(x)
s)[0] !=
PyGpuArray_DIMS(
%(y_idx)
s)[0])
{
...
...
@@ -257,82 +236,32 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
"dimension mismatch in x,b arguments");
%(fail)
s;
}
if ((NULL ==
%(nll)
s) //initial condition
|| (PyGpuArray_DIMS(
%(nll)
s)[0] !=
PyGpuArray_DIMS(
%(y_idx)
s)[0]))
{
Py_XDECREF(
%(nll)
s);
%(nll)
s = pygpu_empty(1, PyGpuArray_DIMS(
%(y_idx)
s),
%(typecode_x)
s, GA_C_ORDER,
%(ctx)
s,
Py_None);
if (!
%(nll)
s) {
%(fail)
s
}
}
if ((NULL ==
%(sm)
s)
|| (PyGpuArray_DIMS(
%(sm)
s)[0] !=
PyGpuArray_DIMS(
%(x)
s)[0])
|| (PyGpuArray_DIMS(
%(sm)
s)[1] !=
PyGpuArray_DIMS(
%(x)
s)[1]))
{
Py_XDECREF(
%(sm)
s);
%(sm)
s = pygpu_empty(2, PyGpuArray_DIMS(
%(x)
s),
%(typecode_b)
s, GA_C_ORDER,
%(ctx)
s, Py_None);
if(!
%(sm)
s)
{
PyErr_SetString(PyExc_MemoryError,
"failed to alloc sm output");
// no need to decref cnda_nll, the cleanup code should do it up
%(fail)
s;
}
}
if ((NULL ==
%(am)
s)
|| (PyGpuArray_DIMS(
%(am)
s)[0] !=
PyGpuArray_DIMS(
%(y_idx)
s)[0]))
{
Py_XDECREF(
%(am)
s);
%(am)
s = pygpu_empty(1, PyGpuArray_DIMS(
%(y_idx)
s),
%(typecode_y_idx)
s, GA_C_ORDER,
%(ctx)
s, Py_None);
if(!
%(am)
s)
{
PyErr_SetString(PyExc_MemoryError,
"failed to alloc am output");
// no need to decref nll and sm,
// the cleanup code should do it up
%(fail)
s;
}
}
if (theano_prep_output(&
%(nll)
s, 1, PyGpuArray_DIMS(
%(y_idx)
s),
%(x)
s->ga.typecode, GA_C_ORDER,
%(ctx)
s))
%(fail)
s
if (theano_prep_output(&
%(sm)
s, 2, PyGpuArray_DIMS(
%(x)
s),
%(x)
s->ga.typecode, GA_C_ORDER,
%(ctx)
s))
%(fail)
s
if (theano_prep_output(&
%(am)
s, 1, PyGpuArray_DIMS(
%(y_idx)
s),
%(y_idx)
s->ga.typecode, GA_C_ORDER,
%(ctx)
s))
%(fail)
s
{
size_t n_blocks = std::min(PyGpuArray_DIM(
%(x)
s, 0), (size_t)4096);
size_t n_threads = std::min(PyGpuArray_DIM(
%(x)
s, 1), (size_t)256);
size_t n_shared = n_threads *
%(worksize_x)
s;
ssize_t stride_X0 = PyGpuArray_STRIDES(
%(x)
s)[0] /
%(itemsize_x)
s;
ssize_t stride_X1 = PyGpuArray_STRIDES(
%(x)
s)[1] /
%(itemsize_x)
s;
ssize_t stride_B0 = PyGpuArray_STRIDES(
%(b)
s)[0] /
%(itemsize_b)
s;
ssize_t stride_YIDX0 = PyGpuArray_STRIDES(
%(y_idx)
s)[0] /
%(itemsize_y_idx)
s;
ssize_t stride_NLL0 = PyGpuArray_STRIDES(
%(nll)
s)[0] /
%(itemsize_nll)
s;
ssize_t stride_SM0 = PyGpuArray_STRIDES(
%(sm)
s)[0] /
%(itemsize_sm)
s;
ssize_t stride_SM1 = PyGpuArray_STRIDES(
%(sm)
s)[1] /
%(itemsize_sm)
s;
ssize_t stride_AM0 = PyGpuArray_STRIDES(
%(am)
s)[0] /
%(itemsize_am)
s;
//TODO: launch more threads per row and do parallel sum and max reductions
void *kernel_params[] = {
(void *)&PyGpuArray_DIMS(
%(x)
s)[0],
(void *)&PyGpuArray_DIMS(
%(x)
s)[1],
(void *)
%(x)
s->ga.data, (void *)&
%(x)
s->ga.offset,
(void *)&stride_X0, (void *)&stride_X1,
(void *)
%(b)
s->ga.data, (void *)&
%(b)
s->ga.offset,
(void *)&stride_B0,
(void *)
%(y_idx)
s->ga.data, (void *)&
%(y_idx)
s->ga.offset,
(void *)&stride_YIDX0,
(void *)
%(nll)
s->ga.data, (void *)&
%(nll)
s->ga.offset,
(void *)&stride_NLL0,
(void *)
%(sm)
s->ga.data, (void *)&
%(sm)
s->ga.offset,
(void *)&stride_SM0, (void *)&stride_SM1,
(void *)
%(am)
s->ga.data, (void *)&
%(am)
s->ga.offset,
(void *)&stride_AM0};
int err = GpuKernel_call(&
%(k_var)
s, 1, &n_threads, &n_blocks, n_shared, kernel_params);
int err = k_xent_sm_1hot_bias_call(
1, &n_blocks, &n_threads, n_shared,
PyGpuArray_DIMS(
%(x)
s)[0],
PyGpuArray_DIMS(
%(x)
s)[1],
%(x)
s->ga.data,
%(x)
s->ga.offset,
PyGpuArray_STRIDE(
%(x)
s, 0) /
%(itemsize_x)
s,
PyGpuArray_STRIDE(
%(x)
s, 1) /
%(itemsize_x)
s,
%(b)
s->ga.data,
%(b)
s->ga.offset,
PyGpuArray_STRIDE(
%(b)
s, 0) /
%(itemsize_b)
s,
%(y_idx)
s->ga.data,
%(y_idx)
s->ga.offset,
PyGpuArray_STRIDE(
%(y_idx)
s, 0) /
%(itemsize_y_idx)
s,
%(nll)
s->ga.data,
%(nll)
s->ga.offset,
PyGpuArray_STRIDE(
%(nll)
s, 0) /
%(itemsize_nll)
s,
%(sm)
s->ga.data,
%(sm)
s->ga.offset,
PyGpuArray_STRIDE(
%(sm)
s, 0) /
%(itemsize_sm)
s,
PyGpuArray_STRIDE(
%(sm)
s, 1) /
%(itemsize_sm)
s,
%(am)
s->ga.data,
%(am)
s->ga.offset,
PyGpuArray_STRIDE(
%(am)
s, 0) /
%(itemsize_am)
s);
%(err_check)
s
%(sync)
s
}
...
...
@@ -340,7 +269,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
return
sio
.
getvalue
()
def
c_code_cache_version
(
self
):
return
(
1
0
,)
return
(
1
2
,)
gpu_crossentropy_softmax_argmax_1hot_with_bias
=
GpuCrossentropySoftmaxArgmax1HotWithBias
()
...
...
theano/gpuarray/tests/test_dnn.py
浏览文件 @
37d5f777
...
...
@@ -797,6 +797,25 @@ class test_SoftMax(test_nnet.test_SoftMax):
def
test_softmax_shape_0
(
self
):
raise
SkipTest
(
"Cudnn doesn't support 0 shapes"
)
def
test_softmax_f16
(
self
):
x
=
T
.
matrix
(
'x'
,
'float16'
)
x_gpu
=
T
.
tensor4
(
'x_gpu'
,
'float16'
)
f_z
=
T
.
nnet
.
softmax_op
f_gpu
=
dnn
.
GpuDnnSoftmax
(
'accurate'
,
'channel'
)
def
cmp
(
n
,
m
,
f
,
f_gpu
):
data
=
numpy
.
random
.
random
((
n
,
m
))
.
astype
(
'float16'
)
gdata
=
numpy
.
asarray
(
data
)[:,
:,
None
,
None
]
out
=
f
(
data
)
gout
=
numpy
.
asarray
(
f_gpu
(
gdata
))[:,
:,
0
,
0
]
utt
.
assert_allclose
(
out
,
gout
)
self
.
_test_softmax
(
x
,
x_gpu
,
f_z
,
f_gpu
,
cmp
)
def
test_softmax_grad
(
self
):
def
cmp
(
n
,
m
,
f
,
f_gpu
):
data
=
numpy
.
arange
(
n
*
m
,
dtype
=
'float32'
)
.
reshape
(
n
,
m
)
...
...
theano/gradient.py
浏览文件 @
37d5f777
...
...
@@ -1373,10 +1373,10 @@ class numeric_grad(object):
# perfectly accurate.
type_eps
=
{
'float64'
:
1e-7
,
'float32'
:
3e-4
,
'float16'
:
1e-
3
,
'float16'
:
1e-
1
,
numpy
.
dtype
(
'float64'
):
1e-7
,
numpy
.
dtype
(
'float32'
):
3e-4
,
numpy
.
dtype
(
'float16'
):
1e-
3
}
numpy
.
dtype
(
'float16'
):
1e-
1
}
def
__init__
(
self
,
f
,
pt
,
eps
=
None
,
out_type
=
None
):
"""Return the gradient of f at pt.
...
...
theano/scalar/basic.py
浏览文件 @
37d5f777
...
...
@@ -39,7 +39,7 @@ builtin_int = int
builtin_float
=
float
class
ComplexError
(
Exception
):
class
ComplexError
(
NotImplementedError
):
"""
Raised if complex numbers are used in an unsupported operation.
...
...
@@ -2197,7 +2197,7 @@ class Sgn(UnaryScalarOp):
return
'
%(z)
s = (
%(x)
s > 0) ? 1. : ((
%(x)
s < 0) ? -1. : (isnan(
%(x)
s) ? NAN : 0.));'
%
locals
()
if
type
in
int_types
:
return
"
%(z)
s = (
%(x)
s >= 0) ? (
%(x)
s == 0) ? 0 : 1 : -1;"
%
locals
()
raise
TypeError
()
# complex has no sgn
raise
ComplexError
(
'complex has no sgn'
)
def
c_code_cache_version
(
self
):
s
=
super
(
Sgn
,
self
)
.
c_code_cache_version
()
...
...
@@ -2300,7 +2300,7 @@ class RoundHalfToEven(UnaryScalarOp):
(
z
,)
=
outputs
typ
=
node
.
outputs
[
0
]
.
type
.
dtype
if
typ
not
in
[
'float32'
,
'float64'
]:
Exception
(
"The output should be float32 or float64"
)
raise
NotImplementedError
(
"The output should be float32 or float64"
)
return
dedent
(
"""
#ifndef ROUNDING_EPSILON
...
...
@@ -2398,7 +2398,7 @@ class RoundHalfAwayFromZero(UnaryScalarOp):
if
node
.
outputs
[
0
]
.
type
.
dtype
in
[
'float32'
,
'float64'
]:
return
"
%(z)
s = round(
%(x)
s);"
%
locals
()
else
:
Exception
(
"The output should be float32 or float64"
)
raise
NotImplementedError
(
"The output should be float32 or float64"
)
round_half_away_from_zero
=
RoundHalfAwayFromZero
(
same_out_float_only
)
...
...
@@ -3711,8 +3711,7 @@ class Composite(ScalarOp):
raise
NotImplementedError
(
"grad is not implemented for Composite"
)
def
c_code
(
self
,
node
,
nodename
,
inames
,
onames
,
sub
):
if
not
hasattr
(
self
,
'_c_code'
):
self
.
init_c_code
()
self
.
init_c_code
()
d
=
dict
(
chain
(
izip
((
"i
%
i"
%
i
for
i
in
xrange
(
len
(
inames
))),
inames
),
izip
((
"o
%
i"
%
i
for
i
in
xrange
(
len
(
onames
))),
...
...
@@ -3746,6 +3745,7 @@ class Composite(ScalarOp):
return
"
\n
"
.
join
(
sorted
(
set
(
rval
)))
def
c_support_code_apply
(
self
,
node
,
name
):
self
.
init_c_code
()
rval
=
[]
for
subnode
,
subnodename
in
zip
(
self
.
fgraph
.
toposort
(),
self
.
nodenames
):
try
:
...
...
@@ -3771,13 +3771,11 @@ class Composite(ScalarOp):
return
False
# see __hash__ for comment on why there is no mention of fgraph
# or module cache key here.
if
not
hasattr
(
self
,
'_c_code'
):
self
.
init_c_code
()
# self._c_code and self.nodenames
self
.
init_c_code
()
# self._c_code and self.nodenames
return
(
self
.
_c_code
==
other
.
_c_code
)
def
__hash__
(
self
):
if
not
hasattr
(
self
,
'_c_code'
):
self
.
init_c_code
()
# self._c_code and self.nodenames
self
.
init_c_code
()
# self._c_code and self.nodenames
rval
=
hash
((
type
(
self
),
self
.
nin
,
self
.
nout
,
...
...
theano/tensor/basic.py
浏览文件 @
37d5f777
...
...
@@ -2774,6 +2774,7 @@ class Alloc(gof.Op):
are lifted, the first argument to fill can often be pruned from the graph.
"""
_f16_ok
=
True
__props__
=
()
def
validate_shape
(
self
,
shape
):
...
...
theano/tensor/nnet/sigm.py
浏览文件 @
37d5f777
...
...
@@ -352,7 +352,7 @@ class ScalarSoftplus(scalar.UnaryScalarOp):
# float16 limits: -17.0, 6.0
# We use the float32 limits for float16 for now as the
# computation will happen
d
in float32 anyway.
# computation will happen in float32 anyway.
if
(
node
.
inputs
[
0
]
.
type
==
scalar
.
float32
or
node
.
inputs
[
0
]
.
type
==
scalar
.
float16
):
return
"""
%(z)
s =
%(x)
s < -103.0f ? 0.0 :
%(x)
s > 14.0f ?
%(x)
s : log1p(exp(
%(x)
s));"""
%
locals
()
...
...
theano/tensor/opt.py
浏览文件 @
37d5f777
...
...
@@ -2247,7 +2247,7 @@ class Assert(T.Op):
>>> func = theano.function([x], assert_op(x, x.size<2))
"""
_f16_ok
=
True
__props__
=
(
'msg'
,)
view_map
=
{
0
:
[
0
]}
...
...
@@ -6063,20 +6063,24 @@ def local_log1p(node):
log_arg
.
owner
.
inputs
,
only_process_constants
=
True
)
# scalar_inputs are potentially dimshuffled and fill'd scalars
if
scalars
and
numpy
.
allclose
(
numpy
.
sum
(
scalars
),
1
):
if
not
nonconsts
:
pass
# leave for constant-merge
if
len
(
nonconsts
)
==
1
:
return
_fill_chain
(
T
.
log1p
(
nonconsts
[
0
]),
scalar_inputs
)
else
:
return
_fill_chain
(
T
.
log1p
(
T
.
add
(
*
nonconsts
)),
scalar_inputs
)
if
nonconsts
:
if
len
(
nonconsts
)
>
1
:
ninp
=
T
.
add
(
*
nonconsts
)
else
:
ninp
=
nonconsts
[
0
]
if
ninp
.
dtype
!=
log_arg
.
type
.
dtype
:
ninp
=
ninp
.
astype
(
node
.
outputs
[
0
]
.
dtype
)
return
_fill_chain
(
T
.
log1p
(
ninp
),
scalar_inputs
)
elif
log_arg
.
owner
and
log_arg
.
owner
.
op
==
T
.
sub
:
one
=
T
.
extract_constant
(
log_arg
.
owner
.
inputs
[
0
],
only_process_constants
=
True
)
if
one
!=
1
:
return
return
[
T
.
log1p
(
T
.
neg
(
log_arg
.
owner
.
inputs
[
1
]))]
other
=
log_arg
.
owner
.
inputs
[
1
]
if
other
.
dtype
!=
log_arg
.
dtype
:
other
=
other
.
astype
(
log_arg
.
dtype
)
return
[
T
.
log1p
(
T
.
neg
(
other
))]
# TODO: in canonicalize, change log10 and log2 -> log
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论