Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
645557f9
提交
645557f9
authored
10月 02, 2015
作者:
Pascal Lamblin
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #3476 from abergeron/move_config
Multiple fixes preparing for multi-gpu
上级
1ec1cd9b
71dea2cf
显示空白字符变更
内嵌
并排
正在显示
29 个修改的文件
包含
659 行增加
和
951 行删除
+659
-951
setup.cfg
setup.cfg
+3
-0
__init__.py
theano/__init__.py
+4
-2
configdefaults.py
theano/configdefaults.py
+111
-18
configparser.py
theano/configparser.py
+11
-7
op.py
theano/gof/op.py
+8
-4
__init__.py
theano/sandbox/cuda/__init__.py
+1
-1
dnn.py
theano/sandbox/cuda/dnn.py
+6
-6
nvcc_compiler.py
theano/sandbox/cuda/nvcc_compiler.py
+3
-70
dnn_flags.py
theano/sandbox/dnn_flags.py
+0
-42
__init__.py
theano/sandbox/gpuarray/__init__.py
+5
-10
basic_ops.py
theano/sandbox/gpuarray/basic_ops.py
+28
-258
conv.py
theano/sandbox/gpuarray/conv.py
+5
-25
dnn.py
theano/sandbox/gpuarray/dnn.py
+21
-46
elemwise.py
theano/sandbox/gpuarray/elemwise.py
+43
-77
kernel_codegen.py
theano/sandbox/gpuarray/kernel_codegen.py
+2
-3
neighbours.py
theano/sandbox/gpuarray/neighbours.py
+4
-26
nerv.py
theano/sandbox/gpuarray/nerv.py
+2
-2
nnet.py
theano/sandbox/gpuarray/nnet.py
+23
-64
opt.py
theano/sandbox/gpuarray/opt.py
+2
-2
opt_util.py
theano/sandbox/gpuarray/opt_util.py
+219
-20
subtensor.py
theano/sandbox/gpuarray/subtensor.py
+1
-11
test_basic_ops.py
theano/sandbox/gpuarray/tests/test_basic_ops.py
+23
-73
test_blas.py
theano/sandbox/gpuarray/tests/test_blas.py
+23
-20
test_conv_cuda_ndarray.py
theano/sandbox/gpuarray/tests/test_conv_cuda_ndarray.py
+98
-111
test_neighbours.py
theano/sandbox/gpuarray/tests/test_neighbours.py
+1
-3
test_nnet.py
theano/sandbox/gpuarray/tests/test_nnet.py
+7
-30
test_type.py
theano/sandbox/gpuarray/tests/test_type.py
+0
-3
type.py
theano/sandbox/gpuarray/type.py
+5
-4
test_flake8.py
theano/tests/test_flake8.py
+0
-13
没有找到文件。
setup.cfg
浏览文件 @
645557f9
[nosetest]
[nosetest]
match=^test
match=^test
nocapture=1
nocapture=1
[flake8]
ignore=E501,E123,E133
theano/__init__.py
浏览文件 @
645557f9
...
@@ -109,8 +109,10 @@ if config.device.startswith('gpu') or config.init_gpu_device.startswith('gpu'):
...
@@ -109,8 +109,10 @@ if config.device.startswith('gpu') or config.init_gpu_device.startswith('gpu'):
theano
.
sandbox
.
cuda
.
tests
.
test_driver
.
test_nvidia_driver1
()
theano
.
sandbox
.
cuda
.
tests
.
test_driver
.
test_nvidia_driver1
()
if
config
.
device
.
startswith
(
'cuda'
)
or
config
.
device
.
startswith
(
'opencl'
)
or
\
if
(
config
.
device
.
startswith
(
'cuda'
)
or
config
.
gpuarray
.
init_device
!=
''
:
config
.
device
.
startswith
(
'opencl'
)
or
config
.
init_gpu_device
.
startswith
(
'cuda'
)
or
config
.
init_gpu_device
.
startswith
(
'opencl'
)):
import
theano.sandbox.gpuarray
import
theano.sandbox.gpuarray
# Use config.numpy to call numpy.seterr
# Use config.numpy to call numpy.seterr
...
...
theano/configdefaults.py
浏览文件 @
645557f9
...
@@ -73,19 +73,19 @@ class DeviceParam(ConfigParam):
...
@@ -73,19 +73,19 @@ class DeviceParam(ConfigParam):
self
.
default
=
default
self
.
default
=
default
def
filter
(
val
):
def
filter
(
val
):
if
val
.
startswith
(
'cpu'
)
or
val
.
startswith
(
'gpu'
)
\
if
val
==
self
.
default
or
val
.
startswith
(
'gpu'
)
\
or
val
.
startswith
(
'opencl'
)
or
val
.
startswith
(
'cuda'
):
or
val
.
startswith
(
'opencl'
)
or
val
.
startswith
(
'cuda'
):
return
val
return
val
else
:
else
:
raise
ValueError
((
'Invalid value ("
%
s") for configuration '
raise
ValueError
((
'Invalid value ("
%
s") for configuration '
'variable "
%
s". Valid options start with '
'variable "
%
s". Valid options start with '
'one of "
cpu
", "gpu", "opencl", "cuda"'
'one of "
%
s
", "gpu", "opencl", "cuda"'
%
(
val
,
self
.
fullname
)))
%
(
self
.
default
,
val
,
self
.
fullname
)))
over
=
kwargs
.
get
(
"allow_override"
,
True
)
over
=
kwargs
.
get
(
"allow_override"
,
True
)
super
(
DeviceParam
,
self
)
.
__init__
(
default
,
filter
,
over
)
super
(
DeviceParam
,
self
)
.
__init__
(
default
,
filter
,
over
)
def
__str__
(
self
):
def
__str__
(
self
):
return
'
%
s (
cpu, gpu*, opencl*, cuda*) '
%
(
self
.
fullname
,
)
return
'
%
s (
%
s, gpu*, opencl*, cuda*) '
%
(
self
.
fullname
,
self
.
default
)
AddConfigVar
(
AddConfigVar
(
'device'
,
'device'
,
...
@@ -94,14 +94,6 @@ AddConfigVar(
...
@@ -94,14 +94,6 @@ AddConfigVar(
"on it. Do not use upper case letters, only lower case even if "
"on it. Do not use upper case letters, only lower case even if "
"NVIDIA use capital letters."
),
"NVIDIA use capital letters."
),
DeviceParam
(
'cpu'
,
allow_override
=
False
),
DeviceParam
(
'cpu'
,
allow_override
=
False
),
in_c_key
=
False
,)
AddConfigVar
(
'gpuarray.init_device'
,
"""
Device to initialize for gpuarray use without moving
computations automatically.
"""
,
StrParam
(
''
),
in_c_key
=
False
)
in_c_key
=
False
)
AddConfigVar
(
AddConfigVar
(
...
@@ -110,12 +102,7 @@ AddConfigVar(
...
@@ -110,12 +102,7 @@ AddConfigVar(
"Unlike 'device', setting this option will NOT move computations, "
"Unlike 'device', setting this option will NOT move computations, "
"nor shared variables, to the specified GPU. "
"nor shared variables, to the specified GPU. "
"It can be used to run GPU-specific tests on a particular GPU."
),
"It can be used to run GPU-specific tests on a particular GPU."
),
EnumStr
(
''
,
'gpu'
,
DeviceParam
(
''
,
allow_override
=
False
),
'gpu0'
,
'gpu1'
,
'gpu2'
,
'gpu3'
,
'gpu4'
,
'gpu5'
,
'gpu6'
,
'gpu7'
,
'gpu8'
,
'gpu9'
,
'gpu10'
,
'gpu11'
,
'gpu12'
,
'gpu13'
,
'gpu14'
,
'gpu15'
,
allow_override
=
False
),
in_c_key
=
False
)
in_c_key
=
False
)
AddConfigVar
(
AddConfigVar
(
...
@@ -131,6 +118,112 @@ AddConfigVar(
...
@@ -131,6 +118,112 @@ AddConfigVar(
in_c_key
=
False
)
in_c_key
=
False
)
def
default_cuda_root
():
v
=
os
.
getenv
(
'CUDA_ROOT'
,
""
)
if
v
:
return
v
s
=
os
.
getenv
(
"PATH"
)
if
not
s
:
return
''
for
dir
in
s
.
split
(
os
.
path
.
pathsep
):
if
os
.
path
.
exists
(
os
.
path
.
join
(
dir
,
"nvcc"
)):
return
os
.
path
.
split
(
dir
)[
0
]
return
''
AddConfigVar
(
'cuda.root'
,
"""directory with bin/, lib/, include/ for cuda utilities.
This directory is included via -L and -rpath when linking
dynamically compiled modules. If AUTO and nvcc is in the
path, it will use one of nvcc parent directory. Otherwise
/usr/local/cuda will be used. Leave empty to prevent extra
linker directives. Default: environment variable "CUDA_ROOT"
or else "AUTO".
"""
,
StrParam
(
default_cuda_root
),
in_c_key
=
False
)
def
filter_nvcc_flags
(
s
):
assert
isinstance
(
s
,
str
)
flags
=
[
flag
for
flag
in
s
.
split
(
' '
)
if
flag
]
if
any
([
f
for
f
in
flags
if
not
f
.
startswith
(
"-"
)]):
raise
ValueError
(
"Theano nvcc.flags support only parameter/value pairs without"
" space between them. e.g.: '--machine 64' is not supported,"
" but '--machine=64' is supported. Please add the '=' symbol."
" nvcc.flags value is '
%
s'"
%
s
)
return
' '
.
join
(
flags
)
AddConfigVar
(
'nvcc.flags'
,
"Extra compiler flags for nvcc"
,
ConfigParam
(
""
,
filter_nvcc_flags
),
# Not needed in c key as it is already added.
# We remove it as we don't make the md5 of config to change
# if theano.sandbox.cuda is loaded or not.
in_c_key
=
False
)
AddConfigVar
(
'nvcc.compiler_bindir'
,
"If defined, nvcc compiler driver will seek g++ and gcc"
" in this directory"
,
StrParam
(
""
),
in_c_key
=
False
)
AddConfigVar
(
'nvcc.fastmath'
,
""
,
BoolParam
(
False
),
# Not needed in c key as it is already added.
# We remove it as we don't make the md5 of config to change
# if theano.sandbox.cuda is loaded or not.
in_c_key
=
False
)
AddConfigVar
(
'gpuarray.sync'
,
"""If True, every op will make sure its work is done before
returning. Setting this to True will slow down execution,
but give much more accurate results in profiling."""
,
BoolParam
(
False
),
in_c_key
=
True
)
AddConfigVar
(
'dnn.conv.workmem'
,
"This flag is deprecated; use dnn.conv.algo_fwd."
,
EnumStr
(
''
),
in_c_key
=
False
)
AddConfigVar
(
'dnn.conv.workmem_bwd'
,
"This flag is deprecated; use dnn.conv.algo_bwd."
,
EnumStr
(
''
),
in_c_key
=
False
)
AddConfigVar
(
'dnn.conv.algo_fwd'
,
"Default implementation to use for CuDNN forward convolution."
,
EnumStr
(
'small'
,
'none'
,
'large'
,
'fft'
,
'guess_once'
,
'guess_on_shape_change'
,
'time_once'
,
'time_on_shape_change'
),
in_c_key
=
False
)
AddConfigVar
(
'dnn.conv.algo_bwd'
,
"Default implementation to use for CuDNN backward convolution."
,
EnumStr
(
'none'
,
'deterministic'
,
'fft'
,
'guess_once'
,
'guess_on_shape_change'
,
'time_once'
,
'time_on_shape_change'
),
in_c_key
=
False
)
def
default_dnn_path
(
suffix
):
def
f
(
suffix
=
suffix
):
if
config
.
cuda
.
root
==
''
:
return
''
return
os
.
path
.
join
(
config
.
cuda
.
root
,
suffix
)
return
f
AddConfigVar
(
'dnn.include_path'
,
"Location of the cudnn header (defaults to the cuda root)"
,
StrParam
(
default_dnn_path
(
'include'
)))
AddConfigVar
(
'dnn.library_path'
,
"Location of the cudnn header (defaults to the cuda root)"
,
StrParam
(
default_dnn_path
(
'lib64'
)))
# This flag determines whether or not to raise error/warning message if
# This flag determines whether or not to raise error/warning message if
# there is a CPU Op in the computational graph.
# there is a CPU Op in the computational graph.
AddConfigVar
(
AddConfigVar
(
...
...
theano/configparser.py
浏览文件 @
645557f9
...
@@ -102,7 +102,7 @@ def change_flags(**kwargs):
...
@@ -102,7 +102,7 @@ def change_flags(**kwargs):
l
=
[
v
for
v
in
theano
.
configparser
.
_config_var_list
l
=
[
v
for
v
in
theano
.
configparser
.
_config_var_list
if
v
.
fullname
==
k
]
if
v
.
fullname
==
k
]
assert
len
(
l
)
==
1
assert
len
(
l
)
==
1
old_val
[
k
]
=
l
[
0
]
.
__get__
()
old_val
[
k
]
=
l
[
0
]
.
__get__
(
True
,
None
)
try
:
try
:
for
k
in
kwargs
:
for
k
in
kwargs
:
l
=
[
v
for
v
in
theano
.
configparser
.
_config_var_list
l
=
[
v
for
v
in
theano
.
configparser
.
_config_var_list
...
@@ -167,7 +167,7 @@ def _config_print(thing, buf):
...
@@ -167,7 +167,7 @@ def _config_print(thing, buf):
for
cv
in
_config_var_list
:
for
cv
in
_config_var_list
:
print
(
cv
,
file
=
buf
)
print
(
cv
,
file
=
buf
)
print
(
" Doc: "
,
cv
.
doc
,
file
=
buf
)
print
(
" Doc: "
,
cv
.
doc
,
file
=
buf
)
print
(
" Value: "
,
cv
.
__get__
(),
file
=
buf
)
print
(
" Value: "
,
cv
.
__get__
(
True
,
None
),
file
=
buf
)
print
(
""
,
file
=
buf
)
print
(
""
,
file
=
buf
)
...
@@ -182,7 +182,7 @@ def get_config_md5():
...
@@ -182,7 +182,7 @@ def get_config_md5():
all_opts
=
sorted
([
c
for
c
in
_config_var_list
if
c
.
in_c_key
],
all_opts
=
sorted
([
c
for
c
in
_config_var_list
if
c
.
in_c_key
],
key
=
lambda
cv
:
cv
.
fullname
)
key
=
lambda
cv
:
cv
.
fullname
)
return
theano
.
gof
.
utils
.
hash_from_code
(
'
\n
'
.
join
(
return
theano
.
gof
.
utils
.
hash_from_code
(
'
\n
'
.
join
(
[
'
%
s =
%
s'
%
(
cv
.
fullname
,
cv
.
__get__
())
for
cv
in
all_opts
]))
[
'
%
s =
%
s'
%
(
cv
.
fullname
,
cv
.
__get__
(
True
,
None
))
for
cv
in
all_opts
]))
class
TheanoConfigParser
(
object
):
class
TheanoConfigParser
(
object
):
...
@@ -270,14 +270,14 @@ def AddConfigVar(name, doc, configparam, root=config, in_c_key=True):
...
@@ -270,14 +270,14 @@ def AddConfigVar(name, doc, configparam, root=config, in_c_key=True):
# Trigger a read of the value from config files and env vars
# Trigger a read of the value from config files and env vars
# This allow to filter wrong value from the user.
# This allow to filter wrong value from the user.
if
not
callable
(
configparam
.
default
):
if
not
callable
(
configparam
.
default
):
configparam
.
__get__
()
configparam
.
__get__
(
root
,
type
(
root
)
)
else
:
else
:
# We do not want to evaluate now the default value
# We do not want to evaluate now the default value
# when it is a callable.
# when it is a callable.
try
:
try
:
fetch_val_for_key
(
configparam
.
fullname
)
fetch_val_for_key
(
configparam
.
fullname
)
# The user provided a value, filter it now.
# The user provided a value, filter it now.
configparam
.
__get__
()
configparam
.
__get__
(
root
,
type
(
root
)
)
except
KeyError
:
except
KeyError
:
pass
pass
setattr
(
root
.
__class__
,
sections
[
0
],
configparam
)
setattr
(
root
.
__class__
,
sections
[
0
],
configparam
)
...
@@ -294,6 +294,7 @@ class ConfigParam(object):
...
@@ -294,6 +294,7 @@ class ConfigParam(object):
self
.
default
=
default
self
.
default
=
default
self
.
filter
=
filter
self
.
filter
=
filter
self
.
allow_override
=
allow_override
self
.
allow_override
=
allow_override
self
.
is_default
=
True
# N.B. --
# N.B. --
# self.fullname # set by AddConfigVar
# self.fullname # set by AddConfigVar
# self.doc # set by AddConfigVar
# self.doc # set by AddConfigVar
...
@@ -304,16 +305,19 @@ class ConfigParam(object):
...
@@ -304,16 +305,19 @@ class ConfigParam(object):
# Calling `filter` here may actually be harmful if the default value is
# Calling `filter` here may actually be harmful if the default value is
# invalid and causes a crash or has unwanted side effects.
# invalid and causes a crash or has unwanted side effects.
def
__get__
(
self
,
*
args
):
def
__get__
(
self
,
cls
,
type_
):
if
cls
is
None
:
return
self
if
not
hasattr
(
self
,
'val'
):
if
not
hasattr
(
self
,
'val'
):
try
:
try
:
val_str
=
fetch_val_for_key
(
self
.
fullname
)
val_str
=
fetch_val_for_key
(
self
.
fullname
)
self
.
is_default
=
False
except
KeyError
:
except
KeyError
:
if
callable
(
self
.
default
):
if
callable
(
self
.
default
):
val_str
=
self
.
default
()
val_str
=
self
.
default
()
else
:
else
:
val_str
=
self
.
default
val_str
=
self
.
default
self
.
__set__
(
None
,
val_str
)
self
.
__set__
(
cls
,
val_str
)
# print "RVAL", self.val
# print "RVAL", self.val
return
self
.
val
return
self
.
val
...
...
theano/gof/op.py
浏览文件 @
645557f9
...
@@ -1171,7 +1171,7 @@ def apply_meth(tag):
...
@@ -1171,7 +1171,7 @@ def apply_meth(tag):
code
=
self
.
code_sections
[
tag
]
code
=
self
.
code_sections
[
tag
]
define_macros
,
undef_macros
=
self
.
get_c_macros
(
node
,
name
)
define_macros
,
undef_macros
=
self
.
get_c_macros
(
node
,
name
)
return
os
.
linesep
.
join
([
define_macros
,
code
,
return
os
.
linesep
.
join
([
''
,
define_macros
,
code
,
undef_macros
])
undef_macros
])
else
:
else
:
raise
utils
.
MethodNotDefined
(
raise
utils
.
MethodNotDefined
(
...
@@ -1428,7 +1428,7 @@ class COp(Op):
...
@@ -1428,7 +1428,7 @@ class COp(Op):
def_macros
,
undef_macros
=
self
.
get_c_macros
(
node
,
name
)
def_macros
,
undef_macros
=
self
.
get_c_macros
(
node
,
name
)
def_sub
,
undef_sub
=
self
.
get_sub_macros
(
sub
)
def_sub
,
undef_sub
=
self
.
get_sub_macros
(
sub
)
return
os
.
linesep
.
join
([
def_macros
,
def_sub
,
return
os
.
linesep
.
join
([
''
,
def_macros
,
def_sub
,
op_code
,
op_code
,
undef_sub
,
undef_macros
])
undef_sub
,
undef_macros
])
else
:
else
:
...
@@ -1442,17 +1442,21 @@ class COp(Op):
...
@@ -1442,17 +1442,21 @@ class COp(Op):
define_macros
,
undef_macros
=
self
.
get_c_macros
(
node
,
name
,
define_macros
,
undef_macros
=
self
.
get_c_macros
(
node
,
name
,
check_input
=
False
)
check_input
=
False
)
ctx
=
""
if
'context'
in
sub
:
ctx
=
",
%
s"
%
(
sub
[
'context'
],)
# Generate the C code
# Generate the C code
return
"""
return
"""
%(define_macros)
s
%(define_macros)
s
{
{
if (
%(func_name)
s(
%(func_args)
s) != 0) {
if (
%(func_name)
s(
%(func_args)
s
%(ctx)
s
) != 0) {
%(fail)
s
%(fail)
s
}
}
}
}
%(undef_macros)
s
%(undef_macros)
s
"""
%
dict
(
func_name
=
self
.
func_name
,
"""
%
dict
(
func_name
=
self
.
func_name
,
fail
=
sub
[
'fail'
],
fail
=
sub
[
'fail'
],
ctx
=
ctx
,
func_args
=
self
.
format_c_function_args
(
inp
,
out
),
func_args
=
self
.
format_c_function_args
(
inp
,
out
),
define_macros
=
define_macros
,
define_macros
=
define_macros
,
undef_macros
=
undef_macros
)
undef_macros
=
undef_macros
)
...
...
theano/sandbox/cuda/__init__.py
浏览文件 @
645557f9
...
@@ -535,7 +535,7 @@ def handle_shared_float32(tf):
...
@@ -535,7 +535,7 @@ def handle_shared_float32(tf):
# import dependency. So we also test it in the file theano/__init__.py
# import dependency. So we also test it in the file theano/__init__.py
if
config
.
device
.
startswith
(
'gpu'
):
if
config
.
device
.
startswith
(
'gpu'
):
use
(
device
=
config
.
device
,
force
=
config
.
force_device
,
test_driver
=
False
)
use
(
device
=
config
.
device
,
force
=
config
.
force_device
,
test_driver
=
False
)
elif
config
.
init_gpu_device
:
elif
config
.
init_gpu_device
.
startswith
(
'gpu'
)
:
assert
config
.
device
==
"cpu"
,
(
assert
config
.
device
==
"cpu"
,
(
"We can use the Theano flag init_gpu_device"
"We can use the Theano flag init_gpu_device"
" only when the Theano flag device=='cpu'"
)
" only when the Theano flag device=='cpu'"
)
...
...
theano/sandbox/cuda/dnn.py
浏览文件 @
645557f9
...
@@ -27,8 +27,6 @@ from theano.sandbox.cuda import gpu_seqopt, register_opt
...
@@ -27,8 +27,6 @@ from theano.sandbox.cuda import gpu_seqopt, register_opt
from
theano.sandbox.cuda.nvcc_compiler
import
NVCC_compiler
from
theano.sandbox.cuda.nvcc_compiler
import
NVCC_compiler
import
theano.sandbox.dnn_flags
def
dnn_available
():
def
dnn_available
():
if
dnn_available
.
avail
is
None
:
if
dnn_available
.
avail
is
None
:
...
@@ -57,15 +55,17 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
...
@@ -57,15 +55,17 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
return 1;
return 1;
}
}
"""
"""
params
=
[
"-l"
,
"cudnn"
,
"-I"
+
os
.
path
.
dirname
(
__file__
)]
if
config
.
dnn
.
include_path
:
params
.
append
(
"-I"
+
config
.
dnn
.
include_path
)
if
config
.
dnn
.
library_path
:
params
.
append
(
"-L"
+
config
.
dnn
.
library_path
)
# Do not run here the test program. It would run on the
# Do not run here the test program. It would run on the
# default gpu, not the one selected by the user. If mixed
# default gpu, not the one selected by the user. If mixed
# GPU are installed or if the GPUs are configured in
# GPU are installed or if the GPUs are configured in
# exclusive mode, this cause bad detection.
# exclusive mode, this cause bad detection.
comp
,
out
,
err
=
NVCC_compiler
.
try_flags
(
comp
,
out
,
err
=
NVCC_compiler
.
try_flags
(
[
"-l"
,
"cudnn"
,
"-I"
+
os
.
path
.
dirname
(
__file__
),
params
=
params
,
preambule
=
preambule
,
body
=
body
,
"-I"
+
config
.
dnn
.
include_path
,
"-L"
+
config
.
dnn
.
library_path
],
preambule
=
preambule
,
body
=
body
,
try_run
=
False
,
output
=
True
)
try_run
=
False
,
output
=
True
)
dnn_available
.
avail
=
comp
dnn_available
.
avail
=
comp
...
...
theano/sandbox/cuda/nvcc_compiler.py
浏览文件 @
645557f9
...
@@ -8,6 +8,7 @@ import warnings
...
@@ -8,6 +8,7 @@ import warnings
import
numpy
import
numpy
from
theano
import
config
from
theano.compat
import
decode
,
decode_iter
from
theano.compat
import
decode
,
decode_iter
from
theano.gof
import
local_bitwidth
from
theano.gof
import
local_bitwidth
from
theano.gof.utils
import
hash_from_file
from
theano.gof.utils
import
hash_from_file
...
@@ -19,67 +20,6 @@ from theano.misc.windows import output_subprocess_Popen
...
@@ -19,67 +20,6 @@ from theano.misc.windows import output_subprocess_Popen
_logger
=
logging
.
getLogger
(
"theano.sandbox.cuda.nvcc_compiler"
)
_logger
=
logging
.
getLogger
(
"theano.sandbox.cuda.nvcc_compiler"
)
from
theano.configparser
import
(
config
,
AddConfigVar
,
StrParam
,
BoolParam
,
ConfigParam
)
AddConfigVar
(
'nvcc.compiler_bindir'
,
"If defined, nvcc compiler driver will seek g++ and gcc"
" in this directory"
,
StrParam
(
""
),
in_c_key
=
False
)
user_provided_cuda_root
=
True
def
default_cuda_root
():
global
user_provided_cuda_root
v
=
os
.
getenv
(
'CUDA_ROOT'
,
""
)
user_provided_cuda_root
=
False
if
v
:
return
v
return
find_cuda_root
()
AddConfigVar
(
'cuda.root'
,
"""directory with bin/, lib/, include/ for cuda utilities.
This directory is included via -L and -rpath when linking
dynamically compiled modules. If AUTO and nvcc is in the
path, it will use one of nvcc parent directory. Otherwise
/usr/local/cuda will be used. Leave empty to prevent extra
linker directives. Default: environment variable "CUDA_ROOT"
or else "AUTO".
"""
,
StrParam
(
default_cuda_root
),
in_c_key
=
False
)
def
filter_nvcc_flags
(
s
):
assert
isinstance
(
s
,
str
)
flags
=
[
flag
for
flag
in
s
.
split
(
' '
)
if
flag
]
if
any
([
f
for
f
in
flags
if
not
f
.
startswith
(
"-"
)]):
raise
ValueError
(
"Theano nvcc.flags support only parameter/value pairs without"
" space between them. e.g.: '--machine 64' is not supported,"
" but '--machine=64' is supported. Please add the '=' symbol."
" nvcc.flags value is '
%
s'"
%
s
)
return
' '
.
join
(
flags
)
AddConfigVar
(
'nvcc.flags'
,
"Extra compiler flags for nvcc"
,
ConfigParam
(
""
,
filter_nvcc_flags
),
# Not needed in c key as it is already added.
# We remove it as we don't make the md5 of config to change
# if theano.sandbox.cuda is loaded or not.
in_c_key
=
False
)
AddConfigVar
(
'nvcc.fastmath'
,
""
,
BoolParam
(
False
),
# Not needed in c key as it is already added.
# We remove it as we don't make the md5 of config to change
# if theano.sandbox.cuda is loaded or not.
in_c_key
=
False
)
nvcc_path
=
'nvcc'
nvcc_path
=
'nvcc'
nvcc_version
=
None
nvcc_version
=
None
...
@@ -115,14 +55,6 @@ def is_nvcc_available():
...
@@ -115,14 +55,6 @@ def is_nvcc_available():
return
False
return
False
def
find_cuda_root
():
s
=
os
.
getenv
(
"PATH"
)
if
not
s
:
return
for
dir
in
s
.
split
(
os
.
path
.
pathsep
):
if
os
.
path
.
exists
(
os
.
path
.
join
(
dir
,
"nvcc"
)):
return
os
.
path
.
split
(
dir
)[
0
]
rpath_defaults
=
[]
rpath_defaults
=
[]
...
@@ -359,7 +291,8 @@ class NVCC_compiler(Compiler):
...
@@ -359,7 +291,8 @@ class NVCC_compiler(Compiler):
# provided an cuda.root flag, we need to add one, but
# provided an cuda.root flag, we need to add one, but
# otherwise, we don't add it. See gh-1540 and
# otherwise, we don't add it. See gh-1540 and
# https://wiki.debian.org/RpathIssue for details.
# https://wiki.debian.org/RpathIssue for details.
if
(
user_provided_cuda_root
and
if
(
not
type
(
config
.
cuda
)
.
root
.
is_default
and
os
.
path
.
exists
(
os
.
path
.
join
(
config
.
cuda
.
root
,
'lib'
))):
os
.
path
.
exists
(
os
.
path
.
join
(
config
.
cuda
.
root
,
'lib'
))):
rpaths
.
append
(
os
.
path
.
join
(
config
.
cuda
.
root
,
'lib'
))
rpaths
.
append
(
os
.
path
.
join
(
config
.
cuda
.
root
,
'lib'
))
...
...
theano/sandbox/dnn_flags.py
deleted
100644 → 0
浏览文件 @
1ec1cd9b
"""
This module contains the configuration flags for cudnn support.
Those are shared between the cuda and gpuarray backend which is why
they are in this file.
"""
import
os.path
from
theano.configparser
import
AddConfigVar
,
EnumStr
,
StrParam
from
theano
import
config
AddConfigVar
(
'dnn.conv.workmem'
,
"This flag is deprecated; use dnn.conv.algo_fwd."
,
EnumStr
(
''
),
in_c_key
=
False
)
AddConfigVar
(
'dnn.conv.workmem_bwd'
,
"This flag is deprecated; use dnn.conv.algo_bwd."
,
EnumStr
(
''
),
in_c_key
=
False
)
AddConfigVar
(
'dnn.conv.algo_fwd'
,
"Default implementation to use for CuDNN forward convolution."
,
EnumStr
(
'small'
,
'none'
,
'large'
,
'fft'
,
'guess_once'
,
'guess_on_shape_change'
,
'time_once'
,
'time_on_shape_change'
),
in_c_key
=
False
)
AddConfigVar
(
'dnn.conv.algo_bwd'
,
"Default implementation to use for CuDNN backward convolution."
,
EnumStr
(
'none'
,
'deterministic'
,
'fft'
,
'guess_once'
,
'guess_on_shape_change'
,
'time_once'
,
'time_on_shape_change'
),
in_c_key
=
False
)
AddConfigVar
(
'dnn.include_path'
,
"Location of the cudnn header (defaults to the cuda root)"
,
StrParam
(
lambda
:
os
.
path
.
join
(
config
.
cuda
.
root
,
'include'
)))
AddConfigVar
(
'dnn.library_path'
,
"Location of the cudnn header (defaults to the cuda root)"
,
StrParam
(
lambda
:
os
.
path
.
join
(
config
.
cuda
.
root
,
'lib64'
)))
theano/sandbox/gpuarray/__init__.py
浏览文件 @
645557f9
...
@@ -19,13 +19,6 @@ try:
...
@@ -19,13 +19,6 @@ try:
except
ImportError
:
except
ImportError
:
pygpu
=
None
pygpu
=
None
AddConfigVar
(
'gpuarray.sync'
,
"""If True, every op will make sure its work is done before
returning. Setting this to True will slow down execution,
but give much more accurate results in profiling."""
,
BoolParam
(
False
),
in_c_key
=
True
)
# This is for documentation not to depend on the availability of pygpu
# This is for documentation not to depend on the availability of pygpu
from
.type
import
(
GpuArrayType
,
GpuArrayVariable
,
GpuArrayConstant
,
from
.type
import
(
GpuArrayType
,
GpuArrayVariable
,
GpuArrayConstant
,
GpuArraySharedVariable
,
gpuarray_shared_constructor
)
GpuArraySharedVariable
,
gpuarray_shared_constructor
)
...
@@ -57,8 +50,9 @@ if pygpu:
...
@@ -57,8 +50,9 @@ if pygpu:
import
theano.compile
import
theano.compile
theano
.
compile
.
shared_constructor
(
gpuarray_shared_constructor
)
theano
.
compile
.
shared_constructor
(
gpuarray_shared_constructor
)
optdb
.
add_tags
(
'gpuarray_opt'
,
'fast_run'
,
'fast_compile'
)
optdb
.
add_tags
(
'gpuarray_opt'
,
'fast_run'
,
'fast_compile'
)
elif
config
.
gpuarray
.
init_device
!=
''
:
elif
(
config
.
init_gpu_device
.
startswith
(
'cuda'
)
or
init_dev
(
config
.
gpuarray
.
init_device
)
config
.
init_gpu_device
.
startswith
(
'opencl'
)):
init_dev
(
config
.
init_gpu_device
)
from
.basic_ops
import
(
GpuAlloc
,
GpuContiguous
,
GpuEye
,
GpuFromHost
,
from
.basic_ops
import
(
GpuAlloc
,
GpuContiguous
,
GpuEye
,
GpuFromHost
,
GpuJoin
,
GpuReshape
,
GpuSplit
,
HostFromGpu
)
GpuJoin
,
GpuReshape
,
GpuSplit
,
HostFromGpu
)
...
@@ -70,7 +64,8 @@ if pygpu:
...
@@ -70,7 +64,8 @@ if pygpu:
except
Exception
:
except
Exception
:
error
(
"Could not initialize pygpu, support disabled"
,
exc_info
=
True
)
error
(
"Could not initialize pygpu, support disabled"
,
exc_info
=
True
)
else
:
else
:
if
(
config
.
gpuarray
.
init_device
!=
''
or
if
(
config
.
init_gpu_device
.
startswith
(
'cuda'
)
or
config
.
init_gpu_device
.
startswith
(
'opencl'
)
or
config
.
device
.
startswith
(
'opencl'
)
or
config
.
device
.
startswith
(
'opencl'
)
or
config
.
device
.
startswith
(
'cuda'
)):
config
.
device
.
startswith
(
'cuda'
)):
error
(
"pygpu was configured but could not be imported"
,
exc_info
=
True
)
error
(
"pygpu was configured but could not be imported"
,
exc_info
=
True
)
theano/sandbox/gpuarray/basic_ops.py
浏览文件 @
645557f9
...
@@ -2,11 +2,9 @@ import os
...
@@ -2,11 +2,9 @@ import os
import
numpy
import
numpy
import
theano
from
theano
import
Op
,
Apply
,
Type
,
Variable
from
theano
import
Op
,
Apply
from
theano
import
tensor
,
config
from
theano
import
tensor
,
scalar
,
config
from
theano.gradient
import
grad_undefined
from
theano.gradient
import
grad_undefined
from
theano.scalar
import
Scalar
from
theano.tensor.basic
import
Alloc
,
Join
,
Split
from
theano.tensor.basic
import
Alloc
,
Join
,
Split
from
theano.gof
import
HideC
from
theano.gof
import
HideC
...
@@ -17,7 +15,7 @@ from six.moves import xrange
...
@@ -17,7 +15,7 @@ from six.moves import xrange
try
:
try
:
import
pygpu
import
pygpu
from
pygpu
import
gpuarray
,
elemwise
from
pygpu
import
gpuarray
except
ImportError
:
except
ImportError
:
pass
pass
...
@@ -293,7 +291,6 @@ class GpuFromHost(Op):
...
@@ -293,7 +291,6 @@ class GpuFromHost(Op):
def
perform
(
self
,
node
,
inp
,
out
):
def
perform
(
self
,
node
,
inp
,
out
):
x
,
=
inp
x
,
=
inp
z
,
=
out
z
,
=
out
type
=
node
.
outputs
[
0
]
.
type
z
[
0
]
=
gpuarray
.
array
(
x
)
z
[
0
]
=
gpuarray
.
array
(
x
)
def
grad
(
self
,
inputs
,
grads
):
def
grad
(
self
,
inputs
,
grads
):
...
@@ -312,254 +309,29 @@ class GpuFromHost(Op):
...
@@ -312,254 +309,29 @@ class GpuFromHost(Op):
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
return
"""
return
"""
PyGpuArrayObject *
%(name)
s_tmp;
%(name)
s_tmp = PyArray_GETCONTIGUOUS(
%(inp)
s);
if (
%(name)
s_tmp == NULL)
%(fail)
s
Py_XDECREF(
%(out)
s);
Py_XDECREF(
%(out)
s);
%(out)
s = pygpu_fromhostdata(PyArray_DATA(
%(
inp)
s
),
%(out)
s = pygpu_fromhostdata(PyArray_DATA(
%(
name)
s_tmp
),
get_typecode((PyObject *)PyArray_DESCR(
%(
inp)
s
)),
get_typecode((PyObject *)PyArray_DESCR(
%(
name)
s_tmp
)),
PyArray_NDIM(
%(
inp)
s
),
PyArray_NDIM(
%(
name)
s_tmp
),
(size_t *)PyArray_DIMS(
%(
inp)
s
),
(size_t *)PyArray_DIMS(
%(
name)
s_tmp
),
(ssize_t *)PyArray_STRIDES(
%(
inp)
s
),
(ssize_t *)PyArray_STRIDES(
%(
name)
s_tmp
),
pygpu_default_context(),
pygpu_default_context(),
Py_None);
Py_None);
if (
%(out)
s == NULL) {
Py_DECREF(
%(name)
s_tmp);
if (
%(out)
s == NULL)
%(fail)
s
%(fail)
s
}
"""
%
{
'name'
:
name
,
'inp'
:
inputs
[
0
],
"""
%
{
'name'
:
name
,
'inp'
:
inputs
[
0
],
'out'
:
outputs
[
0
],
'fail'
:
sub
[
'fail'
]}
'out'
:
outputs
[
0
],
'fail'
:
sub
[
'fail'
]}
def
c_code_cache_version
(
self
):
return
(
4
,)
gpu_from_host
=
GpuFromHost
()
class
GpuFromCuda
(
Op
):
view_map
=
{
0
:
[
0
]}
__props__
=
()
def
make_node
(
self
,
x
):
from
theano.sandbox.cuda
import
CudaNdarrayType
if
not
isinstance
(
x
.
type
,
CudaNdarrayType
):
raise
TypeError
(
x
)
return
Apply
(
self
,
[
x
],
[
GpuArrayType
(
broadcastable
=
x
.
broadcastable
,
dtype
=
x
.
dtype
)()])
def
perform
(
self
,
node
,
inp
,
out
):
x
,
=
inp
z
,
=
out
z
[
0
]
=
gpuarray
.
array
(
numpy
.
asarray
(
x
))
def
grad
(
self
,
inputs
,
grads
):
gz
,
=
grads
return
[
cuda_from_gpu
(
gz
)]
def
R_op
(
self
,
inputs
,
eval_points
):
ev
,
=
eval_points
if
isinstance
(
ev
,
GpuArrayType
):
return
[
cuda_from_gpu
(
ev
)]
else
:
return
ev
def
infer_shape
(
self
,
node
,
xshp
):
return
xshp
def
c_headers
(
self
):
return
[
'<cuda_ndarray.cuh>'
,
'<gpuarray/extension.h>'
,
'<gpuarray/types.h>'
,
'<cuda.h>'
]
def
c_header_dirs
(
self
):
import
cuda_ndarray
ret
=
[
os
.
path
.
dirname
(
cuda_ndarray
.
__file__
)]
cuda_root
=
config
.
cuda
.
root
if
cuda_root
:
ret
.
append
(
os
.
path
.
join
(
cuda_root
,
'include'
))
return
ret
def
c_lib_dirs
(
self
):
import
cuda_ndarray
ret
=
[
os
.
path
.
dirname
(
cuda_ndarray
.
__file__
)]
cuda_root
=
config
.
cuda
.
root
if
cuda_root
:
ret
.
append
(
os
.
path
.
join
(
cuda_root
,
'lib'
))
return
ret
def
c_libraries
(
self
):
return
[
'cudart'
,
'cublas'
,
'cuda'
]
def
c_support_code
(
self
):
return
"""
CUcontext (*cuda_get_ctx)(void *ctx);
gpudata *(*cuda_make_buf)(void *c, CUdeviceptr p, size_t sz);
"""
def
c_init_code
(
self
):
return
[
'cuda_get_ctx = (CUcontext (*)(void *))gpuarray_get_extension("cuda_get_ctx");'
,
'cuda_make_buf = (gpudata *(*)(void *, CUdeviceptr, size_t))gpuarray_get_extension("cuda_make_buf");'
]
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
return
"""
int
%(name)
serr;
gpudata *
%(name)
sdata;
CUcontext
%(name)
scur;
size_t *
%(name)
sdims;
ssize_t *
%(name)
sstr;
cuCtxGetCurrent(&
%(name)
scur);
if (
%(name)
scur != cuda_get_ctx(pygpu_default_context()->ctx)) {
PyErr_SetString(PyExc_ValueError, "Ambient cuda context is not the same as output context.");
%(fail)
s
}
%(name)
sdims = (size_t *)calloc(
%(in)
s->nd, sizeof(size_t));
if (
%(name)
sdims == NULL) {
PyErr_SetString(PyExc_MemoryError, "Can't allocate dimensions.");
%(fail)
s
}
%(name)
sstr = (ssize_t *)calloc(
%(in)
s->nd, sizeof(ssize_t));
if (
%(name)
sstr == NULL) {
free(
%(name)
sdims);
PyErr_SetString(PyExc_MemoryError, "Can't allocate strides.");
%(fail)
s
}
for (unsigned int i = 0; i <
%(in)
s->nd; i++) {
%(name)
sdims[i] = (size_t)CudaNdarray_HOST_DIMS(
%(in)
s)[i];
%(name)
sstr[i] = (ssize_t)CudaNdarray_HOST_STRIDES(
%(in)
s)[i]*4;
}
%(name)
sdata = cuda_make_buf(pygpu_default_context()->ctx,
(CUdeviceptr)
%(in)
s->devdata,
((size_t)
%(in)
s->data_allocated)*4);
if (
%(name)
sdata == NULL) {
Py_DECREF(
%(out)
s);
free(
%(name)
sdims);
free(
%(name)
sstr);
PyErr_SetString(PyExc_MemoryError, "Could not allocate gpudata structure.");
%(fail)
s
}
Py_XDECREF(
%(out)
s);
%(out)
s = pygpu_fromgpudata(
%(name)
sdata, 0, GA_FLOAT,
%(in)
s->nd,
%(name)
sdims,
%(name)
sstr,
pygpu_default_context(), 1,
(PyObject *)
%(in)
s,
(PyObject *)&PyGpuArrayType);
pygpu_default_context()->ops->buffer_release(
%(name)
sdata);
free(
%(name)
sdims);
free(
%(name)
sstr);
if (
%(out)
s == NULL) {
%(fail)
s
}
"""
%
{
'name'
:
name
,
'in'
:
inputs
[
0
],
'out'
:
outputs
[
0
],
'fail'
:
sub
[
'fail'
]}
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
return
(
5
,)
return
(
5
,)
gpu_from_cuda
=
GpuFromCuda
()
gpu_from_host
=
GpuFromHost
()
class
CudaFromGpu
(
Op
):
view_map
=
{
0
:
[
0
]}
__props__
=
()
def
make_node
(
self
,
x
):
from
theano.sandbox.cuda
import
CudaNdarrayType
if
not
isinstance
(
x
.
type
,
GpuArrayType
):
raise
TypeError
(
x
)
if
x
.
type
.
dtype
!=
'float32'
:
raise
TypeError
(
x
)
return
Apply
(
self
,
[
x
],
[
CudaNdarrayType
(
broadcastable
=
x
.
broadcastable
)()])
def
perform
(
self
,
node
,
inp
,
out
):
from
theano.sandbox.cuda
import
filter
as
cuda_filter
x
,
=
inp
z
,
=
out
z
[
0
]
=
cuda_filter
(
theano
.
_asarray
(
x
,
dtype
=
'float32'
),
tuple
([
0
]
*
x
.
ndim
),
0
,
z
[
0
])
def
grad
(
self
,
inputs
,
grads
):
gz
,
=
grads
return
[
gpu_from_cuda
(
gz
)]
def
R_op
(
self
,
inputs
,
eval_points
):
from
theano.sandbox.cuda
import
CudaNdarrayType
ev
,
=
eval_points
if
(
isinstance
(
ev
,
CudaNdarrayType
)):
return
[
gpu_from_cuda
(
ev
)]
else
:
return
[
ev
]
def
infer_shape
(
self
,
node
,
shp
):
return
shp
def
c_headers
(
self
):
return
[
'<cuda_ndarray.cuh>'
,
'<gpuarray/extension.h>'
,
'<cuda.h>'
]
def
c_header_dirs
(
self
):
import
cuda_ndarray
ret
=
[
os
.
path
.
dirname
(
cuda_ndarray
.
__file__
)]
cuda_root
=
config
.
cuda
.
root
if
cuda_root
:
ret
.
append
(
os
.
path
.
join
(
cuda_root
,
'include'
))
return
ret
def
c_lib_dirs
(
self
):
import
cuda_ndarray
ret
=
[
os
.
path
.
dirname
(
cuda_ndarray
.
__file__
)]
cuda_root
=
config
.
cuda
.
root
if
cuda_root
:
ret
.
append
(
os
.
path
.
join
(
cuda_root
,
'lib'
))
return
ret
def
c_libraries
(
self
):
return
[
'cudart'
,
'cublas'
,
'cuda'
]
def
c_support_code
(
self
):
return
"""
CUcontext (*cuda_get_ctx)(void *ctx);
CUdeviceptr (*cuda_get_ptr)(gpudata *g);
"""
def
c_init_code
(
self
):
return
[
'cuda_get_ctx = (CUcontext (*)(void *ctx))gpuarray_get_extension("cuda_get_ctx");'
,
'cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))gpuarray_get_extension("cuda_get_ptr");'
]
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
return
"""
int
%(name)
serr = 0,
%(name)
si;
CUcontext
%(name)
scur;
cuCtxGetCurrent(&
%(name)
scur);
if (
%(name)
scur != cuda_get_ctx(pygpu_default_context()->ctx)) {
PyErr_SetString(PyExc_ValueError, "Ambient cuda context is not the same as output context.");
%(fail)
s
}
if (GpuArray_sync(&
%(inp)
s->ga) != GA_NO_ERROR) {
PyErr_SetString(PyExc_RuntimeError, "Could not sync GpuArray");
%(fail)
s
}
Py_XDECREF(
%(out)
s);
%(out)
s = (CudaNdarray *)CudaNdarray_new_nd(
%(inp)
s->ga.nd);
if (!
%(out)
s) {
%(fail)
s
}
for (
%(name)
si = 0;
%(name)
si <
%(inp)
s->ga.nd;
%(name)
si++) {
CudaNdarray_set_dim(
%(out)
s,
%(name)
si,
%(inp)
s->ga.dimensions[
%(name)
si]);
CudaNdarray_set_stride(
%(out)
s,
%(name)
si,
%(inp)
s->ga.strides[
%(name)
si]/4);
}
%(name)
serr = CudaNdarray_set_device_data(
%(out)
s,
(float *)(((char *)cuda_get_ptr(
%(inp)
s->ga.data))+
%(inp)
s->ga.offset),
(PyObject *)
%(inp)
s);
if (
%(name)
serr) {
%(fail)
s
}
"""
%
{
'name'
:
name
,
'inp'
:
inputs
[
0
],
'out'
:
outputs
[
0
],
'fail'
:
sub
[
'fail'
]}
def
c_code_cache_version
(
self
):
return
(
3
,)
cuda_from_gpu
=
CudaFromGpu
()
class
GpuAlloc
(
HideC
,
Alloc
):
class
GpuAlloc
(
HideC
,
Alloc
):
...
@@ -592,7 +364,7 @@ class GpuAlloc(HideC, Alloc):
...
@@ -592,7 +364,7 @@ class GpuAlloc(HideC, Alloc):
sh
,
bcast
=
self
.
validate_shape
(
shape
)
sh
,
bcast
=
self
.
validate_shape
(
shape
)
if
value
.
ndim
>
len
(
sh
):
if
value
.
ndim
>
len
(
sh
):
TypeError
(
"The GpuAlloc value to use has more dimensions "
TypeError
(
"The GpuAlloc value to use has more dimensions "
"than the specified shape"
,
v
.
ndim
,
len
(
sh
))
"than the specified shape"
,
v
alue
.
ndim
,
len
(
sh
))
otype
=
value
.
type
.
clone
(
broadcastable
=
bcast
)
otype
=
value
.
type
.
clone
(
broadcastable
=
bcast
)
return
Apply
(
self
,
[
value
]
+
sh
,
[
otype
()])
return
Apply
(
self
,
[
value
]
+
sh
,
[
otype
()])
...
@@ -686,14 +458,14 @@ class GpuAlloc(HideC, Alloc):
...
@@ -686,14 +458,14 @@ class GpuAlloc(HideC, Alloc):
return
(
2
,)
return
(
2
,)
def
do_constant_folding
(
self
,
node
):
def
do_constant_folding
(
self
,
node
):
from
.
import
subtensor
,
blas
for
client
in
node
.
outputs
[
0
]
.
clients
:
for
client
in
node
.
outputs
[
0
]
.
clients
:
if
client
[
0
]
==
'output'
:
if
client
[
0
]
==
'output'
:
# If the output is a constant, it will have to be deepcopied
# If the output is a constant, it will have to be deepcopied
# each time the function is called. So we do not fold.
# each time the function is called. So we do not fold.
return
False
return
False
elif
(
# The following ops work inplace of their input id 0.
# The following ops work inplace of their input id 0.
client
[
1
]
==
0
and
elif
(
client
[
1
]
==
0
and
isinstance
(
client
[
0
]
.
op
,
(
# Ops that will work inplace on the Alloc. So if they
# Ops that will work inplace on the Alloc. So if they
# get constant_folded, they would copy the
# get constant_folded, they would copy the
# constant and this is less efficients.
# constant and this is less efficients.
...
@@ -701,14 +473,13 @@ class GpuAlloc(HideC, Alloc):
...
@@ -701,14 +473,13 @@ class GpuAlloc(HideC, Alloc):
# Not doing the constant folding could also lower
# Not doing the constant folding could also lower
# the peak memory usage, as we the "constant" won't
# the peak memory usage, as we the "constant" won't
# always exists.
# always exists.
# theano.tensor.subtensor.AdvancedIncSubtensor,
isinstance
(
client
[
0
]
.
op
,
theano
.
sandbox
.
gpuarray
.
subtensor
.
GpuIncSubtensor
,
(
subtensor
.
GpuIncSubtensor
,
theano
.
sandbox
.
gpuarray
.
subtensor
.
GpuAdvancedIncSubtensor1
,
subtensor
.
GpuAdvancedIncSubtensor1
,
theano
.
sandbox
.
gpuarray
.
subtensor
.
GpuAdvancedIncSubtensor1_dev20
,
subtensor
.
GpuAdvancedIncSubtensor1_dev20
,
theano
.
sandbox
.
gpuarray
.
blas
.
GpuGemm
,
blas
.
GpuGemm
,
blas
.
GpuGemv
,
theano
.
sandbox
.
gpuarray
.
blas
.
GpuGemv
,
blas
.
GpuGer
)
theano
.
sandbox
.
gpuarray
.
blas
.
GpuGer
,
)):
))):
return
False
return
False
# If the clients is a transfer, we don't want to fold. We
# If the clients is a transfer, we don't want to fold. We
# let the moving opt finish before deciding what to do.
# let the moving opt finish before deciding what to do.
...
@@ -1089,8 +860,7 @@ KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
...
@@ -1089,8 +860,7 @@ KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
code
=
code
,
name
=
"k"
,
code
=
code
,
name
=
"k"
,
params
=
[
gpuarray
.
GpuArray
,
gpuarray
.
SIZE
,
gpuarray
.
SIZE
],
params
=
[
gpuarray
.
GpuArray
,
gpuarray
.
SIZE
,
gpuarray
.
SIZE
],
flags
=
Kernel
.
get_flags
(
self
.
dtype
),
flags
=
Kernel
.
get_flags
(
self
.
dtype
),
objvar
=
'k_eye_'
+
name
,
objvar
=
'k_eye_'
+
name
)]
)]
def
c_code
(
self
,
node
,
name
,
inp
,
out
,
sub
):
def
c_code
(
self
,
node
,
name
,
inp
,
out
,
sub
):
n
,
m
=
inp
n
,
m
=
inp
...
...
theano/sandbox/gpuarray/conv.py
浏览文件 @
645557f9
...
@@ -5,17 +5,15 @@ import theano
...
@@ -5,17 +5,15 @@ import theano
from
theano
import
config
,
gof
from
theano
import
config
,
gof
try
:
try
:
import
pygpu
from
pygpu
import
gpuarray
from
pygpu
import
gpuarray
except
ImportError
:
except
ImportError
:
pass
pass
from
six.moves
import
reduce
from
.comp
import
NVCC_compiler
from
.type
import
GpuArrayType
from
.type
import
GpuArrayType
from
.basic_ops
import
(
as_gpuarray_variable
,
GpuKernelBase
,
Kernel
)
from
.basic_ops
import
as_gpuarray_variable
,
GpuKernelBase
,
Kernel
from
theano.gof
import
utils
from
theano.gof
import
utils
class
GpuConv
(
GpuKernelBase
,
gof
.
Op
):
class
GpuConv
(
GpuKernelBase
,
gof
.
Op
):
"""
"""
Implement the batched and stacked 2d convolution on the gpu.
Implement the batched and stacked 2d convolution on the gpu.
...
@@ -227,32 +225,14 @@ class GpuConv(GpuKernelBase, gof.Op):
...
@@ -227,32 +225,14 @@ class GpuConv(GpuKernelBase, gof.Op):
nb
=
0
nb
=
0
if
self
.
kshp
is
not
None
:
if
self
.
kshp
is
not
None
:
nb
=
self
.
kshp
[
1
]
nb
=
self
.
kshp
[
1
]
return
[
'-DTHEANO_KERN_WID='
+
str
(
nb
)]
# ,'-g','-G']
return
[
'-DTHEANO_KERN_WID='
+
str
(
nb
)]
def
c_headers
(
self
):
def
c_headers
(
self
):
if
pygpu
.
get_default_context
()
.
kind
==
'opencl'
:
return
[
'<stdio.h>'
,
'<numpy_compat.h>'
,
'<gpuarray/types.h>'
]
raise
MethodNotDefined
(
'cuda only'
)
return
[
'<stdint.h>'
,
'<stdio.h>'
,
'cuda.h'
,
'<gpuarray/extension.h>'
,
'<numpy_compat.h>'
,
'<gpuarray/ext_cuda.h>'
,
'<gpuarray/types.h>'
]
def
c_header_dirs
(
self
):
if
pygpu
.
get_default_context
()
.
kind
==
'opencl'
:
raise
MethodNotDefined
(
'cuda only'
)
cuda_root
=
config
.
cuda
.
root
if
cuda_root
:
return
[
os
.
path
.
join
(
cuda_root
,
'include'
)]
else
:
return
[]
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
# raise this whenever modifying any of the support_code_files
# raise this whenever modifying any of the support_code_files
return
(
0
,
21
)
return
(
0
,
22
)
def
c_init_code
(
self
):
if
pygpu
.
get_default_context
()
.
kind
==
'opencl'
:
raise
MethodNotDefined
(
'cuda only'
)
return
[
'setup_ext_cuda();'
]
def
c_code
(
self
,
node
,
nodename
,
inp
,
out_
,
sub
):
def
c_code
(
self
,
node
,
nodename
,
inp
,
out_
,
sub
):
img
,
kern
=
inp
img
,
kern
=
inp
...
...
theano/sandbox/gpuarray/dnn.py
浏览文件 @
645557f9
...
@@ -26,10 +26,7 @@ from .conv import GpuConv
...
@@ -26,10 +26,7 @@ from .conv import GpuConv
# GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
# GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
from
.nnet
import
GpuSoftmax
from
.nnet
import
GpuSoftmax
from
.opt
import
gpu_seqopt
,
register_opt
,
conv_groupopt
,
op_lifter
from
.opt
import
gpu_seqopt
,
register_opt
,
conv_groupopt
,
op_lifter
from
.opt_util
import
alpha_merge
,
output_merge
from
.opt_util
import
alpha_merge
,
output_merge
,
inplace_allocempty
# We need to import this to define the flags.
from
theano.sandbox
import
dnn_flags
# noqa
def
dnn_available
():
def
dnn_available
():
...
@@ -50,7 +47,6 @@ def dnn_available():
...
@@ -50,7 +47,6 @@ def dnn_available():
dnn_available
.
avail
=
False
dnn_available
.
avail
=
False
preambule
=
"""
preambule
=
"""
#include <stdio.h>
#include <stdio.h>
#include <cuda.h>
#include <cudnn.h>
#include <cudnn.h>
#include <cudnn_helper.h>
#include <cudnn_helper.h>
"""
"""
...
@@ -64,15 +60,18 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
...
@@ -64,15 +60,18 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
return 1;
return 1;
}
}
"""
"""
params
=
[
"-l"
,
"cudnn"
,
"-I"
+
os
.
path
.
dirname
(
__file__
)]
if
config
.
dnn
.
include_path
:
params
.
append
(
"-I"
+
config
.
dnn
.
include_path
)
if
config
.
dnn
.
library_path
:
params
.
append
(
"-L"
+
config
.
dnn
.
library_path
)
# Do not run here the test program. It would run on the
# Do not run here the test program. It would run on the
# default gpu, not the one selected by the user. If mixed
# default gpu, not the one selected by the user. If mixed
# GPU are installed or if the GPUs are configured in
# GPU are installed or if the GPUs are configured in
# exclusive mode, this cause bad detection.
# exclusive mode, this cause bad detection.
comp
,
out
,
err
=
GCC_compiler
.
try_flags
(
comp
,
out
,
err
=
GCC_compiler
.
try_flags
(
[
"-l"
,
"cudnn"
,
"-I"
+
os
.
path
.
dirname
(
__file__
),
params
,
preambule
=
preambule
,
body
=
body
,
"-I"
+
config
.
dnn
.
include_path
,
"-L"
+
config
.
dnn
.
library_path
],
preambule
=
preambule
,
body
=
body
,
try_run
=
False
,
output
=
True
)
try_run
=
False
,
output
=
True
)
dnn_available
.
avail
=
comp
dnn_available
.
avail
=
comp
...
@@ -1242,86 +1241,62 @@ conv_groupopt.register('local_conv_dnn', local_conv_dnn, 20,
...
@@ -1242,86 +1241,62 @@ conv_groupopt.register('local_conv_dnn', local_conv_dnn, 20,
'conv_dnn'
,
'fast_compile'
,
'fast_run'
,
'cudnn'
)
'conv_dnn'
,
'fast_compile'
,
'fast_run'
,
'cudnn'
)
@local_optimizer
([
GpuDnnConv
],
inplace
=
True
)
@inplace_allocempty
(
GpuDnnConv
,
2
)
def
local_dnn_conv_inplace
(
node
):
def
local_dnn_conv_inplace
(
node
,
inputs
):
if
type
(
node
.
op
)
!=
GpuDnnConv
or
node
.
op
.
inplace
:
return
inputs
=
list
(
node
.
inputs
)
dest
=
inputs
[
2
]
if
(
dest
.
owner
and
isinstance
(
dest
.
owner
.
op
,
GpuAllocEmpty
)
and
len
(
dest
.
clients
)
>
1
):
inputs
[
2
]
=
GpuAllocEmpty
(
dest
.
owner
.
op
.
dtype
)(
*
dest
.
owner
.
inputs
)
return
[
GpuDnnConv
(
algo
=
node
.
op
.
algo
,
inplace
=
True
)(
*
inputs
)]
return
[
GpuDnnConv
(
algo
=
node
.
op
.
algo
,
inplace
=
True
)(
*
inputs
)]
@local_optimizer
([
GpuDnnConvGradW
],
inplace
=
True
)
@inplace_allocempty
(
GpuDnnConvGradW
,
2
)
def
local_dnn_convgw_inplace
(
node
):
def
local_dnn_convgw_inplace
(
node
,
inputs
):
if
type
(
node
.
op
)
!=
GpuDnnConvGradW
or
node
.
op
.
inplace
:
return
inputs
=
list
(
node
.
inputs
)
dest
=
inputs
[
2
]
if
(
dest
.
owner
and
isinstance
(
dest
.
owner
.
op
,
GpuAllocEmpty
)
and
len
(
dest
.
clients
)
>
1
):
inputs
[
2
]
=
GpuAllocEmpty
(
dest
.
owner
.
op
.
dtype
)(
*
dest
.
owner
.
inputs
)
return
[
GpuDnnConvGradW
(
algo
=
node
.
op
.
algo
,
inplace
=
True
)(
*
inputs
)]
return
[
GpuDnnConvGradW
(
algo
=
node
.
op
.
algo
,
inplace
=
True
)(
*
inputs
)]
@local_optimizer
([
GpuDnnConvGradI
],
inplace
=
True
)
@inplace_allocempty
(
GpuDnnConvGradI
,
2
)
def
local_dnn_convgi_inplace
(
node
):
def
local_dnn_convgi_inplace
(
node
,
inputs
):
if
type
(
node
.
op
)
!=
GpuDnnConvGradI
or
node
.
op
.
inplace
:
return
inputs
=
list
(
node
.
inputs
)
dest
=
inputs
[
2
]
if
(
dest
.
owner
and
isinstance
(
dest
.
owner
.
op
,
GpuAllocEmpty
)
and
len
(
dest
.
clients
)
>
1
):
inputs
[
2
]
=
GpuAllocEmpty
(
dest
.
owner
.
op
.
dtype
)(
*
dest
.
owner
.
inputs
)
return
[
GpuDnnConvGradI
(
algo
=
node
.
op
.
algo
,
inplace
=
True
)(
*
inputs
)]
return
[
GpuDnnConvGradI
(
algo
=
node
.
op
.
algo
,
inplace
=
True
)(
*
inputs
)]
optdb
.
register
(
'local_dnna_conv_inplace'
,
optdb
.
register
(
'local_dnna_conv_inplace'
,
tensor
.
opt
.
in2out
(
local_dnn_conv_inplace
,
tensor
.
opt
.
in2out
(
local_dnn_conv_inplace
,
local_dnn_convgw_inplace
,
local_dnn_convgw_inplace
,
local_dnn_convgi_inplace
,
local_dnn_convgi_inplace
,
name
=
"local_dnn_conv_inplace"
),
name
=
"local_dnn
a
_conv_inplace"
),
70.0
,
'fast_run'
,
'inplace'
,
'gpuarray'
,
'cudnn'
)
70.0
,
'fast_run'
,
'inplace'
,
'gpuarray'
,
'cudnn'
)
@register_opt
(
'cudnn'
)
@register_opt
(
'cudnn'
)
@alpha_merge
(
GpuDnnConv
,
alpha_in
=
4
,
beta_in
=
5
,
nd
=
4
)
@alpha_merge
(
GpuDnnConv
,
alpha_in
=
4
,
beta_in
=
5
)
def
local_dnn_conv_alpha_merge
(
node
,
*
inputs
):
def
local_dnn_conv_alpha_merge
(
node
,
*
inputs
):
return
[
GpuDnnConv
(
algo
=
node
.
op
.
algo
)(
*
inputs
)]
return
[
GpuDnnConv
(
algo
=
node
.
op
.
algo
)(
*
inputs
)]
@register_opt
(
'cudnn'
)
@register_opt
(
'cudnn'
)
@alpha_merge
(
GpuDnnConvGradW
,
alpha_in
=
4
,
beta_in
=
5
,
nd
=
4
)
@alpha_merge
(
GpuDnnConvGradW
,
alpha_in
=
4
,
beta_in
=
5
)
def
local_dnn_convw_alpha_merge
(
node
,
*
inputs
):
def
local_dnn_convw_alpha_merge
(
node
,
*
inputs
):
return
[
GpuDnnConvGradW
(
algo
=
node
.
op
.
algo
)(
*
inputs
)]
return
[
GpuDnnConvGradW
(
algo
=
node
.
op
.
algo
)(
*
inputs
)]
@register_opt
(
'cudnn'
)
@register_opt
(
'cudnn'
)
@alpha_merge
(
GpuDnnConvGradI
,
alpha_in
=
4
,
beta_in
=
5
,
nd
=
4
)
@alpha_merge
(
GpuDnnConvGradI
,
alpha_in
=
4
,
beta_in
=
5
)
def
local_dnn_convi_alpha_merge
(
node
,
*
inputs
):
def
local_dnn_convi_alpha_merge
(
node
,
*
inputs
):
return
[
GpuDnnConvGradI
(
algo
=
node
.
op
.
algo
)(
*
inputs
)]
return
[
GpuDnnConvGradI
(
algo
=
node
.
op
.
algo
)(
*
inputs
)]
@register_opt
(
'cudnn'
)
@register_opt
(
'cudnn'
)
@output_merge
(
GpuDnnConv
,
alpha_in
=
4
,
beta_in
=
5
,
out_in
=
2
,
nd
=
4
)
@output_merge
(
GpuDnnConv
,
alpha_in
=
4
,
beta_in
=
5
,
out_in
=
2
)
def
local_dnn_conv_output_merge
(
node
,
*
inputs
):
def
local_dnn_conv_output_merge
(
node
,
*
inputs
):
inputs
=
inputs
[
0
:
2
]
+
(
gpu_contiguous
(
inputs
[
2
]),)
+
inputs
[
3
:]
inputs
=
inputs
[
0
:
2
]
+
(
gpu_contiguous
(
inputs
[
2
]),)
+
inputs
[
3
:]
return
[
GpuDnnConv
(
algo
=
node
.
op
.
algo
)(
*
inputs
)]
return
[
GpuDnnConv
(
algo
=
node
.
op
.
algo
)(
*
inputs
)]
@register_opt
(
'cudnn'
)
@register_opt
(
'cudnn'
)
@output_merge
(
GpuDnnConvGradW
,
alpha_in
=
4
,
beta_in
=
5
,
out_in
=
2
,
nd
=
4
)
@output_merge
(
GpuDnnConvGradW
,
alpha_in
=
4
,
beta_in
=
5
,
out_in
=
2
)
def
local_dnn_convw_output_merge
(
node
,
*
inputs
):
def
local_dnn_convw_output_merge
(
node
,
*
inputs
):
inputs
=
inputs
[
0
:
2
]
+
(
gpu_contiguous
(
inputs
[
2
]),)
+
inputs
[
3
:]
inputs
=
inputs
[
0
:
2
]
+
(
gpu_contiguous
(
inputs
[
2
]),)
+
inputs
[
3
:]
return
[
GpuDnnConvGradW
(
algo
=
node
.
op
.
algo
)(
*
inputs
)]
return
[
GpuDnnConvGradW
(
algo
=
node
.
op
.
algo
)(
*
inputs
)]
@register_opt
(
'cudnn'
)
@register_opt
(
'cudnn'
)
@output_merge
(
GpuDnnConvGradI
,
alpha_in
=
4
,
beta_in
=
5
,
out_in
=
2
,
nd
=
4
)
@output_merge
(
GpuDnnConvGradI
,
alpha_in
=
4
,
beta_in
=
5
,
out_in
=
2
)
def
local_dnn_convi_output_merge
(
node
,
*
inputs
):
def
local_dnn_convi_output_merge
(
node
,
*
inputs
):
inputs
=
inputs
[
0
:
2
]
+
(
gpu_contiguous
(
inputs
[
2
]),)
+
inputs
[
3
:]
inputs
=
inputs
[
0
:
2
]
+
(
gpu_contiguous
(
inputs
[
2
]),)
+
inputs
[
3
:]
return
[
GpuDnnConvGradI
(
algo
=
node
.
op
.
algo
)(
*
inputs
)]
return
[
GpuDnnConvGradI
(
algo
=
node
.
op
.
algo
)(
*
inputs
)]
...
...
theano/sandbox/gpuarray/elemwise.py
浏览文件 @
645557f9
from
__future__
import
print_function
from
__future__
import
print_function
import
copy
import
copy
import
os
from
theano.compat
import
izip
from
theano.compat
import
izip
import
numpy
import
numpy
import
theano
from
theano
import
Apply
,
scalar
,
config
from
theano
import
Apply
,
scalar
,
config
from
theano
import
scalar
as
scal
from
theano
import
scalar
as
scal
from
six.moves
import
StringIO
,
xrange
from
six.moves
import
StringIO
,
xrange
from
theano.gof.utils
import
MethodNotDefined
from
theano.gof.utils
import
MethodNotDefined
from
theano.gof.cmodule
import
GCC_compiler
from
theano.scalar
import
Scalar
from
theano.scalar
import
Scalar
from
theano.tensor.elemwise
import
(
Elemwise
,
DimShuffle
,
CAReduceDtype
)
from
theano.tensor.elemwise
import
(
Elemwise
,
DimShuffle
,
CAReduceDtype
)
...
@@ -108,7 +105,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
...
@@ -108,7 +105,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
scal_v_ins
=
[
scalar
.
get_scalar_type
(
i
.
dtype
)
for
i
in
node
.
inputs
]
scal_v_ins
=
[
scalar
.
get_scalar_type
(
i
.
dtype
)
for
i
in
node
.
inputs
]
outs
=
[
make_argument
(
o
,
'o
%
d'
%
(
n
,))
for
n
,
o
in
outs
=
[
make_argument
(
o
,
'o
%
d'
%
(
n
,))
for
n
,
o
in
enumerate
(
node
.
outputs
)
if
n
ot
n
in
self
.
inplace_pattern
]
enumerate
(
node
.
outputs
)
if
n
not
in
self
.
inplace_pattern
]
scal_v_outs
=
[
scalar
.
get_scalar_type
(
o
.
dtype
)
for
o
in
node
.
outputs
]
scal_v_outs
=
[
scalar
.
get_scalar_type
(
o
.
dtype
)
for
o
in
node
.
outputs
]
fake_node
=
Apply
(
self
.
scalar_op
,
[
i
()
for
i
in
scal_v_ins
],
fake_node
=
Apply
(
self
.
scalar_op
,
[
i
()
for
i
in
scal_v_ins
],
...
@@ -132,7 +129,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
...
@@ -132,7 +129,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
else
:
else
:
scal_out
.
append
(
arg
.
name
+
'[i]'
)
scal_out
.
append
(
arg
.
name
+
'[i]'
)
kop
=
self
.
scalar_op
.
c_code
(
fake_node
,
nodename
+
'_scalar'
,
kop
=
self
.
scalar_op
.
c_code
(
fake_node
,
nodename
+
'_scalar'
,
scal_in
,
scal_out
,
scal_in
,
scal_out
,
dict
(
fail
=
'return;'
))
dict
(
fail
=
'return;'
))
...
@@ -171,25 +168,10 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
...
@@ -171,25 +168,10 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
(
"npy_float64"
,
"ga_double"
),
(
"npy_float64"
,
"ga_double"
),
]:
]:
kop
=
kop
.
replace
(
npy
,
ga
)
kop
=
kop
.
replace
(
npy
,
ga
)
return
ElemwiseKernel
(
None
,
inps
+
outs
,
kop
,
preamble
=
support_code
)
return
ElemwiseKernel
(
None
,
inps
+
outs
,
kop
,
preamble
=
support_code
)
def
c_header_dirs
(
self
):
if
pygpu
.
get_default_context
()
.
kind
==
'opencl'
:
raise
MethodNotDefined
(
'cuda only'
)
cuda_root
=
config
.
cuda
.
root
if
cuda_root
:
return
[
os
.
path
.
join
(
cuda_root
,
'include'
)]
else
:
return
[]
def
c_compiler
(
self
):
return
GCC_compiler
def
c_headers
(
self
):
def
c_headers
(
self
):
if
pygpu
.
get_default_context
()
.
kind
==
'opencl'
:
return
[
'<numpy_compat.h>'
,
'<gpuarray/types.h>'
]
raise
MethodNotDefined
(
'cuda only'
)
return
[
'cuda.h'
,
'<gpuarray/extension.h>'
,
'<numpy_compat.h>'
,
'<gpuarray/ext_cuda.h>'
,
'<gpuarray/types.h>'
]
def
c_support_code
(
self
):
def
c_support_code
(
self
):
return
self
.
scalar_op
.
c_support_code
()
return
self
.
scalar_op
.
c_support_code
()
...
@@ -231,11 +213,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
...
@@ -231,11 +213,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
node
.
outputs
[
0
]
.
type
.
dtype
),
node
.
outputs
[
0
]
.
type
.
dtype
),
objvar
=
'elem_
%
d_
%
s'
%
(
nd
,
nodename
))]
objvar
=
'elem_
%
d_
%
s'
%
(
nd
,
nodename
))]
def
c_init_code
(
self
):
if
pygpu
.
get_default_context
()
.
kind
==
'opencl'
:
raise
MethodNotDefined
(
'cuda only'
)
return
[
'setup_ext_cuda();'
]
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
if
pygpu
.
get_default_context
()
.
kind
==
'opencl'
:
if
pygpu
.
get_default_context
()
.
kind
==
'opencl'
:
raise
MethodNotDefined
(
'cuda only'
)
raise
MethodNotDefined
(
'cuda only'
)
...
@@ -417,7 +394,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
...
@@ -417,7 +394,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
GpuKernel_error(&
%(kname)
s, err));
GpuKernel_error(&
%(kname)
s, err));
%(fail)
s;
%(fail)
s;
}
}
"""
%
dict
(
kname
=
kname
,
fail
=
fail
)
"""
%
dict
(
kname
=
kname
,
fail
=
fail
)
if
config
.
gpuarray
.
sync
:
if
config
.
gpuarray
.
sync
:
code
+=
"""
code
+=
"""
err = GpuArray_sync(&
%(z)
s->ga);
err = GpuArray_sync(&
%(z)
s->ga);
...
@@ -460,7 +437,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
...
@@ -460,7 +437,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
ver
=
self
.
scalar_op
.
c_code_cache_version
()
ver
=
self
.
scalar_op
.
c_code_cache_version
()
if
ver
:
if
ver
:
return
(
3
,
ver
)
return
(
4
,
ver
)
else
:
else
:
return
ver
return
ver
...
@@ -495,7 +472,7 @@ class GpuDimShuffle(HideC, DimShuffle):
...
@@ -495,7 +472,7 @@ class GpuDimShuffle(HideC, DimShuffle):
res
=
input
res
=
input
res
=
res
.
transpose
(
self
.
shuffle
+
self
.
drop
)
res
=
res
.
transpose
(
self
.
shuffle
+
self
.
drop
)
shape
=
list
(
res
.
shape
[:
len
(
self
.
shuffle
)])
shape
=
list
(
res
.
shape
[:
len
(
self
.
shuffle
)])
for
augm
in
self
.
augment
:
for
augm
in
self
.
augment
:
...
@@ -533,7 +510,7 @@ class GpuDimShuffle(HideC, DimShuffle):
...
@@ -533,7 +510,7 @@ class GpuDimShuffle(HideC, DimShuffle):
Py_DECREF(tmp);
Py_DECREF(tmp);
return res;
return res;
}
}
"""
%
dict
(
shuffle
=
', '
.
join
(
str
(
a
)
for
a
in
(
self
.
shuffle
+
self
.
drop
)),
"""
%
dict
(
shuffle
=
', '
.
join
(
str
(
a
)
for
a
in
(
self
.
shuffle
+
self
.
drop
)),
name
=
name
,
nd_out
=
len
(
self
.
new_order
),
name
=
name
,
nd_out
=
len
(
self
.
new_order
),
copy_shape
=
copy_shape
(
len
(
self
.
new_order
)))
copy_shape
=
copy_shape
(
len
(
self
.
new_order
)))
...
@@ -565,7 +542,7 @@ class GpuDimShuffle(HideC, DimShuffle):
...
@@ -565,7 +542,7 @@ class GpuDimShuffle(HideC, DimShuffle):
return
process
return
process
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
return
(
4
,)
return
(
5
,)
class
GpuCAReduceCuda
(
GpuKernelBase
,
HideC
,
CAReduceDtype
):
class
GpuCAReduceCuda
(
GpuKernelBase
,
HideC
,
CAReduceDtype
):
...
@@ -671,8 +648,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -671,8 +648,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
if
self
.
pre_scalar_op
:
if
self
.
pre_scalar_op
:
# Currently we only tested pre_scalar_op that don't cause
# Currently we only tested pre_scalar_op that don't cause
# upcast.
# upcast.
d1
=
self
.
__class__
(
scalar_op
=
self
.
scalar_op
)(
Elemwise
(
self
.
pre_scalar_op
)(
x
))
assert
d1
.
dtype
==
ret
.
outputs
[
0
]
.
dtype
assert
Elemwise
(
self
.
pre_scalar_op
)(
x
)
.
dtype
==
x
.
dtype
assert
Elemwise
(
self
.
pre_scalar_op
)(
x
)
.
dtype
==
x
.
dtype
if
self
.
reduce_mask
is
None
:
if
self
.
reduce_mask
is
None
:
if
self
.
axis
is
None
:
if
self
.
axis
is
None
:
...
@@ -732,17 +707,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -732,17 +707,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
return
False
return
False
return
True
return
True
def
c_header_dirs
(
self
):
cuda_root
=
config
.
cuda
.
root
if
cuda_root
:
return
[
os
.
path
.
join
(
cuda_root
,
'include'
)]
def
c_headers
(
self
):
def
c_headers
(
self
):
return
[
'cuda.h'
,
'<gpuarray/extension.h>'
,
'<numpy_compat.h>'
,
return
[
'<numpy_compat.h>'
,
'<gpuarray/types.h>'
]
'<gpuarray/ext_cuda.h>'
,
'<gpuarray/types.h>'
]
def
c_init_code
(
self
):
return
[
'setup_ext_cuda();'
]
def
c_code
(
self
,
node
,
name
,
inp
,
out
,
sub
):
def
c_code
(
self
,
node
,
name
,
inp
,
out
,
sub
):
x
,
=
inp
x
,
=
inp
...
@@ -760,6 +726,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -760,6 +726,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
sio
=
StringIO
()
sio
=
StringIO
()
fail
=
sub
[
'fail'
]
fail
=
sub
[
'fail'
]
ctx
=
sub
[
'context'
]
# check input
# check input
print
(
"""
print
(
"""
...
@@ -824,8 +791,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -824,8 +791,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
Py_XDECREF(
%(z)
s);
Py_XDECREF(
%(z)
s);
%(z)
s = pygpu_empty(
%(nd_out)
s, new_dims,
%(z)
s = pygpu_empty(
%(nd_out)
s, new_dims,
%(out_typecode)
s, GA_C_ORDER,
%(out_typecode)
s, GA_C_ORDER,
pygpu_default_context(),
pygpu_default_context(), Py_None);
Py_None);
if (NULL ==
%(z)
s)
if (NULL ==
%(z)
s)
{
{
PyErr_Format(PyExc_RuntimeError, "Failed to allocate output");
PyErr_Format(PyExc_RuntimeError, "Failed to allocate output");
...
@@ -863,14 +829,16 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -863,14 +829,16 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
# check if the tensor is ccontiguous, if true, use the c_code_reduce_ccontig code.
# check if the tensor is ccontiguous, if true, use the c_code_reduce_ccontig code.
# TODO: check if we are ccontiguous when we un-dimshuffle
# TODO: check if we are ccontiguous when we un-dimshuffle
# TODO: if only some dims are ccontiguous, call version with less dims.
# TODO: if only some dims are ccontiguous, call version with less dims.
print
(
'if(
%(x)
s->ga.flags & GA_C_CONTIGUOUS){'
%
locals
(),
file
=
sio
)
print
(
'if(
%(x)
s->ga.flags & GA_C_CONTIGUOUS){'
%
locals
(),
file
=
sio
)
self
.
c_code_reduce_ccontig
(
sio
,
node
,
name
,
x
,
z
,
fail
)
self
.
c_code_reduce_ccontig
(
sio
,
node
,
name
,
x
,
z
,
fail
)
print
(
"}else{"
,
file
=
sio
)
print
(
"}else{"
,
file
=
sio
)
getattr
(
self
,
'c_code_reduce_
%
s'
%
(
''
.
join
(
getattr
(
self
,
'c_code_reduce_
%
s'
%
str
(
i
)
for
i
in
self
.
reduce_mask
)))(
sio
,
node
,
name
,
x
,
z
,
fail
)
(
''
.
join
(
str
(
i
)
for
i
in
self
.
reduce_mask
)))(
sio
,
node
,
name
,
x
,
z
,
fail
)
print
(
"}"
,
file
=
sio
)
print
(
"}"
,
file
=
sio
)
else
:
else
:
getattr
(
self
,
'c_code_reduce_
%
s'
%
(
''
.
join
(
getattr
(
self
,
'c_code_reduce_
%
s'
%
(
''
.
join
(
str
(
i
)
for
i
in
self
.
reduce_mask
)))(
sio
,
node
,
name
,
x
,
z
,
fail
)
str
(
i
)
for
i
in
self
.
reduce_mask
)))(
sio
,
node
,
name
,
x
,
z
,
fail
)
# \end bracket the reduction ...
# \end bracket the reduction ...
...
@@ -1094,8 +1062,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -1094,8 +1062,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
else
:
else
:
assert
isinstance
(
self
.
scalar_op
,
(
scal
.
Maximum
,
assert
isinstance
(
self
.
scalar_op
,
(
scal
.
Maximum
,
scal
.
Minimum
))
scal
.
Minimum
))
if
self
.
pre_scalar_op
:
# TODO
, multi_dtype!
if
self
.
pre_scalar_op
:
# TODO
: multiple dtypes
#dtype = node.inputs[0].dtype
#
dtype = node.inputs[0].dtype
dtype
=
'float32'
dtype
=
'float32'
dummy_var
=
scal
.
Scalar
(
dtype
=
dtype
)()
dummy_var
=
scal
.
Scalar
(
dtype
=
dtype
)()
...
@@ -1943,12 +1911,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -1943,12 +1911,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
"""
%
locals
(),
file
=
sio
)
"""
%
locals
(),
file
=
sio
)
def
c_code_cache_version_apply
(
self
,
node
):
def
c_code_cache_version_apply
(
self
,
node
):
version
=
[
1
6
]
# the version corresponding to the c code in this Op
version
=
[
1
7
]
# the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend...
# now we insert versions for the ops on which we depend...
scalar_node
=
Apply
(
self
.
scalar_op
,
[
Scalar
(
dtype
=
input
.
type
.
dtype
)()
for
input
in
node
.
inputs
],
[
Scalar
(
dtype
=
output
.
type
.
dtype
)()
for
output
in
node
.
outputs
])
version
.
extend
(
self
.
scalar_op
.
c_code_cache_version
())
version
.
extend
(
self
.
scalar_op
.
c_code_cache_version
())
for
i
in
node
.
inputs
+
node
.
outputs
:
for
i
in
node
.
inputs
+
node
.
outputs
:
version
.
extend
(
Scalar
(
dtype
=
i
.
type
.
dtype
)
.
c_code_cache_version
())
version
.
extend
(
Scalar
(
dtype
=
i
.
type
.
dtype
)
.
c_code_cache_version
())
...
@@ -1962,7 +1927,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -1962,7 +1927,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
in_dtype
=
node
.
inputs
[
0
]
.
dtype
in_dtype
=
node
.
inputs
[
0
]
.
dtype
out_dtype
=
node
.
outputs
[
0
]
.
dtype
out_dtype
=
node
.
outputs
[
0
]
.
dtype
acc_dtype
=
self
.
_acc_dtype
(
node
.
inputs
[
0
]
.
dtype
)
acc_dtype
=
self
.
_acc_dtype
(
node
.
inputs
[
0
]
.
dtype
)
flags
=
Kernel
.
get_flags
(
in_dtype
,
acc_dtype
,
out_dtype
)
flags
=
Kernel
.
get_flags
(
in_dtype
,
acc_dtype
,
out_dtype
)
in_type
=
gpuarray
.
dtype_to_ctype
(
in_dtype
)
in_type
=
gpuarray
.
dtype_to_ctype
(
in_dtype
)
out_type
=
gpuarray
.
dtype_to_ctype
(
out_dtype
)
out_type
=
gpuarray
.
dtype_to_ctype
(
out_dtype
)
acc_type
=
gpuarray
.
dtype_to_ctype
(
acc_dtype
)
acc_type
=
gpuarray
.
dtype_to_ctype
(
acc_dtype
)
...
@@ -2106,7 +2071,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -2106,7 +2071,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
]
]
kernels
.
append
(
Kernel
(
code
=
sio
.
getvalue
(),
name
=
kname
,
kernels
.
append
(
Kernel
(
code
=
sio
.
getvalue
(),
name
=
kname
,
params
=
params
,
flags
=
flags
,
objvar
=
k_var
))
params
=
params
,
flags
=
flags
,
objvar
=
k_var
))
#01, 011, 0111
#
01, 011, 0111
if
(
0
==
self
.
reduce_mask
[
0
]
and
if
(
0
==
self
.
reduce_mask
[
0
]
and
all
(
self
.
reduce_mask
[
1
:])
and
all
(
self
.
reduce_mask
[
1
:])
and
nd_in
in
[
2
,
3
,
4
]):
nd_in
in
[
2
,
3
,
4
]):
...
@@ -2303,10 +2268,10 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -2303,10 +2268,10 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
# this kernel uses one block for multiple column(up to 32TODO),
# this kernel uses one block for multiple column(up to 32TODO),
# threads per block for each element per column.
# threads per block for each element per column.
# thread.x = dim 2 contiguous
# thread.x = dim 2 contiguous
# thread.y = dim 1
# thread.y = dim 1
# block.x = dim 0
# block.x = dim 0
# block.y = dim 1 rest
# block.y = dim 1 rest
init
=
self
.
_k_init
(
node
,
nodename
)
init
=
self
.
_k_init
(
node
,
nodename
)
decl
,
kname
,
params
,
k_var
=
self
.
_k_decl
(
node
,
nodename
,
pattern
=
"010_inner"
)
decl
,
kname
,
params
,
k_var
=
self
.
_k_decl
(
node
,
nodename
,
pattern
=
"010_inner"
)
reducebuf
=
self
.
_k_reduce_buf_multiple
(
'Z[i0 * sZ0 + i2*sZ1]'
,
reducebuf
=
self
.
_k_reduce_buf_multiple
(
'Z[i0 * sZ0 + i2*sZ1]'
,
...
@@ -2625,7 +2590,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -2625,7 +2590,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
{},
True
)
{},
True
)
reduce_init
=
self
.
_assign_init
(
load_in
+
"(A[blockIdx.x * sA1])"
)
reduce_init
=
self
.
_assign_init
(
load_in
+
"(A[blockIdx.x * sA1])"
)
kname
=
"kernel_reduce_1011"
kname
=
"kernel_reduce_1011"
k_var
=
"kernel_reduce_1011_"
+
nodename
k_var
=
"kernel_reduce_1011_"
+
nodename
sio
=
StringIO
()
sio
=
StringIO
()
print
(
"""
print
(
"""
KERNEL void
%(kname)
s(
KERNEL void
%(kname)
s(
...
@@ -2753,7 +2718,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -2753,7 +2718,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
flags
=
Kernel
.
get_flags
(
node
.
inputs
[
0
]
.
type
.
dtype
,
flags
=
Kernel
.
get_flags
(
node
.
inputs
[
0
]
.
type
.
dtype
,
acc_dtype
,
acc_dtype
,
node
.
outputs
[
0
]
.
type
.
dtype
),
node
.
outputs
[
0
]
.
type
.
dtype
),
objvar
=
'k_reduk_'
+
name
)]
objvar
=
'k_reduk_'
+
name
)]
def
c_code
(
self
,
node
,
name
,
inp
,
out
,
sub
):
def
c_code
(
self
,
node
,
name
,
inp
,
out
,
sub
):
if
not
any
(
getattr
(
self
,
'redux'
,
[
node
.
inputs
[
0
]
.
ndim
!=
0
])):
if
not
any
(
getattr
(
self
,
'redux'
,
[
node
.
inputs
[
0
]
.
ndim
!=
0
])):
...
@@ -2768,7 +2733,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -2768,7 +2733,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
if (
%(sync)
d)
if (
%(sync)
d)
GpuArray_sync(&
%(out)
s->ga);
GpuArray_sync(&
%(out)
s->ga);
"""
%
dict
(
out
=
out
[
0
],
inp
=
inp
[
0
],
fail
=
sub
[
'fail'
],
"""
%
dict
(
out
=
out
[
0
],
inp
=
inp
[
0
],
fail
=
sub
[
'fail'
],
sync
=
bool
(
config
.
gpuarray
.
sync
))
sync
=
bool
(
config
.
gpuarray
.
sync
))
k
=
self
.
get_kernel_cache
(
node
)
k
=
self
.
get_kernel_cache
(
node
)
_
,
src
,
_
,
ls
=
k
.
_get_basic_kernel
(
k
.
init_local_size
,
_
,
src
,
_
,
ls
=
k
.
_get_basic_kernel
(
k
.
init_local_size
,
...
@@ -2816,7 +2781,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -2816,7 +2781,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
%(fail)
s
%(fail)
s
}
}
}
}
"""
%
dict
(
output
=
output
,
nd_out
=
nd_out
,
fail
=
sub
[
'fail'
],
"""
%
dict
(
output
=
output
,
nd_out
=
nd_out
,
fail
=
sub
[
'fail'
],
out_type
=
dtype_to_typecode
(
node
.
outputs
[
0
]
.
type
.
dtype
))
out_type
=
dtype_to_typecode
(
node
.
outputs
[
0
]
.
type
.
dtype
))
else
:
else
:
code
+=
"""
code
+=
"""
...
@@ -2828,7 +2793,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -2828,7 +2793,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
%(fail)
s
%(fail)
s
}
}
}
}
"""
%
dict
(
output
=
output
,
fail
=
sub
[
'fail'
],
"""
%
dict
(
output
=
output
,
fail
=
sub
[
'fail'
],
out_type
=
dtype_to_typecode
(
node
.
outputs
[
0
]
.
type
.
dtype
))
out_type
=
dtype_to_typecode
(
node
.
outputs
[
0
]
.
type
.
dtype
))
if
acc_dtype
!=
node
.
outputs
[
0
]
.
type
.
dtype
:
if
acc_dtype
!=
node
.
outputs
[
0
]
.
type
.
dtype
:
...
@@ -2837,12 +2802,13 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -2837,12 +2802,13 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
%(acc_type)
s, GA_C_ORDER, pygpu_default_context(),
%(acc_type)
s, GA_C_ORDER, pygpu_default_context(),
Py_None);
Py_None);
if (!tmp)
%(fail)
s
if (!tmp)
%(fail)
s
"""
%
dict
(
output
=
output
,
fail
=
sub
[
'fail'
],
acc_type
=
dtype_to_typecode
(
acc_dtype
))
"""
%
dict
(
output
=
output
,
fail
=
sub
[
'fail'
],
acc_type
=
dtype_to_typecode
(
acc_dtype
))
else
:
else
:
code
+=
"""
code
+=
"""
tmp =
%(output)
s;
tmp =
%(output)
s;
Py_INCREF(tmp);
Py_INCREF(tmp);
"""
%
dict
(
output
=
output
)
"""
%
dict
(
output
=
output
)
# We need the proxies since we are passing a pointer to the
# We need the proxies since we are passing a pointer to the
# data into the call and therefore we need a real copy of the
# data into the call and therefore we need a real copy of the
...
@@ -2850,7 +2816,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -2850,7 +2816,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
code
+=
"""
code
+=
"""
args[0] = &n;
args[0] = &n;
args[1] = tmp->ga.data;
args[1] = tmp->ga.data;
"""
%
dict
(
output
=
output
)
"""
%
dict
(
output
=
output
)
p
=
2
p
=
2
for
i
in
range
(
node
.
inputs
[
0
]
.
ndim
):
for
i
in
range
(
node
.
inputs
[
0
]
.
ndim
):
...
@@ -2858,7 +2824,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -2858,7 +2824,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
proxy_dim[
%(i)
s] =
%(input)
s->ga.dimensions[
%(i)
s];
proxy_dim[
%(i)
s] =
%(input)
s->ga.dimensions[
%(i)
s];
args[
%(p)
s] = &proxy_dim[
%(i)
s];
args[
%(p)
s] = &proxy_dim[
%(i)
s];
n *=
%(input)
s->ga.dimensions[
%(i)
s];
n *=
%(input)
s->ga.dimensions[
%(i)
s];
"""
%
dict
(
i
=
i
,
p
=
p
,
input
=
input
)
"""
%
dict
(
i
=
i
,
p
=
p
,
input
=
input
)
p
+=
1
p
+=
1
if
not
redux
[
i
]:
if
not
redux
[
i
]:
code
+=
"gs *=
%(input)
s->ga.dimensions[
%(i)
s];"
%
dict
(
input
=
input
,
i
=
i
)
code
+=
"gs *=
%(input)
s->ga.dimensions[
%(i)
s];"
%
dict
(
input
=
input
,
i
=
i
)
...
@@ -2867,14 +2833,14 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -2867,14 +2833,14 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
args[
%(p)
s] =
%(input)
s->ga.data;
args[
%(p)
s] =
%(input)
s->ga.data;
proxy_off =
%(input)
s->ga.offset;
proxy_off =
%(input)
s->ga.offset;
args[
%(p)
s+1] = &proxy_off;
args[
%(p)
s+1] = &proxy_off;
"""
%
dict
(
p
=
p
,
input
=
input
)
"""
%
dict
(
p
=
p
,
input
=
input
)
p
+=
2
p
+=
2
for
i
in
range
(
node
.
inputs
[
0
]
.
ndim
):
for
i
in
range
(
node
.
inputs
[
0
]
.
ndim
):
code
+=
"""
code
+=
"""
proxy_str[
%(i)
s] =
%(input)
s->ga.strides[
%(i)
s];
proxy_str[
%(i)
s] =
%(input)
s->ga.strides[
%(i)
s];
args[
%(p)
s] = &proxy_str[
%(i)
s];
args[
%(p)
s] = &proxy_str[
%(i)
s];
"""
%
dict
(
p
=
p
,
i
=
i
,
input
=
input
)
"""
%
dict
(
p
=
p
,
i
=
i
,
input
=
input
)
p
+=
1
p
+=
1
code
+=
"""
code
+=
"""
...
@@ -2911,14 +2877,14 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -2911,14 +2877,14 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
%(fail)
s
%(fail)
s
}
}
}
}
"""
%
dict
(
k_var
=
'k_reduk_'
+
name
,
sync
=
bool
(
config
.
gpuarray
.
sync
),
"""
%
dict
(
k_var
=
'k_reduk_'
+
name
,
sync
=
bool
(
config
.
gpuarray
.
sync
),
ls
=
ls
,
fail
=
sub
[
'fail'
],
output
=
output
,
input
=
input
,
ls
=
ls
,
fail
=
sub
[
'fail'
],
output
=
output
,
input
=
input
,
cast_out
=
bool
(
acc_dtype
!=
node
.
outputs
[
0
]
.
type
.
dtype
))
cast_out
=
bool
(
acc_dtype
!=
node
.
outputs
[
0
]
.
type
.
dtype
))
return
code
return
code
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
return
(
1
,
self
.
GpuKernelBase_version
)
return
(
2
,
self
.
GpuKernelBase_version
)
def
generate_kernel
(
self
,
node
,
odtype
,
redux
):
def
generate_kernel
(
self
,
node
,
odtype
,
redux
):
if
isinstance
(
self
.
scalar_op
,
scalar
.
basic
.
Add
):
if
isinstance
(
self
.
scalar_op
,
scalar
.
basic
.
Add
):
...
@@ -2942,8 +2908,8 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -2942,8 +2908,8 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
redux
=
self
.
redux
redux
=
self
.
redux
if
any
(
redux
):
if
any
(
redux
):
output
[
0
]
=
self
.
get_kernel_cache
(
node
)(
input
)
.
astype
(
copy
=
False
,
output
[
0
]
=
self
.
get_kernel_cache
(
node
)(
input
)
.
astype
(
dtype
=
node
.
outputs
[
0
]
.
type
.
dtype
)
copy
=
False
,
dtype
=
node
.
outputs
[
0
]
.
type
.
dtype
)
else
:
else
:
output
[
0
]
=
pygpu
.
gpuarray
.
array
(
input
,
copy
=
True
,
output
[
0
]
=
pygpu
.
gpuarray
.
array
(
input
,
copy
=
True
,
dtype
=
node
.
outputs
[
0
]
.
type
.
dtype
)
dtype
=
node
.
outputs
[
0
]
.
type
.
dtype
)
...
...
theano/sandbox/gpuarray/kernel_codegen.py
浏览文件 @
645557f9
...
@@ -4,11 +4,11 @@ Helper routines for generating gpu kernels for nvcc.
...
@@ -4,11 +4,11 @@ Helper routines for generating gpu kernels for nvcc.
"""
"""
try
:
try
:
import
pygpu
from
pygpu
import
gpuarray
from
pygpu
import
gpuarray
except
ImportError
:
except
ImportError
:
pass
pass
def
nvcc_kernel
(
name
,
params
,
body
):
def
nvcc_kernel
(
name
,
params
,
body
):
"""
"""
Return the c code of a kernel function.
Return the c code of a kernel function.
...
@@ -174,9 +174,8 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
...
@@ -174,9 +174,8 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
"""
"""
ctype
=
gpuarray
.
dtype_to_ctype
(
dtype
)
ctype
=
gpuarray
.
dtype_to_ctype
(
dtype
)
return
[
# get max of buf (trashing all but buf[0])
# get max of buf (trashing all but buf[0])
inline_reduce_max
(
N
,
buf
,
threadPos
,
threadCount
),
return
[
inline_reduce_max
(
N
,
buf
,
threadPos
,
threadCount
),
'__syncthreads()'
,
'__syncthreads()'
,
(
'
%
s row_max = '
+
buf
+
'[0]'
)
%
ctype
,
(
'
%
s row_max = '
+
buf
+
'[0]'
)
%
ctype
,
'__syncthreads()'
,
'__syncthreads()'
,
...
...
theano/sandbox/gpuarray/neighbours.py
浏览文件 @
645557f9
import
os
import
numpy
import
numpy
from
theano
import
Op
,
Apply
,
config
from
theano
import
Op
,
Apply
,
config
from
theano.gof
import
local_optimizer
from
theano.tensor.nnet.neighbours
import
Images2Neibs
from
theano.tensor.nnet.neighbours
import
Images2Neibs
import
theano.tensor
as
T
import
theano.tensor
as
T
try
:
try
:
import
pygpu
import
pygpu
from
pygpu
import
gpuarray
,
elemwise
from
pygpu
import
gpuarray
except
ImportError
:
except
ImportError
:
pass
pass
from
.basic_ops
import
(
as_gpuarray_variable
,
from
.basic_ops
import
as_gpuarray_variable
,
GpuKernelBase
,
Kernel
host_from_gpu
,
gpu_from_host
,
GpuKernelBase
,
Kernel
)
from
.opt
import
register_opt
as
register_gpu_opt
,
op_lifter
from
.opt
import
register_opt
as
register_gpu_opt
,
op_lifter
from
.type
import
GpuArrayType
from
.type
import
GpuArrayType
from
.comp
import
NVCC_compiler
class
GpuImages2Neibs
(
GpuKernelBase
,
Images2Neibs
,
Op
):
class
GpuImages2Neibs
(
GpuKernelBase
,
Images2Neibs
,
Op
):
...
@@ -45,27 +40,10 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
...
@@ -45,27 +40,10 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
dtype
=
ten4
.
type
.
dtype
)()])
dtype
=
ten4
.
type
.
dtype
)()])
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
return
(
1
0
,
1
)
return
(
1
1
,
)
def
c_headers
(
self
):
def
c_headers
(
self
):
if
pygpu
.
get_default_context
()
.
kind
==
'opencl'
:
return
[
'<numpy_compat.h>'
,
'<gpuarray/types.h>'
]
raise
MethodNotDefined
(
'cuda only'
)
return
[
'cuda.h'
,
'<gpuarray/extension.h>'
,
'<numpy_compat.h>'
,
'<gpuarray/ext_cuda.h>'
,
'<gpuarray/types.h>'
]
def
c_header_dirs
(
self
):
if
pygpu
.
get_default_context
()
.
kind
==
'opencl'
:
raise
MethodNotDefined
(
'cuda only'
)
cuda_root
=
config
.
cuda
.
root
if
cuda_root
:
return
[
os
.
path
.
join
(
cuda_root
,
'include'
)]
else
:
return
[]
def
c_init_code
(
self
):
if
pygpu
.
get_default_context
()
.
kind
==
'opencl'
:
raise
MethodNotDefined
(
'cuda only'
)
return
[
'setup_ext_cuda();'
]
def
gpu_kernels
(
self
,
node
,
nodename
):
def
gpu_kernels
(
self
,
node
,
nodename
):
dtype_ten4
=
node
.
inputs
[
0
]
.
dtype
dtype_ten4
=
node
.
inputs
[
0
]
.
dtype
...
...
theano/sandbox/gpuarray/nerv.py
浏览文件 @
645557f9
...
@@ -176,13 +176,13 @@ def local_dot_to_gemm16(node):
...
@@ -176,13 +176,13 @@ def local_dot_to_gemm16(node):
@opt.register_opt
()
@opt.register_opt
()
@alpha_merge
(
Gemm16
,
alpha_in
=
1
,
beta_in
=
4
,
nd
=
2
)
@alpha_merge
(
Gemm16
,
alpha_in
=
1
,
beta_in
=
4
)
def
local_gemm16_alpha_merge
(
node
,
*
inputs
):
def
local_gemm16_alpha_merge
(
node
,
*
inputs
):
return
[
Gemm16
(
relu
=
node
.
op
.
relu
)(
*
inputs
)]
return
[
Gemm16
(
relu
=
node
.
op
.
relu
)(
*
inputs
)]
@opt.register_opt
()
@opt.register_opt
()
@output_merge
(
Gemm16
,
alpha_in
=
1
,
beta_in
=
4
,
out_in
=
0
,
nd
=
2
)
@output_merge
(
Gemm16
,
alpha_in
=
1
,
beta_in
=
4
,
out_in
=
0
)
def
local_gemm16_output_merge
(
node
,
*
inputs
):
def
local_gemm16_output_merge
(
node
,
*
inputs
):
return
[
Gemm16
(
relu
=
node
.
op
.
relu
)(
*
inputs
)]
return
[
Gemm16
(
relu
=
node
.
op
.
relu
)(
*
inputs
)]
...
...
theano/sandbox/gpuarray/nnet.py
浏览文件 @
645557f9
from
__future__
import
print_function
from
__future__
import
print_function
import
numpy
import
numpy
import
os
from
theano
import
Op
,
Apply
,
config
from
theano
import
Op
,
Apply
,
config
from
six
import
StringIO
from
six
import
StringIO
try
:
try
:
import
pygpu
import
pygpu
from
pygpu
import
gpuarray
,
elemwise
from
pygpu
import
gpuarray
except
ImportError
:
except
ImportError
:
pass
pass
...
@@ -41,16 +40,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
...
@@ -41,16 +40,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
am
=
y_idx
.
type
()
am
=
y_idx
.
type
()
return
Apply
(
self
,
[
x
,
b
,
y_idx
],
[
nll
,
sm
,
am
])
return
Apply
(
self
,
[
x
,
b
,
y_idx
],
[
nll
,
sm
,
am
])
def
c_header_dirs
(
self
):
if
pygpu
.
get_default_context
()
.
kind
==
'opencl'
:
raise
MethodNotDefined
(
'cuda only'
)
cuda_root
=
config
.
cuda
.
root
if
cuda_root
:
return
[
os
.
path
.
join
(
cuda_root
,
'include'
)]
def
c_headers
(
self
):
def
c_headers
(
self
):
return
[
'cuda.h'
,
'<gpuarray/extension.h>'
,
'<numpy_compat.h>'
,
return
[
'<numpy_compat.h>'
,
'<gpuarray/types.h>'
]
'<gpuarray/types.h>'
]
def
gpu_kernels
(
self
,
node
,
nodename
):
def
gpu_kernels
(
self
,
node
,
nodename
):
dtype_x
=
node
.
inputs
[
0
]
.
dtype
dtype_x
=
node
.
inputs
[
0
]
.
dtype
...
@@ -302,7 +293,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
...
@@ -302,7 +293,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
return
sio
.
getvalue
()
return
sio
.
getvalue
()
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
return
(
7
,)
return
(
8
,)
gpu_crossentropy_softmax_argmax_1hot_with_bias
=
GpuCrossentropySoftmaxArgmax1HotWithBias
()
gpu_crossentropy_softmax_argmax_1hot_with_bias
=
GpuCrossentropySoftmaxArgmax1HotWithBias
()
...
@@ -328,18 +319,10 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
...
@@ -328,18 +319,10 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
return
Apply
(
self
,
[
dnll
,
sm
,
y_idx
],
[
sm
.
type
()])
return
Apply
(
self
,
[
dnll
,
sm
,
y_idx
],
[
sm
.
type
()])
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
return
(
10
,)
return
(
11
,)
def
c_header_dirs
(
self
):
if
pygpu
.
get_default_context
()
.
kind
==
'opencl'
:
raise
MethodNotDefined
(
'cuda only'
)
cuda_root
=
config
.
cuda
.
root
if
cuda_root
:
return
[
os
.
path
.
join
(
cuda_root
,
'include'
)]
def
c_headers
(
self
):
def
c_headers
(
self
):
return
[
'cuda.h'
,
'<gpuarray/extension.h>'
,
'<numpy_compat.h>'
,
return
[
'<numpy_compat.h>'
,
'<gpuarray/types.h>'
]
'<gpuarray/types.h>'
]
def
c_code
(
self
,
node
,
nodename
,
inp
,
out
,
sub
):
def
c_code
(
self
,
node
,
nodename
,
inp
,
out
,
sub
):
typecode_dx
=
pygpu
.
gpuarray
.
dtype_to_typecode
(
node
.
outputs
[
0
]
.
dtype
)
typecode_dx
=
pygpu
.
gpuarray
.
dtype_to_typecode
(
node
.
outputs
[
0
]
.
dtype
)
...
@@ -541,21 +524,10 @@ class GpuSoftmax(GpuKernelBase, Op):
...
@@ -541,21 +524,10 @@ class GpuSoftmax(GpuKernelBase, Op):
return
shape
return
shape
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
return
(
14
,)
+
inline_softmax
.
code_version
return
(
15
,)
+
inline_softmax
.
code_version
def
c_header_dirs
(
self
):
if
pygpu
.
get_default_context
()
.
kind
==
'opencl'
:
raise
MethodNotDefined
(
'cuda only'
)
cuda_root
=
config
.
cuda
.
root
if
cuda_root
:
return
[
os
.
path
.
join
(
cuda_root
,
'include'
)]
def
c_headers
(
self
):
def
c_headers
(
self
):
return
[
'cuda.h'
,
'<gpuarray/extension.h>'
,
'<numpy_compat.h>'
,
return
[
'<numpy_compat.h>'
,
'<gpuarray/types.h>'
]
'<gpuarray/ext_cuda.h>'
,
'<gpuarray/types.h>'
]
def
c_init_code
(
self
):
return
[
'setup_ext_cuda();'
]
def
c_code
(
self
,
node
,
nodename
,
inp
,
out
,
sub
):
def
c_code
(
self
,
node
,
nodename
,
inp
,
out
,
sub
):
dtype_x
=
node
.
inputs
[
0
]
.
dtype
dtype_x
=
node
.
inputs
[
0
]
.
dtype
...
@@ -665,15 +637,15 @@ class GpuSoftmax(GpuKernelBase, Op):
...
@@ -665,15 +637,15 @@ class GpuSoftmax(GpuKernelBase, Op):
]
]
kernels
=
[]
kernels
=
[]
kname
=
"kSoftmax"
kname
=
"kSoftmax"
k_var
=
"kSoftmax_"
+
nodename
k_var
=
"kSoftmax_"
+
nodename
code
=
nvcc_kernel
(
kname
,
code
=
nvcc_kernel
(
kname
,
params
=
[
'const ga_size M'
,
'const ga_size N'
,
params
=
[
'const ga_size M'
,
'const ga_size N'
,
'const
%
s * x'
%
type_x
,
'const ga_size offset_x'
,
'const
%
s * x'
%
type_x
,
'const ga_size offset_x'
,
'const ga_ssize sx0'
,
'const ga_ssize sx1'
,
'const ga_ssize sx0'
,
'const ga_ssize sx1'
,
'
%
s * sm'
%
type_sm
,
'const ga_size offset_sm'
,
'
%
s * sm'
%
type_sm
,
'const ga_size offset_sm'
,
'const ga_ssize sm_s0'
,
'const ga_ssize sm_s1'
],
'const ga_ssize sm_s0'
,
'const ga_ssize sm_s1'
],
body
=
[
body
=
[
"extern __shared__
%
s buf[]"
%
type_acc
,
"extern __shared__
%
s buf[]"
%
type_acc
,
"
%
s * buf2 = buf + N"
%
type_acc
,
"
%
s * buf2 = buf + N"
%
type_acc
,
"x = (const
%
s *)(((char *)x)+offset_x)"
%
type_x
,
"x = (const
%
s *)(((char *)x)+offset_x)"
%
type_x
,
"sm = (
%
s *)(((char *)sm)+offset_sm)"
%
type_sm
,
"sm = (
%
s *)(((char *)sm)+offset_sm)"
%
type_sm
,
...
@@ -696,15 +668,15 @@ class GpuSoftmax(GpuKernelBase, Op):
...
@@ -696,15 +668,15 @@ class GpuSoftmax(GpuKernelBase, Op):
kernels
.
append
(
Kernel
(
code
=
code
,
name
=
kname
,
params
=
params
,
kernels
.
append
(
Kernel
(
code
=
code
,
name
=
kname
,
params
=
params
,
flags
=
flags
,
objvar
=
k_var
))
flags
=
flags
,
objvar
=
k_var
))
kname
=
"kSoftmax_fixed_shared"
kname
=
"kSoftmax_fixed_shared"
k_var
=
"kSoftmax_fixed_shared"
+
nodename
k_var
=
"kSoftmax_fixed_shared"
+
nodename
code
=
nvcc_kernel
(
kname
,
code
=
nvcc_kernel
(
kname
,
params
=
[
'const ga_size M'
,
'const ga_size N'
,
params
=
[
'const ga_size M'
,
'const ga_size N'
,
'const
%
s * x'
%
type_x
,
'const ga_size offset_x'
,
'const
%
s * x'
%
type_x
,
'const ga_size offset_x'
,
'const ga_ssize sx0'
,
'const ga_ssize sx1'
,
'const ga_ssize sx0'
,
'const ga_ssize sx1'
,
'
%
s * sm'
%
type_sm
,
'const ga_size offset_sm'
,
'
%
s * sm'
%
type_sm
,
'const ga_size offset_sm'
,
'const ga_ssize sm_s0'
,
'const ga_ssize sm_s1'
],
'const ga_ssize sm_s0'
,
'const ga_ssize sm_s1'
],
body
=
[
body
=
[
"extern __shared__
%
s buf[]"
%
type_acc
,
"extern __shared__
%
s buf[]"
%
type_acc
,
"x = (const
%
s *)(((char *)x)+offset_x)"
%
type_x
,
"x = (const
%
s *)(((char *)x)+offset_x)"
%
type_x
,
"sm = (
%
s *)(((char *)sm)+offset_sm)"
%
type_sm
,
"sm = (
%
s *)(((char *)sm)+offset_sm)"
%
type_sm
,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
"for (int blockIDX = blockIdx.x; blockIDX < M;"
...
@@ -746,23 +718,10 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
...
@@ -746,23 +718,10 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
return
[
shape
[
0
]]
return
[
shape
[
0
]]
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
return
(
13
,)
+
inline_softmax
.
code_version
return
(
14
,)
+
inline_softmax
.
code_version
def
c_header_dirs
(
self
):
if
pygpu
.
get_default_context
()
.
kind
==
'opencl'
:
raise
MethodNotDefined
(
'cuda only'
)
cuda_root
=
config
.
cuda
.
root
if
cuda_root
:
return
[
os
.
path
.
join
(
cuda_root
,
'include'
)]
else
:
return
[]
def
c_headers
(
self
):
def
c_headers
(
self
):
return
[
'cuda.h'
,
'<gpuarray/extension.h>'
,
'<numpy_compat.h>'
,
return
[
'<numpy_compat.h>'
,
'<gpuarray/types.h>'
]
'<gpuarray/ext_cuda.h>'
,
'<gpuarray/types.h>'
]
def
c_init_code
(
self
):
return
[
'setup_ext_cuda();'
]
def
c_code
(
self
,
node
,
nodename
,
inp
,
out
,
sub
):
def
c_code
(
self
,
node
,
nodename
,
inp
,
out
,
sub
):
dtype_x
=
node
.
inputs
[
0
]
.
dtype
dtype_x
=
node
.
inputs
[
0
]
.
dtype
...
@@ -892,7 +851,8 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
...
@@ -892,7 +851,8 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
kernels
=
[]
kernels
=
[]
kname
=
"kSoftmaxWithBias"
kname
=
"kSoftmaxWithBias"
k_var
=
"kSoftmaxWithBias_"
+
nodename
k_var
=
"kSoftmaxWithBias_"
+
nodename
code
=
nvcc_kernel
(
kname
,
code
=
nvcc_kernel
(
kname
,
params
=
[
'const ga_size M'
,
'const ga_size N'
,
params
=
[
'const ga_size M'
,
'const ga_size N'
,
'const
%
s * x'
%
type_x
,
'const ga_size offset_x'
,
'const
%
s * x'
%
type_x
,
'const ga_size offset_x'
,
'const ga_ssize sx0'
,
'const ga_ssize sx1'
,
'const ga_ssize sx0'
,
'const ga_ssize sx1'
,
...
@@ -900,8 +860,7 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
...
@@ -900,8 +860,7 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
'const ga_ssize sb0'
,
'const ga_ssize sb0'
,
'
%
s * sm'
%
type_sm
,
'const ga_size offset_sm'
,
'
%
s * sm'
%
type_sm
,
'const ga_size offset_sm'
,
'const ga_ssize sm_s0'
,
'const ga_ssize sm_s1'
],
'const ga_ssize sm_s0'
,
'const ga_ssize sm_s1'
],
body
=
[
body
=
[
"extern __shared__
%
s buf[]"
%
type_acc
,
"extern __shared__
%
s buf[]"
%
type_acc
,
"
%
s * buf2 = buf + N"
%
type_acc
,
"
%
s * buf2 = buf + N"
%
type_acc
,
"x = (const
%
s *)(((char *)x)+offset_x)"
%
type_x
,
"x = (const
%
s *)(((char *)x)+offset_x)"
%
type_x
,
"b = (const
%
s *)(((char *)b)+offset_b)"
%
type_b
,
"b = (const
%
s *)(((char *)b)+offset_b)"
%
type_b
,
...
@@ -926,7 +885,8 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
...
@@ -926,7 +885,8 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
flags
=
flags
,
objvar
=
k_var
))
flags
=
flags
,
objvar
=
k_var
))
kname
=
"kSoftmaxWithBias_fixed_shared"
kname
=
"kSoftmaxWithBias_fixed_shared"
k_var
=
"kSoftmaxWithBias_fixed_shared"
+
nodename
k_var
=
"kSoftmaxWithBias_fixed_shared"
+
nodename
code
=
nvcc_kernel
(
kname
,
code
=
nvcc_kernel
(
kname
,
params
=
[
'const ga_size M'
,
'const ga_size N'
,
params
=
[
'const ga_size M'
,
'const ga_size N'
,
'const
%
s * x'
%
type_x
,
'const ga_size offset_x'
,
'const
%
s * x'
%
type_x
,
'const ga_size offset_x'
,
'const ga_ssize sx0'
,
'const ga_ssize sx1'
,
'const ga_ssize sx0'
,
'const ga_ssize sx1'
,
...
@@ -934,8 +894,7 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
...
@@ -934,8 +894,7 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
'const ga_ssize sb0'
,
'const ga_ssize sb0'
,
'
%
s * sm'
%
type_sm
,
'const ga_size offset_sm'
,
'
%
s * sm'
%
type_sm
,
'const ga_size offset_sm'
,
'const ga_ssize sm_s0'
,
'const ga_ssize sm_s1'
],
'const ga_ssize sm_s0'
,
'const ga_ssize sm_s1'
],
body
=
[
body
=
[
"extern __shared__
%
s buf[]"
%
type_acc
,
"extern __shared__
%
s buf[]"
%
type_acc
,
"x = (const
%
s *)(((char *)x)+offset_x)"
%
type_x
,
"x = (const
%
s *)(((char *)x)+offset_x)"
%
type_x
,
"b = (const
%
s *)(((char *)b)+offset_b)"
%
type_b
,
"b = (const
%
s *)(((char *)b)+offset_b)"
%
type_b
,
"sm = (
%
s *)(((char *)sm)+offset_sm)"
%
type_sm
,
"sm = (
%
s *)(((char *)sm)+offset_sm)"
%
type_sm
,
...
...
theano/sandbox/gpuarray/opt.py
浏览文件 @
645557f9
...
@@ -645,13 +645,13 @@ def local_gpua_hgemm(node):
...
@@ -645,13 +645,13 @@ def local_gpua_hgemm(node):
@register_opt
()
@register_opt
()
@alpha_merge
(
GpuGemm
,
alpha_in
=
1
,
beta_in
=
4
,
nd
=
2
)
@alpha_merge
(
GpuGemm
,
alpha_in
=
1
,
beta_in
=
4
)
def
local_gpuagemm_alpha_merge
(
node
,
*
inputs
):
def
local_gpuagemm_alpha_merge
(
node
,
*
inputs
):
return
[
gpugemm_no_inplace
(
*
inputs
)]
return
[
gpugemm_no_inplace
(
*
inputs
)]
@register_opt
()
@register_opt
()
@output_merge
(
GpuGemm
,
alpha_in
=
1
,
beta_in
=
4
,
out_in
=
0
,
nd
=
2
)
@output_merge
(
GpuGemm
,
alpha_in
=
1
,
beta_in
=
4
,
out_in
=
0
)
def
local_gpuagemm_output_merge
(
node
,
*
inputs
):
def
local_gpuagemm_output_merge
(
node
,
*
inputs
):
return
[
gpugemm_no_inplace
(
*
inputs
)]
return
[
gpugemm_no_inplace
(
*
inputs
)]
...
...
theano/sandbox/gpuarray/opt_util.py
浏览文件 @
645557f9
...
@@ -7,23 +7,35 @@ from theano.gof import local_optimizer
...
@@ -7,23 +7,35 @@ from theano.gof import local_optimizer
from
theano.tensor
import
(
DimShuffle
,
get_scalar_constant_value
,
from
theano.tensor
import
(
DimShuffle
,
get_scalar_constant_value
,
NotScalarConstantError
)
NotScalarConstantError
)
from
.basic_ops
import
GpuFromHost
,
HostFromGpu
from
.basic_ops
import
GpuFromHost
,
HostFromGpu
,
GpuAllocEmpty
from
.elemwise
import
GpuDimShuffle
,
GpuElemwise
from
.elemwise
import
GpuDimShuffle
,
GpuElemwise
_one
=
scal
.
constant
(
numpy
.
asarray
(
1.0
,
dtype
=
'float64'
))
_one
=
scal
.
constant
(
numpy
.
asarray
(
1.0
,
dtype
=
'float64'
))
def
grab_cpu_scalar
(
v
,
nd
):
def
grab_cpu_scalar
(
v
,
nd
):
"""
Get a scalar variable value from the tree at `v`.
This function will dig through transfers and dimshuffles to get
the constant value. If no such constant is found, it returns None.
Parameters
----------
v : variable
Theano variable to extract the constant value from.
nd : int
Expected number of dimensions for the variable (for
broadcasted constants).
"""
if
v
.
owner
is
not
None
:
if
v
.
owner
is
not
None
:
n
=
v
.
owner
n
=
v
.
owner
if
(
isinstance
(
n
.
op
,
GpuDimShuffle
)
and
if
(
isinstance
(
n
.
op
,
(
GpuDimShuffle
,
DimShuffle
))
and
n
.
op
.
new_order
==
(
'x'
,)
*
nd
):
return
grab_cpu_scalar
(
n
.
inputs
[
0
])
elif
(
isinstance
(
n
.
op
,
DimShuffle
)
and
n
.
op
.
new_order
==
(
'x'
,)
*
nd
):
n
.
op
.
new_order
==
(
'x'
,)
*
nd
):
return
grab_cpu_scalar
(
n
.
inputs
[
0
])
return
grab_cpu_scalar
(
n
.
inputs
[
0
]
,
n
.
inputs
[
0
]
.
ndim
)
elif
isinstance
(
n
.
op
,
GpuFromHost
):
elif
isinstance
(
n
.
op
,
(
GpuFromHost
,
HostFromGpu
)
):
return
grab_cpu_scalar
(
n
.
inputs
[
0
],
nd
=
nd
)
return
grab_cpu_scalar
(
n
.
inputs
[
0
],
nd
)
else
:
else
:
return
None
return
None
else
:
else
:
...
@@ -33,10 +45,24 @@ def grab_cpu_scalar(v, nd):
...
@@ -33,10 +45,24 @@ def grab_cpu_scalar(v, nd):
def
find_node
(
v
,
cls
,
ignore_clients
=
False
):
def
find_node
(
v
,
cls
,
ignore_clients
=
False
):
# This digs through possibly redundant transfers to for the node
"""
# that has the op class specified. If ignore_clients is False (the
Find the node that has an op of of type `cls` in `v`.
# default) it will only dig through nodes that have a single
# client.
This digs through possibly redundant transfers to for the node
that has the type `cls`. If `ignore_clients` is False (the
default) it will only dig through nodes that have a single client
to avoid duplicating computations.
Parameters
----------
v : variable
The variable to dig through
cls : Op class
The type of the node we are looking for
ignore_clients : bool, optional
Whether to ignore multiple clients or not.
"""
if
v
.
owner
is
not
None
and
(
ignore_clients
or
len
(
v
.
clients
)
==
1
):
if
v
.
owner
is
not
None
and
(
ignore_clients
or
len
(
v
.
clients
)
==
1
):
if
isinstance
(
v
.
owner
.
op
,
cls
):
if
isinstance
(
v
.
owner
.
op
,
cls
):
return
v
.
owner
return
v
.
owner
...
@@ -50,8 +76,20 @@ def find_node(v, cls, ignore_clients=False):
...
@@ -50,8 +76,20 @@ def find_node(v, cls, ignore_clients=False):
def
is_equal
(
var
,
val
):
def
is_equal
(
var
,
val
):
# Returns True if var is always equal to val (python value), False
"""
# otherwise (including if var is not constant)
Returns True if `var` is always equal to `val`.
This will only return True if the variable will always be equal to
the value. If it might not be true in some cases then it returns False.
Parameters
----------
var : variable
Variable to compare
val : value
Python value
"""
try
:
try
:
v
=
get_scalar_constant_value
(
var
)
v
=
get_scalar_constant_value
(
var
)
return
v
==
val
return
v
==
val
...
@@ -59,7 +97,57 @@ def is_equal(var, val):
...
@@ -59,7 +97,57 @@ def is_equal(var, val):
return
False
return
False
def
alpha_merge
(
cls
,
alpha_in
,
beta_in
,
nd
):
def
alpha_merge
(
cls
,
alpha_in
,
beta_in
):
"""
Decorator to merge multiplication by a scalar on the output.
This will find a pattern of scal * <yourop>(some, params, alpha,
beta) and update it so that the scalar multiplication happens as
part of your op.
The op needs to accept an alpha and a beta scalar which act this way:
out = Op() * alpha + out_like * beta
Where out_like is a buffer that has the same size as the output
and gets added to the "real" output of the operation. An example
of an operation that respects this pattern is GEMM from blas.
The decorated function must have this signature:
maker(node, *inputs)
The `node` argument you recieve is the original apply node that
contains your op. You should use it to grab relevant properties
for your op so that the new version performs the same computation.
The `*inputs` parameters contains the new inputs for your op. You
MUST use those inputs instead of the ones on `node`. Note that
this function can be as simple as:
def maker(node, *inputs):
return node.op(*inputs)
Parameters
----------
cls : op class
The class of the op you want to merge
alpha_in : int
The input index for the alpha scalar for your op (in node.inputs).
beta_in : int
The input index for the beta scalar for your op (in node.inputs).
Returns
-------
This returns an unregistered local optimizer that has the same
name as the decorated function.
Notes
-----
This was factored out since the code to deal with intervening
transfers and correctness in the presence of different values of
alpha and beta scaling factors is not trivial.
"""
def
wrapper
(
maker
):
def
wrapper
(
maker
):
@local_optimizer
([
GpuElemwise
])
@local_optimizer
([
GpuElemwise
])
@wraps
(
maker
)
@wraps
(
maker
)
...
@@ -70,11 +158,14 @@ def alpha_merge(cls, alpha_in, beta_in, nd):
...
@@ -70,11 +158,14 @@ def alpha_merge(cls, alpha_in, beta_in, nd):
targ
=
find_node
(
node
.
inputs
[
0
],
cls
)
targ
=
find_node
(
node
.
inputs
[
0
],
cls
)
if
targ
is
None
:
if
targ
is
None
:
targ
=
find_node
(
node
.
inputs
[
1
],
cls
)
targ
=
find_node
(
node
.
inputs
[
1
],
cls
)
lr
=
grab_cpu_scalar
(
node
.
inputs
[
0
],
nd
=
nd
)
if
targ
is
None
:
return
lr
=
grab_cpu_scalar
(
node
.
inputs
[
0
],
nd
=
targ
.
outputs
[
0
]
.
ndim
)
else
:
else
:
lr
=
grab_cpu_scalar
(
node
.
inputs
[
1
],
nd
=
nd
)
lr
=
grab_cpu_scalar
(
node
.
inputs
[
1
],
if
(
lr
is
None
or
targ
is
None
or
nd
=
targ
.
outputs
[
0
]
.
ndim
)
lr
.
dtype
!=
targ
.
outputs
[
0
]
.
dtype
)
:
if
lr
is
None
or
lr
.
dtype
!=
targ
.
outputs
[
0
]
.
dtype
:
return
None
return
None
inputs
=
list
(
targ
.
inputs
)
inputs
=
list
(
targ
.
inputs
)
try
:
try
:
...
@@ -96,7 +187,62 @@ def alpha_merge(cls, alpha_in, beta_in, nd):
...
@@ -96,7 +187,62 @@ def alpha_merge(cls, alpha_in, beta_in, nd):
return
wrapper
return
wrapper
def
output_merge
(
cls
,
alpha_in
,
beta_in
,
out_in
,
nd
):
def
output_merge
(
cls
,
alpha_in
,
beta_in
,
out_in
):
"""
Decorator to merge addition by a value on the output.
This will find a pattern of val * <yourop>(some, params, alpha,
beta, out_like) and update it so that the addtition happens as
part of your op.
The op needs to accept an alpha and a beta scalar which act this way:
out = Op() * alpha + out_like * beta
Where out_like is a buffer that has the same size as the output
and gets added to the "real" output of the operation. An example
of an operation that respects this pattern is GEMM from blas.
The decorated function must have this signature:
maker(node, *inputs)
The `node` argument you recieve is the original apply node that
contains your op. You should use it to grab relevant properties
for your op so that the new version performs the same computation.
The `*inputs` parameters contains the new inputs for your op. You
MUST use those inputs instead of the ones on `node`. Note that
this function can be as simple as:
def maker(node, *inputs):
return node.op(*inputs)
Parameters
----------
cls : op class
The class of the op you want to merge
alpha_in : int
The input index for the alpha scalar for your op (in node.inputs).
beta_in : int
The input index for the beta scalar for your op (in node.inputs).
out_in : int
The input index for the out_like input for your op (in node.inputs).
Returns
-------
This returns an unregistered local optimizer that has the same
name as the decorated function.
Notes
-----
This was factored out since the code to deal with intervening
transfers and correctness in the presence of different values of
alpha and beta scaling factors is not trivial.
This also correctly handles the case where the added value is
broadcasted (by not performing the replacement).
"""
def
wrapper
(
maker
):
def
wrapper
(
maker
):
@local_optimizer
([
GpuElemwise
])
@local_optimizer
([
GpuElemwise
])
@wraps
(
maker
)
@wraps
(
maker
)
...
@@ -126,3 +272,56 @@ def output_merge(cls, alpha_in, beta_in, out_in, nd):
...
@@ -126,3 +272,56 @@ def output_merge(cls, alpha_in, beta_in, out_in, nd):
return
maker
(
targ
,
*
inputs
)
return
maker
(
targ
,
*
inputs
)
return
opt
return
opt
return
wrapper
return
wrapper
def
inplace_allocempty
(
op
,
idx
):
"""
Wrapper to make an inplace optimization that deals with AllocEmpty
This will duplicate the alloc input if it has more than one client
to allow the op to work on it inplace.
The decorated function must have this signature:
maker(node, inputs)
The `node` argument you recieve is the original apply node that
contains your op. You should use it to grab relevant properties
for your op so that the new version performs the same computation.
You should also switch the op to work inplace. The `*inputs`
parameters contains the new inputs for your op. You MUST use
those inputs instead of the ones on `node`. Note that this
function can be as simple as:
def maker(node, inputs):
return node.op.__class__(inplace=True)(*inputs)
Parameters
----------
op : op class
The op class to look for to make inplace
idx : int
The index of the (possibly) AllocEmpty input (in node.inputs).
Returns
-------
This returns an unregistered inplace local optimizer that has the
same name as the decorated function.
"""
def
wrapper
(
maker
):
@local_optimizer
([
op
],
inplace
=
True
)
@wraps
(
maker
)
def
opt
(
node
):
if
type
(
node
.
op
)
!=
op
or
node
.
op
.
inplace
:
return
inputs
=
list
(
node
.
inputs
)
alloc
=
inputs
[
idx
]
if
(
alloc
.
owner
and
isinstance
(
alloc
.
owner
.
op
,
GpuAllocEmpty
)
and
len
(
alloc
.
clients
)
>
1
):
alloc_op
=
GpuAllocEmpty
(
alloc
.
owner
.
op
.
dtype
)
inputs
[
idx
]
=
alloc_op
(
*
alloc
.
owner
.
inputs
)
return
maker
(
node
,
inputs
)
return
opt
return
wrapper
theano/sandbox/gpuarray/subtensor.py
浏览文件 @
645557f9
...
@@ -180,19 +180,9 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
...
@@ -180,19 +180,9 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
def
_f16_ok
(
self
):
def
_f16_ok
(
self
):
return
self
.
iadd_node
.
op
.
_f16_ok
return
self
.
iadd_node
.
op
.
_f16_ok
def
c_header_dirs
(
self
):
cuda_root
=
config
.
cuda
.
root
if
cuda_root
:
return
[
os
.
path
.
join
(
cuda_root
,
'include'
)]
else
:
return
[]
def
c_headers
(
self
):
def
c_headers
(
self
):
return
self
.
iadd_node
.
op
.
c_headers
()
return
self
.
iadd_node
.
op
.
c_headers
()
def
c_compiler
(
self
):
return
self
.
iadd_node
.
op
.
c_compiler
()
def
c_init_code
(
self
):
def
c_init_code
(
self
):
return
self
.
iadd_node
.
op
.
c_init_code
()
return
self
.
iadd_node
.
op
.
c_init_code
()
...
@@ -404,7 +394,7 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
...
@@ -404,7 +394,7 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
elemwise_version
=
self
.
iadd_node
.
c_code_cache_version
()
elemwise_version
=
self
.
iadd_node
.
c_code_cache_version
()
if
not
parent_version
or
not
elemwise_version
:
if
not
parent_version
or
not
elemwise_version
:
return
return
return
parent_version
+
elemwise_version
+
(
2
,)
return
parent_version
+
elemwise_version
+
(
3
,)
class
GpuAdvancedSubtensor1
(
HideC
,
tensor
.
AdvancedSubtensor1
):
class
GpuAdvancedSubtensor1
(
HideC
,
tensor
.
AdvancedSubtensor1
):
...
...
theano/sandbox/gpuarray/tests/test_basic_ops.py
浏览文件 @
645557f9
import
unittest
import
unittest
from
theano.compat
import
izip
from
theano.compat
import
izip
from
copy
import
copy
,
deepcopy
from
six
import
iteritems
from
six
import
iteritems
...
@@ -13,16 +12,31 @@ from theano.tensor.basic import alloc
...
@@ -13,16 +12,31 @@ from theano.tensor.basic import alloc
# Don't import test classes otherwise they get tested as part of the file
# Don't import test classes otherwise they get tested as part of the file
from
theano.tensor.tests
import
test_basic
from
theano.tensor.tests
import
test_basic
from
theano.tensor.tests.test_basic
import
rand
,
safe_make_node
from
theano.tensor.tests.test_basic
import
rand
,
safe_make_node
from
theano.tests
import
unittest_tools
as
utt
from
theano.tests.unittest_tools
import
SkipTest
from
theano.tests.unittest_tools
import
SkipTest
import
theano.sandbox.gpuarray
import
theano.sandbox.gpuarray
from
..type
import
(
GpuArrayType
,
gpuarray_shared_constructor
)
from
..basic_ops
import
(
host_from_gpu
,
gpu_from_host
,
HostFromGpu
,
GpuFromHost
,
GpuReshape
,
gpu_alloc
,
GpuAlloc
,
GpuAllocEmpty
,
GpuContiguous
,
gpu_join
,
GpuJoin
,
GpuSplit
,
GpuEye
,
gpu_contiguous
)
from
..subtensor
import
GpuSubtensor
import
theano.sandbox.cuda
as
cuda_ndarray
try
:
from
pygpu
import
gpuarray
except
:
pass
if
theano
.
sandbox
.
gpuarray
.
pygpu
is
None
:
if
theano
.
sandbox
.
gpuarray
.
pygpu
is
None
:
raise
SkipTest
(
"pygpu not installed"
)
raise
SkipTest
(
"pygpu not installed"
)
# If you are writing a new test file, don't copy this code, but rather
# If you are writing a new test file, don't copy this code, but rather
# import stuff from this file (like mode_with_gpu) to reuse it.
# import stuff from this file (like mode_with_gpu) to reuse it.
import
theano.sandbox.cuda
as
cuda_ndarray
if
cuda_ndarray
.
cuda_available
and
not
theano
.
sandbox
.
gpuarray
.
pygpu_activated
:
if
cuda_ndarray
.
cuda_available
and
not
theano
.
sandbox
.
gpuarray
.
pygpu_activated
:
if
not
cuda_ndarray
.
use
.
device_number
:
if
not
cuda_ndarray
.
use
.
device_number
:
# We should not enable all the use like the flag device=gpu,
# We should not enable all the use like the flag device=gpu,
...
@@ -36,25 +50,9 @@ if cuda_ndarray.cuda_available and not theano.sandbox.gpuarray.pygpu_activated:
...
@@ -36,25 +50,9 @@ if cuda_ndarray.cuda_available and not theano.sandbox.gpuarray.pygpu_activated:
if
not
theano
.
sandbox
.
gpuarray
.
pygpu_activated
:
if
not
theano
.
sandbox
.
gpuarray
.
pygpu_activated
:
raise
SkipTest
(
"pygpu disabled"
)
raise
SkipTest
(
"pygpu disabled"
)
from
..type
import
(
GpuArrayType
,
gpuarray_shared_constructor
)
from
..basic_ops
import
(
host_from_gpu
,
gpu_from_host
,
gpu_alloc
,
GpuAlloc
,
GpuAllocEmpty
,
gpu_from_cuda
,
cuda_from_gpu
,
HostFromGpu
,
GpuContiguous
,
GpuFromHost
,
GpuReshape
,
gpu_join
,
GpuJoin
,
GpuSplit
,
GpuEye
,
gpu_contiguous
)
from
..subtensor
import
GpuSubtensor
from
theano.tests
import
unittest_tools
as
utt
utt
.
seed_rng
()
utt
.
seed_rng
()
rng
=
numpy
.
random
.
RandomState
(
seed
=
utt
.
fetch_seed
())
rng
=
numpy
.
random
.
RandomState
(
seed
=
utt
.
fetch_seed
())
from
pygpu
import
gpuarray
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
mode_with_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
including
(
'gpuarray'
)
.
excluding
(
'gpu'
)
mode_with_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
including
(
'gpuarray'
)
.
excluding
(
'gpu'
)
mode_without_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
excluding
(
'gpuarray'
)
mode_without_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
excluding
(
'gpuarray'
)
...
@@ -63,22 +61,6 @@ else:
...
@@ -63,22 +61,6 @@ else:
mode_without_gpu
=
theano
.
compile
.
mode
.
get_default_mode
()
.
excluding
(
'gpuarray'
)
mode_without_gpu
=
theano
.
compile
.
mode
.
get_default_mode
()
.
excluding
(
'gpuarray'
)
def may_fail(msg, EClass):
    """Decorate a test that needs very specific conditions to work.

    If the wrapped test raises an instance of `EClass`, the error is
    reported as a SkipTest carrying `msg` instead of a failure; any
    other exception propagates unchanged.

    Parameters
    ----------
    msg : str
        Message attached to the SkipTest when the failure is masked.
    EClass : exception class or tuple of classes
        Exception type(s) to mask as a skip.
    """
    from functools import wraps

    def test_decorator(f):
        # functools.wraps preserves __name__, __doc__ and __module__;
        # the previous manual copy of __name__ alone dropped the rest.
        @wraps(f)
        def wrapper():
            try:
                f()
            except Exception as e:
                if isinstance(e, EClass):
                    raise SkipTest(msg, e)
                raise
        return wrapper
    return test_decorator
def
inplace_func
(
inputs
,
outputs
,
mode
=
None
,
allow_input_downcast
=
False
,
def
inplace_func
(
inputs
,
outputs
,
mode
=
None
,
allow_input_downcast
=
False
,
on_unused_input
=
'raise'
,
name
=
None
):
on_unused_input
=
'raise'
,
name
=
None
):
if
mode
is
None
:
if
mode
is
None
:
...
@@ -183,9 +165,9 @@ def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu,
...
@@ -183,9 +165,9 @@ def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu,
else
:
else
:
err_msg
=
(
"Test
%
s::
%
s: exception raised during test "
err_msg
=
(
"Test
%
s::
%
s: exception raised during test "
"call was not the same as the reference "
"call was not the same as the reference "
"call (got:
%
s, expected
%
s)"
)
%
\
"call (got:
%
s, expected
%
s)"
%
(
self
.
gpu_op
,
testname
,
type
(
exc
),
(
self
.
gpu_op
,
testname
,
type
(
exc
),
type
(
ref_e
))
type
(
ref_e
))
)
exc
.
args
+=
(
err_msg
,)
exc
.
args
+=
(
err_msg
,)
raise
raise
...
@@ -197,9 +179,9 @@ def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu,
...
@@ -197,9 +179,9 @@ def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu,
expected
):
expected
):
self
.
fail
((
"Test
%
s::
%
s: Output
%
s gave the wrong "
self
.
fail
((
"Test
%
s::
%
s: Output
%
s gave the wrong "
"value. With inputs
%
s, expected
%
s "
"value. With inputs
%
s, expected
%
s "
"(dtype
%
s), got
%
s (dtype
%
s)."
)
%
(
"(dtype
%
s), got
%
s (dtype
%
s)."
%
self
.
op
,
testname
,
i
,
inputs
,
expected
,
(
self
.
op
,
testname
,
i
,
inputs
,
expected
,
expected
.
dtype
,
variable
,
variable
.
dtype
))
expected
.
dtype
,
variable
,
variable
.
dtype
)
))
for
description
,
check
in
iteritems
(
self
.
checks
):
for
description
,
check
in
iteritems
(
self
.
checks
):
if
not
check
(
inputs
,
variables
):
if
not
check
(
inputs
,
variables
):
...
@@ -250,36 +232,6 @@ def test_transfer_strided():
...
@@ -250,36 +232,6 @@ def test_transfer_strided():
assert
numpy
.
all
(
fv
==
av
)
assert
numpy
.
all
(
fv
==
av
)
@may_fail("Op fails if both contexts are not the same and it's rare "
          "that the tests will be run this way", ValueError)
def test_transfer_cuda_gpu():
    # Round-trip values between the old CUDA back-end (CudaNdarray) and
    # the new gpuarray back-end, in both directions, for both contiguous
    # arrays and strided views.
    import theano.sandbox.cuda as cuda_ndarray
    if cuda_ndarray.cuda_available is False:
        raise SkipTest("Can't test interaction with cuda if cuda not present")
    g = GpuArrayType(dtype='float32', broadcastable=(False, False))('g')
    c = cuda_ndarray.CudaNdarrayType((False, False))('c')
    av = theano._asarray(rng.rand(5, 4), dtype='float32')
    gv = gpuarray.array(av)
    cv = cuda_ndarray.CudaNdarray(av)
    # Step -2 column slices give non-contiguous views, exercising the
    # strided transfer path as well.
    gvs = gv[:, ::-2]
    cvs = cv[:, ::-2]
    # CUDA -> gpuarray direction.
    f = theano.function([c], gpu_from_cuda(c))
    fv = f(cv)
    assert GpuArrayType.values_eq_approx(fv, gv)
    fvs = f(cvs)
    assert GpuArrayType.values_eq_approx(fvs, gvs)
    # gpuarray -> CUDA direction.
    f = theano.function([g], cuda_from_gpu(g))
    fv = f(gv)
    assert cuda_ndarray.CudaNdarrayType.values_eq_approx(fv, cv)
    fvs = f(gvs)
    assert cuda_ndarray.CudaNdarrayType.values_eq_approx(fvs, cvs)
def
gpu_alloc_expected
(
x
,
*
shp
):
def
gpu_alloc_expected
(
x
,
*
shp
):
g
=
gpuarray
.
empty
(
shp
,
dtype
=
x
.
dtype
)
g
=
gpuarray
.
empty
(
shp
,
dtype
=
x
.
dtype
)
g
[:]
=
x
g
[:]
=
x
...
@@ -291,8 +243,8 @@ GpuAllocTester = makeTester(
...
@@ -291,8 +243,8 @@ GpuAllocTester = makeTester(
gpu_op
=
gpu_alloc
,
gpu_op
=
gpu_alloc
,
cases
=
dict
(
cases
=
dict
(
correct01
=
(
rand
(),
numpy
.
int32
(
7
)),
correct01
=
(
rand
(),
numpy
.
int32
(
7
)),
# just gives a DeepCopyOp with possibly wrong results on the CPU
# just gives a DeepCopyOp with possibly wrong results on the CPU
#
correct01_bcast=(rand(1), numpy.int32(7)),
#
correct01_bcast=(rand(1), numpy.int32(7)),
correct02
=
(
rand
(),
numpy
.
int32
(
4
),
numpy
.
int32
(
7
)),
correct02
=
(
rand
(),
numpy
.
int32
(
4
),
numpy
.
int32
(
7
)),
correct12
=
(
rand
(
7
),
numpy
.
int32
(
4
),
numpy
.
int32
(
7
)),
correct12
=
(
rand
(
7
),
numpy
.
int32
(
4
),
numpy
.
int32
(
7
)),
correct13
=
(
rand
(
7
),
numpy
.
int32
(
2
),
numpy
.
int32
(
4
),
correct13
=
(
rand
(
7
),
numpy
.
int32
(
2
),
numpy
.
int32
(
4
),
...
@@ -486,8 +438,6 @@ def test_hostfromgpu_shape_i():
...
@@ -486,8 +438,6 @@ def test_hostfromgpu_shape_i():
cv
=
gpuarray
.
asarray
(
numpy
.
random
.
rand
(
5
,
4
),
cv
=
gpuarray
.
asarray
(
numpy
.
random
.
rand
(
5
,
4
),
dtype
=
'float32'
)
dtype
=
'float32'
)
gpu_from_host
=
theano
.
sandbox
.
gpuarray
.
basic_ops
.
gpu_from_host
host_from_gpu
=
theano
.
sandbox
.
gpuarray
.
basic_ops
.
host_from_gpu
f
=
theano
.
function
([
a
],
gpu_from_host
(
a
),
mode
=
m
)
f
=
theano
.
function
([
a
],
gpu_from_host
(
a
),
mode
=
m
)
assert
gpu_from_host
in
[
x
.
op
assert
gpu_from_host
in
[
x
.
op
for
x
in
f
.
maker
.
fgraph
.
toposort
()]
for
x
in
f
.
maker
.
fgraph
.
toposort
()]
...
...
theano/sandbox/gpuarray/tests/test_blas.py
浏览文件 @
645557f9
...
@@ -6,8 +6,7 @@ import numpy
...
@@ -6,8 +6,7 @@ import numpy
import
theano
import
theano
from
theano
import
tensor
from
theano
import
tensor
from
theano.tests
import
unittest_tools
as
utt
from
theano.tests
import
unittest_tools
as
utt
from
theano.tensor.blas
import
(
gemv_inplace
,
gemm_inplace
,
ger_destructive
,
from
theano.tensor.blas
import
gemv_inplace
,
gemm_inplace
,
_dot22
_dot22
)
from
theano.tensor.tests.test_blas
import
TestGer
,
BaseGemv
from
theano.tensor.tests.test_blas
import
TestGer
,
BaseGemv
from
..
import
gpuarray_shared_constructor
from
..
import
gpuarray_shared_constructor
...
@@ -15,22 +14,22 @@ from .test_basic_ops import (makeTester, rand,
...
@@ -15,22 +14,22 @@ from .test_basic_ops import (makeTester, rand,
mode_with_gpu
)
mode_with_gpu
)
from
..blas
import
(
gpugemv_inplace
,
gpugemv_no_inplace
,
from
..blas
import
(
gpugemv_inplace
,
gpugemv_no_inplace
,
gpugemm_inplace
,
gpugemm_no_inplace
,
gpugemm_inplace
,
gpuger_inplace
,
gpuger_no_inplace
,
gpuger_inplace
,
gpuger_no_inplace
,
GpuGer
,
gpu_dot22
,
GpuGemm
)
GpuGer
,
gpu_dot22
,
GpuGemm
)
GpuGemvTester
=
makeTester
(
'GpuGemvTester'
,
GpuGemvTester
=
makeTester
(
'GpuGemvTester'
,
op
=
gemv_inplace
,
gpu_op
=
gpugemv_inplace
,
op
=
gemv_inplace
,
gpu_op
=
gpugemv_inplace
,
cases
=
dict
(
cases
=
dict
(
dot_vv
=
[
rand
(
1
),
1
,
rand
(
1
,
2
),
rand
(
2
),
0
],
dot_vv
=
[
rand
(
1
),
1
,
rand
(
1
,
2
),
rand
(
2
),
0
],
dot_vm
=
[
rand
(
3
),
1
,
rand
(
3
,
2
),
rand
(
2
),
0
],
dot_vm
=
[
rand
(
3
),
1
,
rand
(
3
,
2
),
rand
(
2
),
0
],
#
test_02=[rand(0), 1, rand(0, 2), rand(2), 0],
#
test_02=[rand(0), 1, rand(0, 2), rand(2), 0],
#
test_30=[rand(3), 1, rand(3, 0), rand(0), 0],
#
test_30=[rand(3), 1, rand(3, 0), rand(0), 0],
#
test_00=[rand(0), 1, rand(0, 0), rand(0), 0],
#
test_00=[rand(0), 1, rand(0, 0), rand(0), 0],
test_stride
=
[
rand
(
3
)[::
-
1
],
1
,
rand
(
3
,
2
)[::
-
1
],
rand
(
2
)[::
-
1
],
0
],
test_stride
=
[
rand
(
3
)[::
-
1
],
1
,
rand
(
3
,
2
)[::
-
1
],
rand
(
2
)[::
-
1
],
0
],
)
)
)
)
class
TestGpuSgemv
(
TestCase
,
BaseGemv
,
utt
.
TestOptimizationMixin
):
class
TestGpuSgemv
(
TestCase
,
BaseGemv
,
utt
.
TestOptimizationMixin
):
...
@@ -48,10 +47,10 @@ class TestGpuSgemv(TestCase, BaseGemv, utt.TestOptimizationMixin):
...
@@ -48,10 +47,10 @@ class TestGpuSgemv(TestCase, BaseGemv, utt.TestOptimizationMixin):
return
theano
.
shared
(
val
)
return
theano
.
shared
(
val
)
GpuGemmTester
=
makeTester
(
'GpuGemmTester'
,
GpuGemmTester
=
makeTester
(
'GpuGemmTester'
,
op
=
gemm_inplace
,
gpu_op
=
gpugemm_inplace
,
op
=
gemm_inplace
,
gpu_op
=
gpugemm_inplace
,
cases
=
dict
(
cases
=
dict
(
test1
=
[
rand
(
3
,
4
),
1.0
,
rand
(
3
,
5
),
rand
(
5
,
4
),
0.0
],
test1
=
[
rand
(
3
,
4
),
1.0
,
rand
(
3
,
5
),
rand
(
5
,
4
),
0.0
],
test2
=
[
rand
(
3
,
4
),
1.0
,
rand
(
3
,
5
),
rand
(
5
,
4
),
1.0
],
test2
=
[
rand
(
3
,
4
),
1.0
,
rand
(
3
,
5
),
rand
(
5
,
4
),
1.0
],
test3
=
[
rand
(
3
,
4
),
1.0
,
rand
(
3
,
5
),
rand
(
5
,
4
),
-
1.0
],
test3
=
[
rand
(
3
,
4
),
1.0
,
rand
(
3
,
5
),
rand
(
5
,
4
),
-
1.0
],
test4
=
[
rand
(
3
,
4
),
0.0
,
rand
(
3
,
5
),
rand
(
5
,
4
),
0.0
],
test4
=
[
rand
(
3
,
4
),
0.0
,
rand
(
3
,
5
),
rand
(
5
,
4
),
0.0
],
...
@@ -65,7 +64,7 @@ GpuGemmTester = makeTester('GpuGemmTester',
...
@@ -65,7 +64,7 @@ GpuGemmTester = makeTester('GpuGemmTester',
# test12=[rand(3, 4), -1.0, rand(3, 0), rand(0, 4), -1.1],
# test12=[rand(3, 4), -1.0, rand(3, 0), rand(0, 4), -1.1],
# test13=[rand(0, 0), -1.0, rand(0, 0), rand(0, 0), -1.1],
# test13=[rand(0, 0), -1.0, rand(0, 0), rand(0, 0), -1.1],
)
)
)
)
class
TestGpuSger
(
TestGer
):
class
TestGpuSger
(
TestGer
):
...
@@ -84,8 +83,10 @@ class TestGpuSger(TestGer):
...
@@ -84,8 +83,10 @@ class TestGpuSger(TestGer):
def
test_f32_0_0
(
self
):
def
test_f32_0_0
(
self
):
raise
SkipTest
(
'0-sized objects not supported'
)
raise
SkipTest
(
'0-sized objects not supported'
)
def
test_f32_1_0
(
self
):
def
test_f32_1_0
(
self
):
raise
SkipTest
(
'0-sized objects not supported'
)
raise
SkipTest
(
'0-sized objects not supported'
)
def
test_f32_0_1
(
self
):
def
test_f32_0_1
(
self
):
raise
SkipTest
(
'0-sized objects not supported'
)
raise
SkipTest
(
'0-sized objects not supported'
)
...
@@ -103,21 +104,22 @@ class TestGpuGer_OpContract(TestCase, utt.T_OpContractMixin):
...
@@ -103,21 +104,22 @@ class TestGpuGer_OpContract(TestCase, utt.T_OpContractMixin):
GpuDot22Tester
=
makeTester
(
GpuDot22Tester
=
makeTester
(
'Gpu
Gemm
Tester'
,
'Gpu
Dot22
Tester'
,
op
=
_dot22
,
gpu_op
=
gpu_dot22
,
op
=
_dot22
,
gpu_op
=
gpu_dot22
,
cases
=
dict
(
cases
=
dict
(
test1
=
[
rand
(
3
,
4
),
rand
(
4
,
5
)],
test1
=
[
rand
(
3
,
4
),
rand
(
4
,
5
)],
test2
=
[
rand
(
1
,
4
),
rand
(
4
,
5
)],
test2
=
[
rand
(
1
,
4
),
rand
(
4
,
5
)],
test3
=
[
rand
(
3
,
1
),
rand
(
1
,
5
)],
test3
=
[
rand
(
3
,
1
),
rand
(
1
,
5
)],
test4
=
[
rand
(
3
,
4
),
rand
(
4
,
1
)],
test4
=
[
rand
(
3
,
4
),
rand
(
4
,
1
)],
#
test5=[rand(0, 4), rand(4, 5)],
#
test5=[rand(0, 4), rand(4, 5)],
#
test6=[rand(3, 0), rand(0, 5)],
#
test6=[rand(3, 0), rand(0, 5)],
#
test7=[rand(3, 4), rand(4, 0)],
#
test7=[rand(3, 4), rand(4, 0)],
#
test8=[rand(0, 4), rand(4, 0)],
#
test8=[rand(0, 4), rand(4, 0)],
#
test9=[rand(0, 0), rand(0, 0)],
#
test9=[rand(0, 0), rand(0, 0)],
)
)
)
)
def
test_hgemm_swap
():
def
test_hgemm_swap
():
from
theano.sandbox.cuda
import
nvcc_compiler
from
theano.sandbox.cuda
import
nvcc_compiler
if
nvcc_compiler
.
nvcc_version
<
'7.5'
:
if
nvcc_compiler
.
nvcc_version
<
'7.5'
:
...
@@ -149,6 +151,7 @@ def test_hgemm_swap():
...
@@ -149,6 +151,7 @@ def test_hgemm_swap():
utt
.
assert_allclose
(
of
,
on
)
utt
.
assert_allclose
(
of
,
on
)
def
test_hgemm_alpha_output_merge
():
def
test_hgemm_alpha_output_merge
():
from
theano.sandbox.cuda
import
nvcc_compiler
from
theano.sandbox.cuda
import
nvcc_compiler
if
nvcc_compiler
.
nvcc_version
<
'7.5'
:
if
nvcc_compiler
.
nvcc_version
<
'7.5'
:
...
...
theano/sandbox/gpuarray/tests/test_conv_cuda_ndarray.py
浏览文件 @
645557f9
...
@@ -6,32 +6,31 @@ import sys
...
@@ -6,32 +6,31 @@ import sys
import
time
import
time
import
unittest
import
unittest
import
numpy
import
numpy
from
six.moves
import
xrange
from
six.moves
import
xrange
from
nose.plugins.skip
import
SkipTest
imported_scipy_convolve2d
=
False
try
:
from
scipy.signal
import
convolve2d
imported_scipy_convolve2d
=
True
except
ImportError
:
pass
import
theano
import
theano
from
theano
import
tensor
from
theano
import
tensor
from
theano.tests.unittest_tools
import
seed_rng
from
theano.tests.unittest_tools
import
seed_rng
# We let that import do the init of the back-end if needed.
# We let that import do the init of the back-end if needed.
from
.test_basic_ops
import
(
mode_with_gpu
,
from
.test_basic_ops
import
mode_with_gpu
mode_without_gpu
)
from
..type
import
GpuArrayType
from
..type
import
GpuArrayType
from
..conv
import
GpuConv
from
..conv
import
GpuConv
from
theano.sandbox.gpuarray
import
dnn
from
theano.sandbox.gpuarray
import
dnn
import
pygpu
import
pygpu
imported_scipy_convolve2d
=
False
try
:
from
scipy.signal
import
convolve2d
imported_scipy_convolve2d
=
True
except
ImportError
:
pass
gftensor4
=
GpuArrayType
(
'float32'
,
[
False
]
*
4
)
gftensor4
=
GpuArrayType
(
'float32'
,
[
False
]
*
4
)
def
py_conv_valid_numpy
(
img
,
kern
):
def
py_conv_valid_numpy
(
img
,
kern
):
assert
img
.
shape
[
1
]
==
kern
.
shape
[
1
]
assert
img
.
shape
[
1
]
==
kern
.
shape
[
1
]
outshp
=
(
img
.
shape
[
0
],
kern
.
shape
[
0
],
outshp
=
(
img
.
shape
[
0
],
kern
.
shape
[
0
],
...
@@ -191,15 +190,17 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
...
@@ -191,15 +190,17 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
cpu_mflops
=
approx_fp
/
(
t1
-
t0
)
cpu_mflops
=
approx_fp
/
(
t1
-
t0
)
gpu_mflops
=
approx_fp
/
(
t2
-
t1
)
gpu_mflops
=
approx_fp
/
(
t2
-
t1
)
if
verbose
>
0
:
if
verbose
>
0
:
print
(
'
%15
s'
%
str
(
ishape
),
'
%15
s'
%
str
(
kshape
),
end
=
' '
,
file
=
sys
.
stdout
)
print
(
'
%15
s'
%
str
(
ishape
),
'
%15
s'
%
str
(
kshape
),
end
=
' '
,
print
(
'
%12.5
f
%7.2
f
%7.2
f
%7.1
f'
%
(
approx_fp
,
file
=
sys
.
stdout
)
cpu_mflops
,
gpu_mflops
,
(
t1
-
t0
)
/
(
t2
-
t1
)),
file
=
sys
.
stdout
)
print
(
'
%12.5
f
%7.2
f
%7.2
f
%7.1
f'
%
(
approx_fp
,
cpu_mflops
,
gpu_mflops
,
(
t1
-
t0
)
/
(
t2
-
t1
)),
file
=
sys
.
stdout
)
if
not
rval
:
if
not
rval
:
print
(
(
'test_'
+
mode
+
' id='
+
str
(
id
)
+
print
(
'test_'
+
mode
+
' id='
+
str
(
id
)
+
' FAILED for ishape, kshape, mode, subsample,'
+
' FAILED for ishape, kshape, mode, subsample,'
+
' img_stride, kern_stride, version'
,
ishape
,
' img_stride, kern_stride, version'
,
ishape
,
kshape
,
mode
,
subsample
,
img_stride
,
kern_stride
,
kshape
,
mode
,
subsample
,
img_stride
,
kern_stride
,
version
)
,
file
=
sys
.
stdout
)
version
,
file
=
sys
.
stdout
)
diff
=
cpuval
-
gpuval
diff
=
cpuval
-
gpuval
diffabs
=
numpy
.
absolute
(
diff
)
diffabs
=
numpy
.
absolute
(
diff
)
pr_diff
=
diffabs
/
numpy
.
absolute
(
cpuval
)
pr_diff
=
diffabs
/
numpy
.
absolute
(
cpuval
)
...
@@ -210,7 +211,7 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
...
@@ -210,7 +211,7 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
nb_close
,
"/"
,
diff
.
size
))
nb_close
,
"/"
,
diff
.
size
))
print
(
"max relatif diff:"
,
(
pr_diff
.
max
(),
"avg rel diff:"
,
print
(
"max relatif diff:"
,
(
pr_diff
.
max
(),
"avg rel diff:"
,
numpy
.
average
(
pr_diff
)))
numpy
.
average
(
pr_diff
)))
if
not
rval
and
print_
!=
False
:
if
not
rval
and
print_
is
not
False
:
if
npy_img
.
shape
[
0
]
>
5
:
if
npy_img
.
shape
[
0
]
>
5
:
print
(
"img"
,
npy_img
[
0
])
print
(
"img"
,
npy_img
[
0
])
print
(
"kern"
,
npy_kern
[
0
])
print
(
"kern"
,
npy_kern
[
0
])
...
@@ -242,7 +243,8 @@ def exec_conv(version, shapes, verbose, random, mode,
...
@@ -242,7 +243,8 @@ def exec_conv(version, shapes, verbose, random, mode,
istride
,
kstride
)
in
enumerate
(
shapes
):
istride
,
kstride
)
in
enumerate
(
shapes
):
ret
=
False
ret
=
False
try
:
try
:
ret
=
_params_allgood
(
ishape
,
ret
=
_params_allgood
(
ishape
,
kshape
,
kshape
,
mode
,
mode
,
subsample
=
subshape
,
subsample
=
subshape
,
...
@@ -297,15 +299,15 @@ def get_shapes(imshp=(1, 1), kshp=(1, 1), subsample=(1, 1),
...
@@ -297,15 +299,15 @@ def get_shapes(imshp=(1, 1), kshp=(1, 1), subsample=(1, 1),
((
3
,
1
)
+
imshp
,
(
1
,
1
)
+
kshp
,
subsample
,
img_stride
,
kern_stride
),
((
3
,
1
)
+
imshp
,
(
1
,
1
)
+
kshp
,
subsample
,
img_stride
,
kern_stride
),
# nkern only
# nkern only
((
1
,
1
)
+
imshp
,
(
2
,
1
)
+
kshp
,
subsample
,
img_stride
,
kern_stride
),
((
1
,
1
)
+
imshp
,
(
2
,
1
)
+
kshp
,
subsample
,
img_stride
,
kern_stride
),
#batch and nkern
#
batch and nkern
((
3
,
1
)
+
imshp
,
(
2
,
1
)
+
kshp
,
subsample
,
img_stride
,
kern_stride
),
((
3
,
1
)
+
imshp
,
(
2
,
1
)
+
kshp
,
subsample
,
img_stride
,
kern_stride
),
#batch and stack
#
batch and stack
((
3
,
2
)
+
imshp
,
(
1
,
2
)
+
kshp
,
subsample
,
img_stride
,
kern_stride
),
((
3
,
2
)
+
imshp
,
(
1
,
2
)
+
kshp
,
subsample
,
img_stride
,
kern_stride
),
#stack and nkern
#
stack and nkern
((
1
,
2
)
+
imshp
,
(
2
,
2
)
+
kshp
,
subsample
,
img_stride
,
kern_stride
),
((
1
,
2
)
+
imshp
,
(
2
,
2
)
+
kshp
,
subsample
,
img_stride
,
kern_stride
),
#batch, nkern and stack
#
batch, nkern and stack
((
2
,
2
)
+
imshp
,
(
2
,
2
)
+
kshp
,
subsample
,
img_stride
,
kern_stride
),
((
2
,
2
)
+
imshp
,
(
2
,
2
)
+
kshp
,
subsample
,
img_stride
,
kern_stride
),
#batch, nkern and stack
#
batch, nkern and stack
((
3
,
2
)
+
imshp
,
(
4
,
2
)
+
kshp
,
subsample
,
img_stride
,
kern_stride
)
((
3
,
2
)
+
imshp
,
(
4
,
2
)
+
kshp
,
subsample
,
img_stride
,
kern_stride
)
]
]
...
@@ -344,7 +346,6 @@ def get_shapes2(scales_img=(1, 1), scales_kern=(1, 1), subsample=(1, 1),
...
@@ -344,7 +346,6 @@ def get_shapes2(scales_img=(1, 1), scales_kern=(1, 1), subsample=(1, 1),
def
get_valid_shapes
():
def
get_valid_shapes
():
# img shape, kern shape, subsample shape
# img shape, kern shape, subsample shape
shapes
=
get_basic_shapes
()
shapes
=
get_basic_shapes
()
...
@@ -361,37 +362,34 @@ def get_valid_shapes():
...
@@ -361,37 +362,34 @@ def get_valid_shapes():
shapes
+=
[
shapes
+=
[
# other test
# other test
((
2
,
1
,
2
,
2
),
(
1
,
1
,
2
,
2
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
((
2
,
1
,
2
,
2
),
(
1
,
1
,
2
,
2
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
,
((
3
,
2
,
4
,
4
),
(
4
,
2
,
4
,
4
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
((
3
,
2
,
4
,
4
),
(
4
,
2
,
4
,
4
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
,
((
4
,
1
,
10
,
10
),
(
1
,
1
,
2
,
2
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
((
4
,
1
,
10
,
10
),
(
1
,
1
,
2
,
2
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
,
((
1
,
1
,
4
,
4
),
(
1
,
1
,
2
,
3
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
((
1
,
1
,
4
,
4
),
(
1
,
1
,
2
,
3
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
,
((
4
,
1
,
10
,
10
),
(
1
,
1
,
2
,
3
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
((
4
,
1
,
10
,
10
),
(
1
,
1
,
2
,
3
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
,
((
4
,
1
,
10
,
10
),
(
1
,
1
,
2
,
10
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
((
4
,
1
,
10
,
10
),
(
1
,
1
,
2
,
10
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
,
((
4
,
1
,
20
,
10
),
(
1
,
1
,
2
,
10
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
((
4
,
1
,
20
,
10
),
(
1
,
1
,
2
,
10
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
,
((
3
,
2
,
8
,
8
),
(
4
,
2
,
4
,
4
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# stack, nkern, bsize
((
3
,
2
,
8
,
8
),
(
4
,
2
,
4
,
4
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# stack, nkern, bsize
,
((
3
,
2
,
8
,
6
),
(
4
,
2
,
4
,
4
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# stack, nkern, bsize, non-square image
((
3
,
2
,
8
,
6
),
(
4
,
2
,
4
,
4
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# stack, nkern, bsize, non-square image
,
((
3
,
2
,
8
,
6
),
(
4
,
2
,
4
,
3
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# stack, nkern, bsize, non-square image, non-square kern
((
3
,
2
,
8
,
6
),
(
4
,
2
,
4
,
3
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# stack, nkern, bsize, non-square image, non-square kern
,
((
3
,
2
,
8
,
6
),
(
4
,
2
,
4
,
6
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# stack, nkern, bsize ,non-square image, non-square kern, kernsize==imgsize on one dim
((
3
,
2
,
8
,
6
),
(
4
,
2
,
4
,
6
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# stack, nkern, bsize ,non-square image, non-square kern, kernsize==imgsize on one dim
,
((
16
,
5
,
64
,
64
),
(
8
,
5
,
8
,
8
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# a big one
((
16
,
5
,
64
,
64
),
(
8
,
5
,
8
,
8
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# a big one
,
((
16
,
1
,
28
,
28
),
(
20
,
1
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# MNIST LeNET layer 1
((
16
,
1
,
28
,
28
),
(
20
,
1
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# MNIST LeNET layer 1
,
((
20
,
16
,
32
,
32
),
(
1
,
16
,
28
,
28
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# layer 1 backprop to weights
((
20
,
16
,
32
,
32
),
(
1
,
16
,
28
,
28
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# layer 1 backprop to weights
,
((
60
,
20
,
28
,
28
),
(
10
,
20
,
5
,
5
),
(
1
,
1
),
(
2
,
2
),
(
1
,
1
))
# added a test case that fail from test_nnet.py.test_conv_nnet2
((
60
,
20
,
28
,
28
),
(
10
,
20
,
5
,
5
),
(
1
,
1
),
(
2
,
2
),
(
1
,
1
)),
# added a test case that fail from test_nnet.py.test_conv_nnet2
,
((
10
,
5
,
28
,
28
),
(
10
,
5
,
5
,
5
),
(
1
,
1
),
(
2
,
2
),
(
1
,
1
))
# test precedent but reduced that triger the error
((
10
,
5
,
28
,
28
),
(
10
,
5
,
5
,
5
),
(
1
,
1
),
(
2
,
2
),
(
1
,
1
)),
# test precedent but reduced that triger the error
# Test more than maxThreadsDim0
# Test more than maxThreadsDim0
,
((
2
,
4
,
13
,
1050
),
(
3
,
4
,
10
,
11
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
((
2
,
4
,
13
,
1050
),
(
3
,
4
,
10
,
11
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
,
((
2
,
4
,
1050
,
13
),
(
3
,
4
,
10
,
11
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
((
2
,
4
,
1050
,
13
),
(
3
,
4
,
10
,
11
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
]
]
shapes
+=
[
((
60
,
1
,
28
,
28
),
(
20
,
1
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# test_lenet_28 1 layers
shapes
+=
[((
60
,
1
,
28
,
28
),
(
20
,
1
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# test_lenet_28 1 layers
,
((
60
,
20
,
12
,
12
),
(
30
,
20
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# test_lenet_28 2 layers
((
60
,
20
,
12
,
12
),
(
30
,
20
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# test_lenet_28 2 layers
,
((
60
,
30
,
8
,
8
),
(
20
,
30
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# test_lenet_28 bprop 1 full
((
60
,
30
,
8
,
8
),
(
20
,
30
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# test_lenet_28 bprop 1 full
,
((
20
,
60
,
12
,
12
),
(
30
,
60
,
8
,
8
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# test_lenet_28 bprop 2 valid
((
20
,
60
,
12
,
12
),
(
30
,
60
,
8
,
8
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# test_lenet_28 bprop 2 valid
# , ((1,60,28,28),(20,60,24,24), (1, 1), (1, 1), (1, 1))#test_lenet_28 bprop 2 valid
((
10
,
1
,
64
,
64
),
(
20
,
1
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# test_lenet_64 1 layers
,
((
10
,
1
,
64
,
64
),
(
20
,
1
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# test_lenet_64 1 layers
((
10
,
20
,
29
,
29
),
(
30
,
20
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# test_lenet_64 2 layers
,
((
10
,
20
,
29
,
29
),
(
30
,
20
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# test_lenet_64 2 layers
((
10
,
30
,
23
,
23
),
(
20
,
30
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# test_lenet_64 full
,
((
10
,
30
,
23
,
23
),
(
20
,
30
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# test_lenet_64 full
# , ((20,10,29,29),(30,10,23,23), (1, 1), (1, 1), (1, 1))#test_lenet_64 bprop 1
# , ((1,10,64,64),(20,10,58,58), (1, 1), (1, 1), (1, 1))#test_lenet_64 bprop 2
]
]
return
shapes
return
shapes
...
@@ -428,42 +426,34 @@ def test_full():
...
@@ -428,42 +426,34 @@ def test_full():
shapes
+=
[
shapes
+=
[
# other test
# other test
((
2
,
1
,
2
,
2
),
(
1
,
1
,
2
,
2
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
((
2
,
1
,
2
,
2
),
(
1
,
1
,
2
,
2
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
,
((
3
,
2
,
4
,
4
),
(
4
,
2
,
4
,
4
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
((
3
,
2
,
4
,
4
),
(
4
,
2
,
4
,
4
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
,
((
4
,
1
,
10
,
10
),
(
1
,
1
,
2
,
2
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
((
4
,
1
,
10
,
10
),
(
1
,
1
,
2
,
2
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
,
((
1
,
1
,
4
,
4
),
(
1
,
1
,
2
,
3
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
((
1
,
1
,
4
,
4
),
(
1
,
1
,
2
,
3
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
,
((
4
,
1
,
10
,
10
),
(
1
,
1
,
2
,
3
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
((
4
,
1
,
10
,
10
),
(
1
,
1
,
2
,
3
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
,
((
4
,
1
,
10
,
10
),
(
1
,
1
,
2
,
10
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
((
4
,
1
,
10
,
10
),
(
1
,
1
,
2
,
10
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
,
((
4
,
1
,
20
,
10
),
(
1
,
1
,
2
,
10
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
((
4
,
1
,
20
,
10
),
(
1
,
1
,
2
,
10
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
,
((
3
,
2
,
8
,
8
),
(
4
,
2
,
4
,
4
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# stack, nkern, bsize
((
3
,
2
,
8
,
8
),
(
4
,
2
,
4
,
4
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# stack, nkern, bsize
,
((
3
,
2
,
8
,
6
),
(
4
,
2
,
4
,
4
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# stack, nkern, bsize, non-square image
((
3
,
2
,
8
,
6
),
(
4
,
2
,
4
,
4
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# stack, nkern, bsize, non-square image
,
((
3
,
2
,
8
,
6
),
(
4
,
2
,
4
,
3
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# stack, nkern, bsize, non-square image, non-square kern
((
3
,
2
,
8
,
6
),
(
4
,
2
,
4
,
3
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# stack, nkern, bsize, non-square image, non-square kern
,
((
3
,
2
,
8
,
6
),
(
4
,
2
,
4
,
6
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# stack, nkern, bsize ,non-square image, non-square kern, kernsize==imgsize on one dim
((
3
,
2
,
8
,
6
),
(
4
,
2
,
4
,
6
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# stack, nkern, bsize ,non-square image, non-square kern, kernsize==imgsize on one dim
,
((
16
,
5
,
64
,
64
),
(
8
,
5
,
8
,
8
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# a big one
((
16
,
5
,
64
,
64
),
(
8
,
5
,
8
,
8
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# a big one
,
((
16
,
1
,
28
,
28
),
(
20
,
1
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# MNIST LeNET layer 1
((
16
,
1
,
28
,
28
),
(
20
,
1
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# MNIST LeNET layer 1
,
((
20
,
16
,
32
,
32
),
(
1
,
16
,
28
,
28
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# layer 1 backprop to weights
((
20
,
16
,
32
,
32
),
(
1
,
16
,
28
,
28
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# layer 1 backprop to weights
# other test
# other test
,
((
3
,
1
,
1
,
1
),
(
2
,
1
,
5
,
3
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# kernel bigger then image
((
3
,
1
,
1
,
1
),
(
2
,
1
,
5
,
3
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# kernel bigger then image
,
((
3
,
2
,
1
,
1
),
(
4
,
2
,
1
,
1
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
((
3
,
2
,
1
,
1
),
(
4
,
2
,
1
,
1
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
,
((
3
,
2
,
4
,
4
),
(
4
,
2
,
2
,
6
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
((
3
,
2
,
4
,
4
),
(
4
,
2
,
2
,
6
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
,
((
3
,
2
,
4
,
4
),
(
4
,
2
,
8
,
6
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# kernel bigger then image
((
3
,
2
,
4
,
4
),
(
4
,
2
,
8
,
6
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# kernel bigger then image
,
((
4
,
2
,
10
,
10
),
(
3
,
2
,
2
,
12
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
((
4
,
2
,
10
,
10
),
(
3
,
2
,
2
,
12
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
]
]
shapes
+=
[
shapes
+=
[
# ((60,1,28,28),(20,1,5,5), (1, 1), (1, 1), (1, 1))#test_lenet_28 1 layers
((
60
,
30
,
8
,
8
),
(
20
,
30
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# test_lenet_28 bprop 1 full
# , ((60,20,12,12),(30,20,5,5), (1, 1), (1, 1), (1, 1))#test_lenet_28 2 layers
((
10
,
30
,
23
,
23
),
(
20
,
30
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# test_lenet_64 full
((
60
,
30
,
8
,
8
),
(
20
,
30
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# test_lenet_28 bprop 1 full
# , ((20,60,12,12),(30,60,8,8), (1, 1), (1, 1), (1, 1))#test_lenet_28 bprop 2 valid
# , ((1,60,28,28),(20,60,24,24), (1, 1), (1, 1), (1, 1))#test_lenet_28 bprop 2 valid
# , ((10,1,64,64),(20,1,7,7), (1, 1), (1, 1), (1, 1))#test_lenet_64 1 layers
# , ((10,20,29,29),(30,20,7,7), (1, 1), (1, 1), (1, 1))#test_lenet_64 2 layers
,
((
10
,
30
,
23
,
23
),
(
20
,
30
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# test_lenet_64 full
# , ((20,10,29,29),(30,10,23,23), (1, 1), (1, 1), (1, 1))#test_lenet_64 bprop 1
# , ((1,10,64,64),(20,10,58,58), (1, 1), (1, 1), (1, 1))#test_lenet_64 bprop 2
# Test more than maxThreadsDim0
# Test more than maxThreadsDim0
,
((
2
,
4
,
13
,
1050
),
(
3
,
4
,
10
,
11
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
((
2
,
4
,
13
,
1050
),
(
3
,
4
,
10
,
11
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
,
((
2
,
4
,
1050
,
13
),
(
3
,
4
,
10
,
11
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
((
2
,
4
,
1050
,
13
),
(
3
,
4
,
10
,
11
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
]
]
version
=
[
-
1
]
version
=
[
-
1
]
...
@@ -562,7 +552,6 @@ class TestConv2DGPU(unittest.TestCase):
...
@@ -562,7 +552,6 @@ class TestConv2DGPU(unittest.TestCase):
for
mode
in
[
'valid'
,
'full'
]:
for
mode
in
[
'valid'
,
'full'
]:
for
shapes
in
[((
3
,
2
,
8
,
8
),
(
4
,
2
,
5
,
5
),
(
8
,
8
)),
for
shapes
in
[((
3
,
2
,
8
,
8
),
(
4
,
2
,
5
,
5
),
(
8
,
8
)),
((
3
,
2
,
8
,
8
),
(
4
,
2
,
5
,
5
),
(
5
,
8
)),
((
3
,
2
,
8
,
8
),
(
4
,
2
,
5
,
5
),
(
5
,
8
)),
#((3, 2, 8, 8), (4, 2, 5, 5), (8, 5)),
# We use only the number of columns.
# We use only the number of columns.
]:
]:
...
@@ -580,47 +569,45 @@ def benchmark():
...
@@ -580,47 +569,45 @@ def benchmark():
shapes_valid
=
[
shapes_valid
=
[
# test_lenet_28 shape
# test_lenet_28 shape
((
20
,
60
,
12
,
12
),
(
30
,
60
,
8
,
8
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
((
20
,
60
,
12
,
12
),
(
30
,
60
,
8
,
8
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
,
# valid
,
((
60
,
20
,
12
,
12
),
(
30
,
20
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
((
60
,
20
,
12
,
12
),
(
30
,
20
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
,
((
60
,
1
,
28
,
28
),
(
20
,
1
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
((
60
,
1
,
28
,
28
),
(
20
,
1
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
,
((
1
,
60
,
28
,
28
),
(
20
,
60
,
24
,
24
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
((
1
,
60
,
28
,
28
),
(
20
,
60
,
24
,
24
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
# test_lenet_32 shape
# test_lenet_32 shape
,
((
20
,
60
,
14
,
14
),
(
30
,
60
,
10
,
10
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
((
20
,
60
,
14
,
14
),
(
30
,
60
,
10
,
10
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
,
((
60
,
20
,
14
,
14
),
(
30
,
20
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
((
60
,
20
,
14
,
14
),
(
30
,
20
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
,
((
60
,
1
,
32
,
32
),
(
20
,
1
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
((
60
,
1
,
32
,
32
),
(
20
,
1
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
,
((
1
,
60
,
32
,
32
),
(
20
,
60
,
28
,
28
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
((
1
,
60
,
32
,
32
),
(
20
,
60
,
28
,
28
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
# test_lenet_64 shape
# test_lenet_64 shape
,
((
10
,
20
,
29
,
29
),
(
30
,
20
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
((
10
,
20
,
29
,
29
),
(
30
,
20
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
,
((
20
,
10
,
29
,
29
),
(
30
,
10
,
23
,
23
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
((
20
,
10
,
29
,
29
),
(
30
,
10
,
23
,
23
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
,
((
10
,
1
,
64
,
64
),
(
20
,
1
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
((
10
,
1
,
64
,
64
),
(
20
,
1
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
,
((
1
,
10
,
64
,
64
),
(
20
,
10
,
58
,
58
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
((
1
,
10
,
64
,
64
),
(
20
,
10
,
58
,
58
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
# test_lenet_108 shape
# test_lenet_108 shape
,
((
10
,
20
,
51
,
51
),
(
30
,
20
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
((
10
,
20
,
51
,
51
),
(
30
,
20
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
,
((
20
,
10
,
51
,
51
),
(
30
,
10
,
45
,
45
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
((
20
,
10
,
51
,
51
),
(
30
,
10
,
45
,
45
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
,
((
10
,
1
,
108
,
108
),
(
20
,
1
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
((
10
,
1
,
108
,
108
),
(
20
,
1
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
,
((
1
,
10
,
108
,
108
),
(
20
,
10
,
102
,
102
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
((
1
,
10
,
108
,
108
),
(
20
,
10
,
102
,
102
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
# test_lenet_256 shape
# test_lenet_256 shape
,
((
2
,
20
,
124
,
124
),
(
30
,
20
,
9
,
9
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
((
2
,
20
,
124
,
124
),
(
30
,
20
,
9
,
9
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
,
((
20
,
2
,
124
,
124
),
(
30
,
2
,
116
,
116
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
((
20
,
2
,
124
,
124
),
(
30
,
2
,
116
,
116
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
,
((
2
,
1
,
256
,
256
),
(
20
,
1
,
9
,
9
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
((
2
,
1
,
256
,
256
),
(
20
,
1
,
9
,
9
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
,
((
1
,
2
,
256
,
256
),
(
20
,
2
,
248
,
248
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
((
1
,
2
,
256
,
256
),
(
20
,
2
,
248
,
248
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
]
]
shapes_full
=
[
shapes_full
=
[
# test_lenet_28 shape
# test_lenet_28 shape
((
60
,
30
,
8
,
8
),
(
20
,
30
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# full
((
60
,
30
,
8
,
8
),
(
20
,
30
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# full
# test_lenet_32 shape
# test_lenet_32 shape
,
((
60
,
30
,
10
,
10
),
(
20
,
30
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# full conv_full_patch_stack_padded' N=1
((
60
,
30
,
10
,
10
),
(
20
,
30
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# full conv_full_patch_stack_padded' N=1
# test_lenet_64 shape
# test_lenet_64 shape
,
((
10
,
30
,
23
,
23
),
(
20
,
30
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# full conv_full_patch_stack_padded' N=3
((
10
,
30
,
23
,
23
),
(
20
,
30
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# full conv_full_patch_stack_padded' N=3
# test_lenet_108 shape
# test_lenet_108 shape
,
((
10
,
30
,
45
,
45
),
(
20
,
30
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# full 'conv_full_patch_stack_padded' N=9
((
10
,
30
,
45
,
45
),
(
20
,
30
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# full 'conv_full_patch_stack_padded' N=9
# test_lenet_256 shape
# test_lenet_256 shape
,
((
2
,
30
,
116
,
116
),
(
20
,
30
,
9
,
9
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# full conv_reference_full
((
2
,
30
,
116
,
116
),
(
20
,
30
,
9
,
9
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# full conv_reference_full
]
]
# shapes_valid=shapes_valid[-1:]
# shapes_full=shapes_full[-1:]
version
=
[
-
1
]
version
=
[
-
1
]
verbose
=
1
verbose
=
1
random
=
True
random
=
True
...
...
theano/sandbox/gpuarray/tests/test_neighbours.py
浏览文件 @
645557f9
import
unittest
from
theano.tensor.nnet.tests
import
test_neighbours
from
theano.tensor.nnet.tests
import
test_neighbours
# We let that import do the init of the back-end if needed.
# We let that import do the init of the back-end if needed.
from
.test_basic_ops
import
(
mode_with_gpu
,
from
.test_basic_ops
import
mode_with_gpu
mode_without_gpu
)
from
..neighbours
import
GpuImages2Neibs
from
..neighbours
import
GpuImages2Neibs
...
...
theano/sandbox/gpuarray/tests/test_nnet.py
浏览文件 @
645557f9
from
__future__
import
print_function
from
__future__
import
print_function
from
nose.plugins.skip
import
SkipTest
import
numpy
import
numpy
import
unittest
import
unittest
...
@@ -7,8 +7,6 @@ import theano
...
@@ -7,8 +7,6 @@ import theano
import
theano.tensor
as
T
import
theano.tensor
as
T
import
theano.tests.unittest_tools
as
utt
import
theano.tests.unittest_tools
as
utt
from
theano.sandbox
import
gpuarray
# We let that import do the init of the back-end if needed.
# We let that import do the init of the back-end if needed.
from
.test_basic_ops
import
(
mode_with_gpu
,
from
.test_basic_ops
import
(
mode_with_gpu
,
mode_without_gpu
)
mode_without_gpu
)
...
@@ -36,15 +34,13 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
...
@@ -36,15 +34,13 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
n_in
=
4098
n_in
=
4098
n_out
=
4099
n_out
=
4099
x
=
T
.
fmatrix
(
'x'
)
y
=
T
.
lvector
(
'y'
)
y
=
T
.
lvector
(
'y'
)
b
=
T
.
fvector
(
'b'
)
b
=
T
.
fvector
(
'b'
)
#W = T.fmatrix('W')
# we precompute the dot with big shape before to allow the test of
# we precompute the dot with big shape before to allow the test of
# GpuCrossentropySoftmax1HotWithBiasDx to don't fail with the error
# GpuCrossentropySoftmax1HotWithBiasDx to don't fail with the error
#(the launch timed out and was terminated) on GPU card not
#
(the launch timed out and was terminated) on GPU card not
# powerful enough. We need the big shape to check for corner
# powerful enough. We need the big shape to check for corner
# case.
# case.
dot_result
=
T
.
fmatrix
(
'dot_result'
)
dot_result
=
T
.
fmatrix
(
'dot_result'
)
...
@@ -54,7 +50,6 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
...
@@ -54,7 +50,6 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
xx
=
numpy
.
asarray
(
numpy
.
random
.
rand
(
batch_size
,
n_in
),
xx
=
numpy
.
asarray
(
numpy
.
random
.
rand
(
batch_size
,
n_in
),
dtype
=
numpy
.
float32
)
dtype
=
numpy
.
float32
)
#?????yy = numpy.ones((batch_size,),dtype='float32')
yy
=
numpy
.
ones
((
batch_size
,),
dtype
=
'int32'
)
yy
=
numpy
.
ones
((
batch_size
,),
dtype
=
'int32'
)
b_values
=
numpy
.
zeros
((
n_out
,),
dtype
=
'float32'
)
b_values
=
numpy
.
zeros
((
n_out
,),
dtype
=
'float32'
)
W_values
=
numpy
.
asarray
(
numpy
.
random
.
rand
(
n_in
,
n_out
),
dtype
=
'float32'
)
W_values
=
numpy
.
asarray
(
numpy
.
random
.
rand
(
n_in
,
n_out
),
dtype
=
'float32'
)
...
@@ -71,8 +66,6 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
...
@@ -71,8 +66,6 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
classify_gpu
=
theano
.
function
(
inputs
=
[
y
,
b
,
dot_result
],
classify_gpu
=
theano
.
function
(
inputs
=
[
y
,
b
,
dot_result
],
outputs
=
[
loss
,
y_pred
,
dW
],
outputs
=
[
loss
,
y_pred
,
dW
],
mode
=
mode_with_gpu
)
mode
=
mode_with_gpu
)
# theano.printing.debugprint(classify)
# theano.printing.debugprint(classify_gpu)
assert
any
([
isinstance
(
node
.
op
,
assert
any
([
isinstance
(
node
.
op
,
T
.
nnet
.
CrossentropySoftmaxArgmax1HotWithBias
)
T
.
nnet
.
CrossentropySoftmaxArgmax1HotWithBias
)
...
@@ -97,12 +90,10 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx():
...
@@ -97,12 +90,10 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx():
We check that we loop when their is too much threads
We check that we loop when their is too much threads
"""
"""
n_in
=
1000
batch_size
=
4097
batch_size
=
4097
n_out
=
1250
n_out
=
1250
if
not
isinstance
(
mode_with_gpu
,
theano
.
compile
.
DebugMode
):
if
not
isinstance
(
mode_with_gpu
,
theano
.
compile
.
DebugMode
):
n_in
=
4098
n_out
=
4099
n_out
=
4099
# Seed numpy.random with config.unittests.rseed
# Seed numpy.random with config.unittests.rseed
...
@@ -137,25 +128,7 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx():
...
@@ -137,25 +128,7 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx():
rtol
=
1e-5
rtol
=
1e-5
atol
=
1e-6
atol
=
1e-6
if
not
numpy
.
allclose
(
cpu_out
,
gpu_out
,
rtol
=
rtol
,
atol
=
atol
):
utt
.
assert_allclose
(
cpu_out
,
gpu_out
,
rtol
=
rtol
,
atol
=
atol
)
abs_err
,
rel_err
=
T
.
numeric_grad
.
abs_rel_err
(
cpu_out
,
gpu_out
)
scaled_err
=
numpy
.
minimum
(
abs_err
/
atol
,
rel_err
/
rtol
)
max_i
=
scaled_err
.
argmax
()
print
(
'max err index:'
,
max_i
,
max_i
/
batch_size
,
end
=
' '
)
print
(
max_i
%
batch_size
,
max_i
/
n_out
,
max_i
&
n_out
)
print
(
'At that index:'
)
print
(
'err:'
,
scaled_err
.
flatten
()[
max_i
])
print
(
'absolute error:'
,
abs_err
.
flatten
()[
max_i
])
print
(
'relative error:'
,
rel_err
.
flatten
()[
max_i
])
print
(
'cpu_out:'
,
cpu_out
.
flatten
()[
max_i
])
print
(
'gpu_out:'
,
gpu_out
.
flatten
()[
max_i
])
print
(
'softmax_output_value:'
,
softmax_output_value
.
flatten
()[
max_i
])
print
(
'dnll_value:'
,
dnll_value
[
max_i
/
n_out
])
print
(
'y_idx_value:'
,
y_idx_value
[
max_i
/
n_out
])
assert
False
,
"numpy.allclose(cpu_out, gpu_out, rtol=
%
s, atol=
%
s)"
%
(
rtol
,
atol
)
def
test_softmax_with_bias_float16
():
def
test_softmax_with_bias_float16
():
...
@@ -166,6 +139,7 @@ def test_softmax_with_bias_float16():
...
@@ -166,6 +139,7 @@ def test_softmax_with_bias_float16():
softmax_with_bias_unittest_template
(
dtypeInput
=
'float32'
,
softmax_with_bias_unittest_template
(
dtypeInput
=
'float32'
,
dtypeBias
=
'float16'
)
dtypeBias
=
'float16'
)
def
test_softmax_with_bias_float32
():
def
test_softmax_with_bias_float32
():
softmax_with_bias_unittest_template
(
dtypeInput
=
'float32'
,
softmax_with_bias_unittest_template
(
dtypeInput
=
'float32'
,
dtypeBias
=
'float32'
)
dtypeBias
=
'float32'
)
...
@@ -188,6 +162,7 @@ def softmax_with_bias_unittest_template(dtypeInput, dtypeBias):
...
@@ -188,6 +162,7 @@ def softmax_with_bias_unittest_template(dtypeInput, dtypeBias):
TODO: check that we loop when there are too many threads. (THIS IS
TODO: check that we loop when there are too many threads. (THIS IS
NOT IMPLEMENTED)
NOT IMPLEMENTED)
"""
"""
x
=
T
.
matrix
(
'x'
,
dtype
=
dtypeInput
)
x
=
T
.
matrix
(
'x'
,
dtype
=
dtypeInput
)
b
=
T
.
vector
(
'b'
,
dtype
=
dtypeBias
)
b
=
T
.
vector
(
'b'
,
dtype
=
dtypeBias
)
...
@@ -228,9 +203,11 @@ def softmax_with_bias_unittest_template(dtypeInput, dtypeBias):
...
@@ -228,9 +203,11 @@ def softmax_with_bias_unittest_template(dtypeInput, dtypeBias):
def
test_softmax_float16
():
def
test_softmax_float16
():
softmax_unittest_template
(
'float16'
)
softmax_unittest_template
(
'float16'
)
def
test_softmax_float32
():
def
test_softmax_float32
():
softmax_unittest_template
(
'float32'
)
softmax_unittest_template
(
'float32'
)
def
test_softmax_float64
():
def
test_softmax_float64
():
softmax_unittest_template
(
'float64'
)
softmax_unittest_template
(
'float64'
)
...
...
theano/sandbox/gpuarray/tests/test_type.py
浏览文件 @
645557f9
import
operator
import
numpy
import
numpy
import
theano
import
theano
...
@@ -25,7 +23,6 @@ def test_deep_copy():
...
@@ -25,7 +23,6 @@ def test_deep_copy():
def
test_values_eq_approx
():
def
test_values_eq_approx
():
a
=
rand_gpuarray
(
20
,
dtype
=
'float32'
)
a
=
rand_gpuarray
(
20
,
dtype
=
'float32'
)
g
=
GpuArrayType
(
dtype
=
'float32'
,
broadcastable
=
(
False
,))(
'g'
)
assert
GpuArrayType
.
values_eq_approx
(
a
,
a
)
assert
GpuArrayType
.
values_eq_approx
(
a
,
a
)
b
=
a
.
copy
()
b
=
a
.
copy
()
b
[
0
]
=
numpy
.
asarray
(
b
[
0
])
+
1.
b
[
0
]
=
numpy
.
asarray
(
b
[
0
])
+
1.
...
...
theano/sandbox/gpuarray/type.py
浏览文件 @
645557f9
...
@@ -200,11 +200,12 @@ class GpuArrayType(Type):
...
@@ -200,11 +200,12 @@ class GpuArrayType(Type):
self
.
broadcastable
==
other
.
broadcastable
)
self
.
broadcastable
==
other
.
broadcastable
)
def
convert_variable
(
self
,
var
):
def
convert_variable
(
self
,
var
):
if
(
type
(
self
)
==
type
(
var
.
type
)
and
vt
=
var
.
type
self
.
typecode
==
var
.
type
.
typecode
and
if
(
type
(
self
)
==
type
(
vt
)
and
self
.
ndim
==
var
.
type
.
ndim
and
self
.
typecode
==
vt
.
typecode
and
self
.
ndim
==
vt
.
ndim
and
all
(
sb
==
ob
or
ob
for
sb
,
ob
in
zip
(
self
.
broadcastable
,
all
(
sb
==
ob
or
ob
for
sb
,
ob
in
zip
(
self
.
broadcastable
,
v
ar
.
type
.
broadcastable
))):
v
t
.
broadcastable
))):
return
theano
.
tensor
.
patternbroadcast
(
var
,
self
.
broadcastable
)
return
theano
.
tensor
.
patternbroadcast
(
var
,
self
.
broadcastable
)
def
__hash__
(
self
):
def
__hash__
(
self
):
...
...
theano/tests/test_flake8.py
浏览文件 @
645557f9
...
@@ -157,24 +157,11 @@ whitelist_flake8 = [
...
@@ -157,24 +157,11 @@ whitelist_flake8 = [
"sandbox/linalg/ops.py"
,
"sandbox/linalg/ops.py"
,
"sandbox/linalg/__init__.py"
,
"sandbox/linalg/__init__.py"
,
"sandbox/linalg/tests/test_linalg.py"
,
"sandbox/linalg/tests/test_linalg.py"
,
"sandbox/gpuarray/basic_ops.py"
,
"sandbox/gpuarray/nnet.py"
,
"sandbox/gpuarray/elemwise.py"
,
"sandbox/gpuarray/type.py"
,
"sandbox/gpuarray/__init__.py"
,
"sandbox/gpuarray/__init__.py"
,
"sandbox/gpuarray/kernel_codegen.py"
,
"sandbox/gpuarray/conv.py"
,
"sandbox/gpuarray/neighbours.py"
,
"sandbox/gpuarray/tests/test_subtensor.py"
,
"sandbox/gpuarray/tests/test_subtensor.py"
,
"sandbox/gpuarray/tests/test_scan.py"
,
"sandbox/gpuarray/tests/test_scan.py"
,
"sandbox/gpuarray/tests/test_neighbours.py"
,
"sandbox/gpuarray/tests/test_conv_cuda_ndarray.py"
,
"sandbox/gpuarray/tests/test_type.py"
,
"sandbox/gpuarray/tests/test_opt.py"
,
"sandbox/gpuarray/tests/test_opt.py"
,
"sandbox/gpuarray/tests/test_blas.py"
,
"sandbox/gpuarray/tests/test_elemwise.py"
,
"sandbox/gpuarray/tests/test_elemwise.py"
,
"sandbox/gpuarray/tests/test_nnet.py"
,
"sandbox/gpuarray/tests/test_basic_ops.py"
,
"scan_module/scan_utils.py"
,
"scan_module/scan_utils.py"
,
"scan_module/scan_views.py"
,
"scan_module/scan_views.py"
,
"scan_module/scan.py"
,
"scan_module/scan.py"
,
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论