Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
645557f9
提交
645557f9
authored
10月 02, 2015
作者:
Pascal Lamblin
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #3476 from abergeron/move_config
Multiple fixes preparing for multi-gpu
上级
1ec1cd9b
71dea2cf
隐藏空白字符变更
内嵌
并排
正在显示
29 个修改的文件
包含
861 行增加
和
1153 行删除
+861
-1153
setup.cfg
setup.cfg
+3
-0
__init__.py
theano/__init__.py
+4
-2
configdefaults.py
theano/configdefaults.py
+112
-19
configparser.py
theano/configparser.py
+11
-7
op.py
theano/gof/op.py
+8
-4
__init__.py
theano/sandbox/cuda/__init__.py
+1
-1
dnn.py
theano/sandbox/cuda/dnn.py
+6
-6
nvcc_compiler.py
theano/sandbox/cuda/nvcc_compiler.py
+4
-71
dnn_flags.py
theano/sandbox/dnn_flags.py
+0
-42
__init__.py
theano/sandbox/gpuarray/__init__.py
+7
-12
basic_ops.py
theano/sandbox/gpuarray/basic_ops.py
+39
-269
conv.py
theano/sandbox/gpuarray/conv.py
+19
-39
dnn.py
theano/sandbox/gpuarray/dnn.py
+21
-46
elemwise.py
theano/sandbox/gpuarray/elemwise.py
+60
-94
kernel_codegen.py
theano/sandbox/gpuarray/kernel_codegen.py
+11
-12
neighbours.py
theano/sandbox/gpuarray/neighbours.py
+4
-26
nerv.py
theano/sandbox/gpuarray/nerv.py
+2
-2
nnet.py
theano/sandbox/gpuarray/nnet.py
+110
-151
opt.py
theano/sandbox/gpuarray/opt.py
+2
-2
opt_util.py
theano/sandbox/gpuarray/opt_util.py
+219
-20
subtensor.py
theano/sandbox/gpuarray/subtensor.py
+1
-11
test_basic_ops.py
theano/sandbox/gpuarray/tests/test_basic_ops.py
+23
-73
test_blas.py
theano/sandbox/gpuarray/tests/test_blas.py
+41
-38
test_conv_cuda_ndarray.py
theano/sandbox/gpuarray/tests/test_conv_cuda_ndarray.py
+139
-152
test_neighbours.py
theano/sandbox/gpuarray/tests/test_neighbours.py
+1
-3
test_nnet.py
theano/sandbox/gpuarray/tests/test_nnet.py
+7
-30
test_type.py
theano/sandbox/gpuarray/tests/test_type.py
+0
-3
type.py
theano/sandbox/gpuarray/type.py
+6
-5
test_flake8.py
theano/tests/test_flake8.py
+0
-13
没有找到文件。
setup.cfg
浏览文件 @
645557f9
[nosetest]
match=^test
nocapture=1
[flake8]
ignore=E501,E123,E133
theano/__init__.py
浏览文件 @
645557f9
...
...
@@ -109,8 +109,10 @@ if config.device.startswith('gpu') or config.init_gpu_device.startswith('gpu'):
theano
.
sandbox
.
cuda
.
tests
.
test_driver
.
test_nvidia_driver1
()
if
config
.
device
.
startswith
(
'cuda'
)
or
config
.
device
.
startswith
(
'opencl'
)
or
\
config
.
gpuarray
.
init_device
!=
''
:
if
(
config
.
device
.
startswith
(
'cuda'
)
or
config
.
device
.
startswith
(
'opencl'
)
or
config
.
init_gpu_device
.
startswith
(
'cuda'
)
or
config
.
init_gpu_device
.
startswith
(
'opencl'
)):
import
theano.sandbox.gpuarray
# Use config.numpy to call numpy.seterr
...
...
theano/configdefaults.py
浏览文件 @
645557f9
...
...
@@ -73,19 +73,19 @@ class DeviceParam(ConfigParam):
self
.
default
=
default
def
filter
(
val
):
if
val
.
startswith
(
'cpu'
)
or
val
.
startswith
(
'gpu'
)
\
if
val
==
self
.
default
or
val
.
startswith
(
'gpu'
)
\
or
val
.
startswith
(
'opencl'
)
or
val
.
startswith
(
'cuda'
):
return
val
else
:
raise
ValueError
((
'Invalid value ("
%
s") for configuration '
'variable "
%
s". Valid options start with '
'one of "
cpu
", "gpu", "opencl", "cuda"'
%
(
val
,
self
.
fullname
)))
'one of "
%
s
", "gpu", "opencl", "cuda"'
%
(
self
.
default
,
val
,
self
.
fullname
)))
over
=
kwargs
.
get
(
"allow_override"
,
True
)
super
(
DeviceParam
,
self
)
.
__init__
(
default
,
filter
,
over
)
def
__str__
(
self
):
return
'
%
s (
cpu, gpu*, opencl*, cuda*) '
%
(
self
.
fullname
,
)
return
'
%
s (
%
s, gpu*, opencl*, cuda*) '
%
(
self
.
fullname
,
self
.
default
)
AddConfigVar
(
'device'
,
...
...
@@ -94,15 +94,7 @@ AddConfigVar(
"on it. Do not use upper case letters, only lower case even if "
"NVIDIA use capital letters."
),
DeviceParam
(
'cpu'
,
allow_override
=
False
),
in_c_key
=
False
,)
AddConfigVar
(
'gpuarray.init_device'
,
"""
Device to initialize for gpuarray use without moving
computations automatically.
"""
,
StrParam
(
''
),
in_c_key
=
False
)
in_c_key
=
False
)
AddConfigVar
(
'init_gpu_device'
,
...
...
@@ -110,12 +102,7 @@ AddConfigVar(
"Unlike 'device', setting this option will NOT move computations, "
"nor shared variables, to the specified GPU. "
"It can be used to run GPU-specific tests on a particular GPU."
),
EnumStr
(
''
,
'gpu'
,
'gpu0'
,
'gpu1'
,
'gpu2'
,
'gpu3'
,
'gpu4'
,
'gpu5'
,
'gpu6'
,
'gpu7'
,
'gpu8'
,
'gpu9'
,
'gpu10'
,
'gpu11'
,
'gpu12'
,
'gpu13'
,
'gpu14'
,
'gpu15'
,
allow_override
=
False
),
DeviceParam
(
''
,
allow_override
=
False
),
in_c_key
=
False
)
AddConfigVar
(
...
...
@@ -131,6 +118,112 @@ AddConfigVar(
in_c_key
=
False
)
def
default_cuda_root
():
v
=
os
.
getenv
(
'CUDA_ROOT'
,
""
)
if
v
:
return
v
s
=
os
.
getenv
(
"PATH"
)
if
not
s
:
return
''
for
dir
in
s
.
split
(
os
.
path
.
pathsep
):
if
os
.
path
.
exists
(
os
.
path
.
join
(
dir
,
"nvcc"
)):
return
os
.
path
.
split
(
dir
)[
0
]
return
''
AddConfigVar
(
'cuda.root'
,
"""directory with bin/, lib/, include/ for cuda utilities.
This directory is included via -L and -rpath when linking
dynamically compiled modules. If AUTO and nvcc is in the
path, it will use one of nvcc parent directory. Otherwise
/usr/local/cuda will be used. Leave empty to prevent extra
linker directives. Default: environment variable "CUDA_ROOT"
or else "AUTO".
"""
,
StrParam
(
default_cuda_root
),
in_c_key
=
False
)
def
filter_nvcc_flags
(
s
):
assert
isinstance
(
s
,
str
)
flags
=
[
flag
for
flag
in
s
.
split
(
' '
)
if
flag
]
if
any
([
f
for
f
in
flags
if
not
f
.
startswith
(
"-"
)]):
raise
ValueError
(
"Theano nvcc.flags support only parameter/value pairs without"
" space between them. e.g.: '--machine 64' is not supported,"
" but '--machine=64' is supported. Please add the '=' symbol."
" nvcc.flags value is '
%
s'"
%
s
)
return
' '
.
join
(
flags
)
AddConfigVar
(
'nvcc.flags'
,
"Extra compiler flags for nvcc"
,
ConfigParam
(
""
,
filter_nvcc_flags
),
# Not needed in c key as it is already added.
# We remove it as we don't make the md5 of config to change
# if theano.sandbox.cuda is loaded or not.
in_c_key
=
False
)
AddConfigVar
(
'nvcc.compiler_bindir'
,
"If defined, nvcc compiler driver will seek g++ and gcc"
" in this directory"
,
StrParam
(
""
),
in_c_key
=
False
)
AddConfigVar
(
'nvcc.fastmath'
,
""
,
BoolParam
(
False
),
# Not needed in c key as it is already added.
# We remove it as we don't make the md5 of config to change
# if theano.sandbox.cuda is loaded or not.
in_c_key
=
False
)
AddConfigVar
(
'gpuarray.sync'
,
"""If True, every op will make sure its work is done before
returning. Setting this to True will slow down execution,
but give much more accurate results in profiling."""
,
BoolParam
(
False
),
in_c_key
=
True
)
AddConfigVar
(
'dnn.conv.workmem'
,
"This flag is deprecated; use dnn.conv.algo_fwd."
,
EnumStr
(
''
),
in_c_key
=
False
)
AddConfigVar
(
'dnn.conv.workmem_bwd'
,
"This flag is deprecated; use dnn.conv.algo_bwd."
,
EnumStr
(
''
),
in_c_key
=
False
)
AddConfigVar
(
'dnn.conv.algo_fwd'
,
"Default implementation to use for CuDNN forward convolution."
,
EnumStr
(
'small'
,
'none'
,
'large'
,
'fft'
,
'guess_once'
,
'guess_on_shape_change'
,
'time_once'
,
'time_on_shape_change'
),
in_c_key
=
False
)
AddConfigVar
(
'dnn.conv.algo_bwd'
,
"Default implementation to use for CuDNN backward convolution."
,
EnumStr
(
'none'
,
'deterministic'
,
'fft'
,
'guess_once'
,
'guess_on_shape_change'
,
'time_once'
,
'time_on_shape_change'
),
in_c_key
=
False
)
def
default_dnn_path
(
suffix
):
def
f
(
suffix
=
suffix
):
if
config
.
cuda
.
root
==
''
:
return
''
return
os
.
path
.
join
(
config
.
cuda
.
root
,
suffix
)
return
f
AddConfigVar
(
'dnn.include_path'
,
"Location of the cudnn header (defaults to the cuda root)"
,
StrParam
(
default_dnn_path
(
'include'
)))
AddConfigVar
(
'dnn.library_path'
,
"Location of the cudnn header (defaults to the cuda root)"
,
StrParam
(
default_dnn_path
(
'lib64'
)))
# This flag determines whether or not to raise error/warning message if
# there is a CPU Op in the computational graph.
AddConfigVar
(
...
...
theano/configparser.py
浏览文件 @
645557f9
...
...
@@ -102,7 +102,7 @@ def change_flags(**kwargs):
l
=
[
v
for
v
in
theano
.
configparser
.
_config_var_list
if
v
.
fullname
==
k
]
assert
len
(
l
)
==
1
old_val
[
k
]
=
l
[
0
]
.
__get__
()
old_val
[
k
]
=
l
[
0
]
.
__get__
(
True
,
None
)
try
:
for
k
in
kwargs
:
l
=
[
v
for
v
in
theano
.
configparser
.
_config_var_list
...
...
@@ -167,7 +167,7 @@ def _config_print(thing, buf):
for
cv
in
_config_var_list
:
print
(
cv
,
file
=
buf
)
print
(
" Doc: "
,
cv
.
doc
,
file
=
buf
)
print
(
" Value: "
,
cv
.
__get__
(),
file
=
buf
)
print
(
" Value: "
,
cv
.
__get__
(
True
,
None
),
file
=
buf
)
print
(
""
,
file
=
buf
)
...
...
@@ -182,7 +182,7 @@ def get_config_md5():
all_opts
=
sorted
([
c
for
c
in
_config_var_list
if
c
.
in_c_key
],
key
=
lambda
cv
:
cv
.
fullname
)
return
theano
.
gof
.
utils
.
hash_from_code
(
'
\n
'
.
join
(
[
'
%
s =
%
s'
%
(
cv
.
fullname
,
cv
.
__get__
())
for
cv
in
all_opts
]))
[
'
%
s =
%
s'
%
(
cv
.
fullname
,
cv
.
__get__
(
True
,
None
))
for
cv
in
all_opts
]))
class
TheanoConfigParser
(
object
):
...
...
@@ -270,14 +270,14 @@ def AddConfigVar(name, doc, configparam, root=config, in_c_key=True):
# Trigger a read of the value from config files and env vars
# This allow to filter wrong value from the user.
if
not
callable
(
configparam
.
default
):
configparam
.
__get__
()
configparam
.
__get__
(
root
,
type
(
root
)
)
else
:
# We do not want to evaluate now the default value
# when it is a callable.
try
:
fetch_val_for_key
(
configparam
.
fullname
)
# The user provided a value, filter it now.
configparam
.
__get__
()
configparam
.
__get__
(
root
,
type
(
root
)
)
except
KeyError
:
pass
setattr
(
root
.
__class__
,
sections
[
0
],
configparam
)
...
...
@@ -294,6 +294,7 @@ class ConfigParam(object):
self
.
default
=
default
self
.
filter
=
filter
self
.
allow_override
=
allow_override
self
.
is_default
=
True
# N.B. --
# self.fullname # set by AddConfigVar
# self.doc # set by AddConfigVar
...
...
@@ -304,16 +305,19 @@ class ConfigParam(object):
# Calling `filter` here may actually be harmful if the default value is
# invalid and causes a crash or has unwanted side effects.
def
__get__
(
self
,
*
args
):
def
__get__
(
self
,
cls
,
type_
):
if
cls
is
None
:
return
self
if
not
hasattr
(
self
,
'val'
):
try
:
val_str
=
fetch_val_for_key
(
self
.
fullname
)
self
.
is_default
=
False
except
KeyError
:
if
callable
(
self
.
default
):
val_str
=
self
.
default
()
else
:
val_str
=
self
.
default
self
.
__set__
(
None
,
val_str
)
self
.
__set__
(
cls
,
val_str
)
# print "RVAL", self.val
return
self
.
val
...
...
theano/gof/op.py
浏览文件 @
645557f9
...
...
@@ -1171,7 +1171,7 @@ def apply_meth(tag):
code
=
self
.
code_sections
[
tag
]
define_macros
,
undef_macros
=
self
.
get_c_macros
(
node
,
name
)
return
os
.
linesep
.
join
([
define_macros
,
code
,
return
os
.
linesep
.
join
([
''
,
define_macros
,
code
,
undef_macros
])
else
:
raise
utils
.
MethodNotDefined
(
...
...
@@ -1428,7 +1428,7 @@ class COp(Op):
def_macros
,
undef_macros
=
self
.
get_c_macros
(
node
,
name
)
def_sub
,
undef_sub
=
self
.
get_sub_macros
(
sub
)
return
os
.
linesep
.
join
([
def_macros
,
def_sub
,
return
os
.
linesep
.
join
([
''
,
def_macros
,
def_sub
,
op_code
,
undef_sub
,
undef_macros
])
else
:
...
...
@@ -1442,17 +1442,21 @@ class COp(Op):
define_macros
,
undef_macros
=
self
.
get_c_macros
(
node
,
name
,
check_input
=
False
)
ctx
=
""
if
'context'
in
sub
:
ctx
=
",
%
s"
%
(
sub
[
'context'
],)
# Generate the C code
return
"""
%(define_macros)
s
{
if (
%(func_name)
s(
%(func_args)
s) != 0) {
if (
%(func_name)
s(
%(func_args)
s
%(ctx)
s
) != 0) {
%(fail)
s
}
}
%(undef_macros)
s
"""
%
dict
(
func_name
=
self
.
func_name
,
fail
=
sub
[
'fail'
],
fail
=
sub
[
'fail'
],
ctx
=
ctx
,
func_args
=
self
.
format_c_function_args
(
inp
,
out
),
define_macros
=
define_macros
,
undef_macros
=
undef_macros
)
...
...
theano/sandbox/cuda/__init__.py
浏览文件 @
645557f9
...
...
@@ -535,7 +535,7 @@ def handle_shared_float32(tf):
# import dependency. So we also test it in the file theano/__init__.py
if
config
.
device
.
startswith
(
'gpu'
):
use
(
device
=
config
.
device
,
force
=
config
.
force_device
,
test_driver
=
False
)
elif
config
.
init_gpu_device
:
elif
config
.
init_gpu_device
.
startswith
(
'gpu'
)
:
assert
config
.
device
==
"cpu"
,
(
"We can use the Theano flag init_gpu_device"
" only when the Theano flag device=='cpu'"
)
...
...
theano/sandbox/cuda/dnn.py
浏览文件 @
645557f9
...
...
@@ -27,8 +27,6 @@ from theano.sandbox.cuda import gpu_seqopt, register_opt
from
theano.sandbox.cuda.nvcc_compiler
import
NVCC_compiler
import
theano.sandbox.dnn_flags
def
dnn_available
():
if
dnn_available
.
avail
is
None
:
...
...
@@ -57,15 +55,17 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
return 1;
}
"""
params
=
[
"-l"
,
"cudnn"
,
"-I"
+
os
.
path
.
dirname
(
__file__
)]
if
config
.
dnn
.
include_path
:
params
.
append
(
"-I"
+
config
.
dnn
.
include_path
)
if
config
.
dnn
.
library_path
:
params
.
append
(
"-L"
+
config
.
dnn
.
library_path
)
# Do not run here the test program. It would run on the
# default gpu, not the one selected by the user. If mixed
# GPU are installed or if the GPUs are configured in
# exclusive mode, this cause bad detection.
comp
,
out
,
err
=
NVCC_compiler
.
try_flags
(
[
"-l"
,
"cudnn"
,
"-I"
+
os
.
path
.
dirname
(
__file__
),
"-I"
+
config
.
dnn
.
include_path
,
"-L"
+
config
.
dnn
.
library_path
],
preambule
=
preambule
,
body
=
body
,
params
=
params
,
preambule
=
preambule
,
body
=
body
,
try_run
=
False
,
output
=
True
)
dnn_available
.
avail
=
comp
...
...
theano/sandbox/cuda/nvcc_compiler.py
浏览文件 @
645557f9
...
...
@@ -8,6 +8,7 @@ import warnings
import
numpy
from
theano
import
config
from
theano.compat
import
decode
,
decode_iter
from
theano.gof
import
local_bitwidth
from
theano.gof.utils
import
hash_from_file
...
...
@@ -19,67 +20,6 @@ from theano.misc.windows import output_subprocess_Popen
_logger
=
logging
.
getLogger
(
"theano.sandbox.cuda.nvcc_compiler"
)
from
theano.configparser
import
(
config
,
AddConfigVar
,
StrParam
,
BoolParam
,
ConfigParam
)
AddConfigVar
(
'nvcc.compiler_bindir'
,
"If defined, nvcc compiler driver will seek g++ and gcc"
" in this directory"
,
StrParam
(
""
),
in_c_key
=
False
)
user_provided_cuda_root
=
True
def
default_cuda_root
():
global
user_provided_cuda_root
v
=
os
.
getenv
(
'CUDA_ROOT'
,
""
)
user_provided_cuda_root
=
False
if
v
:
return
v
return
find_cuda_root
()
AddConfigVar
(
'cuda.root'
,
"""directory with bin/, lib/, include/ for cuda utilities.
This directory is included via -L and -rpath when linking
dynamically compiled modules. If AUTO and nvcc is in the
path, it will use one of nvcc parent directory. Otherwise
/usr/local/cuda will be used. Leave empty to prevent extra
linker directives. Default: environment variable "CUDA_ROOT"
or else "AUTO".
"""
,
StrParam
(
default_cuda_root
),
in_c_key
=
False
)
def
filter_nvcc_flags
(
s
):
assert
isinstance
(
s
,
str
)
flags
=
[
flag
for
flag
in
s
.
split
(
' '
)
if
flag
]
if
any
([
f
for
f
in
flags
if
not
f
.
startswith
(
"-"
)]):
raise
ValueError
(
"Theano nvcc.flags support only parameter/value pairs without"
" space between them. e.g.: '--machine 64' is not supported,"
" but '--machine=64' is supported. Please add the '=' symbol."
" nvcc.flags value is '
%
s'"
%
s
)
return
' '
.
join
(
flags
)
AddConfigVar
(
'nvcc.flags'
,
"Extra compiler flags for nvcc"
,
ConfigParam
(
""
,
filter_nvcc_flags
),
# Not needed in c key as it is already added.
# We remove it as we don't make the md5 of config to change
# if theano.sandbox.cuda is loaded or not.
in_c_key
=
False
)
AddConfigVar
(
'nvcc.fastmath'
,
""
,
BoolParam
(
False
),
# Not needed in c key as it is already added.
# We remove it as we don't make the md5 of config to change
# if theano.sandbox.cuda is loaded or not.
in_c_key
=
False
)
nvcc_path
=
'nvcc'
nvcc_version
=
None
...
...
@@ -115,14 +55,6 @@ def is_nvcc_available():
return
False
def
find_cuda_root
():
s
=
os
.
getenv
(
"PATH"
)
if
not
s
:
return
for
dir
in
s
.
split
(
os
.
path
.
pathsep
):
if
os
.
path
.
exists
(
os
.
path
.
join
(
dir
,
"nvcc"
)):
return
os
.
path
.
split
(
dir
)[
0
]
rpath_defaults
=
[]
...
...
@@ -229,7 +161,7 @@ class NVCC_compiler(Compiler):
include_dirs
A list of include directory names (each gets prefixed with -I).
lib_dirs
A list of library search path directory names (each gets
A list of library search path directory names (each gets
prefixed with -L).
libs
A list of libraries to link with (each gets prefixed with -l).
...
...
@@ -359,7 +291,8 @@ class NVCC_compiler(Compiler):
# provided an cuda.root flag, we need to add one, but
# otherwise, we don't add it. See gh-1540 and
# https://wiki.debian.org/RpathIssue for details.
if
(
user_provided_cuda_root
and
if
(
not
type
(
config
.
cuda
)
.
root
.
is_default
and
os
.
path
.
exists
(
os
.
path
.
join
(
config
.
cuda
.
root
,
'lib'
))):
rpaths
.
append
(
os
.
path
.
join
(
config
.
cuda
.
root
,
'lib'
))
...
...
theano/sandbox/dnn_flags.py
deleted
100644 → 0
浏览文件 @
1ec1cd9b
"""
This module contains the configuration flags for cudnn support.
Those are shared between the cuda and gpuarray backend which is why
they are in this file.
"""
import
os.path
from
theano.configparser
import
AddConfigVar
,
EnumStr
,
StrParam
from
theano
import
config
AddConfigVar
(
'dnn.conv.workmem'
,
"This flag is deprecated; use dnn.conv.algo_fwd."
,
EnumStr
(
''
),
in_c_key
=
False
)
AddConfigVar
(
'dnn.conv.workmem_bwd'
,
"This flag is deprecated; use dnn.conv.algo_bwd."
,
EnumStr
(
''
),
in_c_key
=
False
)
AddConfigVar
(
'dnn.conv.algo_fwd'
,
"Default implementation to use for CuDNN forward convolution."
,
EnumStr
(
'small'
,
'none'
,
'large'
,
'fft'
,
'guess_once'
,
'guess_on_shape_change'
,
'time_once'
,
'time_on_shape_change'
),
in_c_key
=
False
)
AddConfigVar
(
'dnn.conv.algo_bwd'
,
"Default implementation to use for CuDNN backward convolution."
,
EnumStr
(
'none'
,
'deterministic'
,
'fft'
,
'guess_once'
,
'guess_on_shape_change'
,
'time_once'
,
'time_on_shape_change'
),
in_c_key
=
False
)
AddConfigVar
(
'dnn.include_path'
,
"Location of the cudnn header (defaults to the cuda root)"
,
StrParam
(
lambda
:
os
.
path
.
join
(
config
.
cuda
.
root
,
'include'
)))
AddConfigVar
(
'dnn.library_path'
,
"Location of the cudnn header (defaults to the cuda root)"
,
StrParam
(
lambda
:
os
.
path
.
join
(
config
.
cuda
.
root
,
'lib64'
)))
theano/sandbox/gpuarray/__init__.py
浏览文件 @
645557f9
...
...
@@ -19,13 +19,6 @@ try:
except
ImportError
:
pygpu
=
None
AddConfigVar
(
'gpuarray.sync'
,
"""If True, every op will make sure its work is done before
returning. Setting this to True will slow down execution,
but give much more accurate results in profiling."""
,
BoolParam
(
False
),
in_c_key
=
True
)
# This is for documentation not to depend on the availability of pygpu
from
.type
import
(
GpuArrayType
,
GpuArrayVariable
,
GpuArrayConstant
,
GpuArraySharedVariable
,
gpuarray_shared_constructor
)
...
...
@@ -57,8 +50,9 @@ if pygpu:
import
theano.compile
theano
.
compile
.
shared_constructor
(
gpuarray_shared_constructor
)
optdb
.
add_tags
(
'gpuarray_opt'
,
'fast_run'
,
'fast_compile'
)
elif
config
.
gpuarray
.
init_device
!=
''
:
init_dev
(
config
.
gpuarray
.
init_device
)
elif
(
config
.
init_gpu_device
.
startswith
(
'cuda'
)
or
config
.
init_gpu_device
.
startswith
(
'opencl'
)):
init_dev
(
config
.
init_gpu_device
)
from
.basic_ops
import
(
GpuAlloc
,
GpuContiguous
,
GpuEye
,
GpuFromHost
,
GpuJoin
,
GpuReshape
,
GpuSplit
,
HostFromGpu
)
...
...
@@ -70,7 +64,8 @@ if pygpu:
except
Exception
:
error
(
"Could not initialize pygpu, support disabled"
,
exc_info
=
True
)
else
:
if
(
config
.
gpuarray
.
init_device
!=
''
or
config
.
device
.
startswith
(
'opencl'
)
or
config
.
device
.
startswith
(
'cuda'
)):
if
(
config
.
init_gpu_device
.
startswith
(
'cuda'
)
or
config
.
init_gpu_device
.
startswith
(
'opencl'
)
or
config
.
device
.
startswith
(
'opencl'
)
or
config
.
device
.
startswith
(
'cuda'
)):
error
(
"pygpu was configured but could not be imported"
,
exc_info
=
True
)
theano/sandbox/gpuarray/basic_ops.py
浏览文件 @
645557f9
...
...
@@ -2,11 +2,9 @@ import os
import
numpy
import
theano
from
theano
import
Op
,
Apply
from
theano
import
tensor
,
scalar
,
config
from
theano
import
Op
,
Apply
,
Type
,
Variable
from
theano
import
tensor
,
config
from
theano.gradient
import
grad_undefined
from
theano.scalar
import
Scalar
from
theano.tensor.basic
import
Alloc
,
Join
,
Split
from
theano.gof
import
HideC
...
...
@@ -17,7 +15,7 @@ from six.moves import xrange
try
:
import
pygpu
from
pygpu
import
gpuarray
,
elemwise
from
pygpu
import
gpuarray
except
ImportError
:
pass
...
...
@@ -293,7 +291,6 @@ class GpuFromHost(Op):
def
perform
(
self
,
node
,
inp
,
out
):
x
,
=
inp
z
,
=
out
type
=
node
.
outputs
[
0
]
.
type
z
[
0
]
=
gpuarray
.
array
(
x
)
def
grad
(
self
,
inputs
,
grads
):
...
...
@@ -312,254 +309,29 @@ class GpuFromHost(Op):
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
return
"""
PyGpuArrayObject *
%(name)
s_tmp;
%(name)
s_tmp = PyArray_GETCONTIGUOUS(
%(inp)
s);
if (
%(name)
s_tmp == NULL)
%(fail)
s
Py_XDECREF(
%(out)
s);
%(out)
s = pygpu_fromhostdata(PyArray_DATA(
%(
inp)
s
),
get_typecode((PyObject *)PyArray_DESCR(
%(
inp)
s
)),
PyArray_NDIM(
%(
inp)
s
),
(size_t *)PyArray_DIMS(
%(
inp)
s
),
(ssize_t *)PyArray_STRIDES(
%(
inp)
s
),
%(out)
s = pygpu_fromhostdata(PyArray_DATA(
%(
name)
s_tmp
),
get_typecode((PyObject *)PyArray_DESCR(
%(
name)
s_tmp
)),
PyArray_NDIM(
%(
name)
s_tmp
),
(size_t *)PyArray_DIMS(
%(
name)
s_tmp
),
(ssize_t *)PyArray_STRIDES(
%(
name)
s_tmp
),
pygpu_default_context(),
Py_None);
if (
%(out)
s == NULL) {
%(fail)
s
}
Py_DECREF(
%(name)
s_tmp);
if (
%(out)
s == NULL)
%(fail)
s
"""
%
{
'name'
:
name
,
'inp'
:
inputs
[
0
],
'out'
:
outputs
[
0
],
'fail'
:
sub
[
'fail'
]}
def
c_code_cache_version
(
self
):
return
(
4
,)
gpu_from_host
=
GpuFromHost
()
class
GpuFromCuda
(
Op
):
view_map
=
{
0
:
[
0
]}
__props__
=
()
def
make_node
(
self
,
x
):
from
theano.sandbox.cuda
import
CudaNdarrayType
if
not
isinstance
(
x
.
type
,
CudaNdarrayType
):
raise
TypeError
(
x
)
return
Apply
(
self
,
[
x
],
[
GpuArrayType
(
broadcastable
=
x
.
broadcastable
,
dtype
=
x
.
dtype
)()])
def
perform
(
self
,
node
,
inp
,
out
):
x
,
=
inp
z
,
=
out
z
[
0
]
=
gpuarray
.
array
(
numpy
.
asarray
(
x
))
def
grad
(
self
,
inputs
,
grads
):
gz
,
=
grads
return
[
cuda_from_gpu
(
gz
)]
def
R_op
(
self
,
inputs
,
eval_points
):
ev
,
=
eval_points
if
isinstance
(
ev
,
GpuArrayType
):
return
[
cuda_from_gpu
(
ev
)]
else
:
return
ev
def
infer_shape
(
self
,
node
,
xshp
):
return
xshp
def
c_headers
(
self
):
return
[
'<cuda_ndarray.cuh>'
,
'<gpuarray/extension.h>'
,
'<gpuarray/types.h>'
,
'<cuda.h>'
]
def
c_header_dirs
(
self
):
import
cuda_ndarray
ret
=
[
os
.
path
.
dirname
(
cuda_ndarray
.
__file__
)]
cuda_root
=
config
.
cuda
.
root
if
cuda_root
:
ret
.
append
(
os
.
path
.
join
(
cuda_root
,
'include'
))
return
ret
def
c_lib_dirs
(
self
):
import
cuda_ndarray
ret
=
[
os
.
path
.
dirname
(
cuda_ndarray
.
__file__
)]
cuda_root
=
config
.
cuda
.
root
if
cuda_root
:
ret
.
append
(
os
.
path
.
join
(
cuda_root
,
'lib'
))
return
ret
def
c_libraries
(
self
):
return
[
'cudart'
,
'cublas'
,
'cuda'
]
def
c_support_code
(
self
):
return
"""
CUcontext (*cuda_get_ctx)(void *ctx);
gpudata *(*cuda_make_buf)(void *c, CUdeviceptr p, size_t sz);
"""
def
c_init_code
(
self
):
return
[
'cuda_get_ctx = (CUcontext (*)(void *))gpuarray_get_extension("cuda_get_ctx");'
,
'cuda_make_buf = (gpudata *(*)(void *, CUdeviceptr, size_t))gpuarray_get_extension("cuda_make_buf");'
]
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
return
"""
int
%(name)
serr;
gpudata *
%(name)
sdata;
CUcontext
%(name)
scur;
size_t *
%(name)
sdims;
ssize_t *
%(name)
sstr;
cuCtxGetCurrent(&
%(name)
scur);
if (
%(name)
scur != cuda_get_ctx(pygpu_default_context()->ctx)) {
PyErr_SetString(PyExc_ValueError, "Ambient cuda context is not the same as output context.");
%(fail)
s
}
%(name)
sdims = (size_t *)calloc(
%(in)
s->nd, sizeof(size_t));
if (
%(name)
sdims == NULL) {
PyErr_SetString(PyExc_MemoryError, "Can't allocate dimensions.");
%(fail)
s
}
%(name)
sstr = (ssize_t *)calloc(
%(in)
s->nd, sizeof(ssize_t));
if (
%(name)
sstr == NULL) {
free(
%(name)
sdims);
PyErr_SetString(PyExc_MemoryError, "Can't allocate strides.");
%(fail)
s
}
for (unsigned int i = 0; i <
%(in)
s->nd; i++) {
%(name)
sdims[i] = (size_t)CudaNdarray_HOST_DIMS(
%(in)
s)[i];
%(name)
sstr[i] = (ssize_t)CudaNdarray_HOST_STRIDES(
%(in)
s)[i]*4;
}
%(name)
sdata = cuda_make_buf(pygpu_default_context()->ctx,
(CUdeviceptr)
%(in)
s->devdata,
((size_t)
%(in)
s->data_allocated)*4);
if (
%(name)
sdata == NULL) {
Py_DECREF(
%(out)
s);
free(
%(name)
sdims);
free(
%(name)
sstr);
PyErr_SetString(PyExc_MemoryError, "Could not allocate gpudata structure.");
%(fail)
s
}
Py_XDECREF(
%(out)
s);
%(out)
s = pygpu_fromgpudata(
%(name)
sdata, 0, GA_FLOAT,
%(in)
s->nd,
%(name)
sdims,
%(name)
sstr,
pygpu_default_context(), 1,
(PyObject *)
%(in)
s,
(PyObject *)&PyGpuArrayType);
pygpu_default_context()->ops->buffer_release(
%(name)
sdata);
free(
%(name)
sdims);
free(
%(name)
sstr);
if (
%(out)
s == NULL) {
%(fail)
s
}
"""
%
{
'name'
:
name
,
'in'
:
inputs
[
0
],
'out'
:
outputs
[
0
],
'fail'
:
sub
[
'fail'
]}
def
c_code_cache_version
(
self
):
return
(
5
,)
gpu_from_cuda
=
GpuFromCuda
()
class
CudaFromGpu
(
Op
):
view_map
=
{
0
:
[
0
]}
__props__
=
()
def
make_node
(
self
,
x
):
from
theano.sandbox.cuda
import
CudaNdarrayType
if
not
isinstance
(
x
.
type
,
GpuArrayType
):
raise
TypeError
(
x
)
if
x
.
type
.
dtype
!=
'float32'
:
raise
TypeError
(
x
)
return
Apply
(
self
,
[
x
],
[
CudaNdarrayType
(
broadcastable
=
x
.
broadcastable
)()])
def
perform
(
self
,
node
,
inp
,
out
):
from
theano.sandbox.cuda
import
filter
as
cuda_filter
x
,
=
inp
z
,
=
out
z
[
0
]
=
cuda_filter
(
theano
.
_asarray
(
x
,
dtype
=
'float32'
),
tuple
([
0
]
*
x
.
ndim
),
0
,
z
[
0
])
def
grad
(
self
,
inputs
,
grads
):
gz
,
=
grads
return
[
gpu_from_cuda
(
gz
)]
def
R_op
(
self
,
inputs
,
eval_points
):
from
theano.sandbox.cuda
import
CudaNdarrayType
ev
,
=
eval_points
if
(
isinstance
(
ev
,
CudaNdarrayType
)):
return
[
gpu_from_cuda
(
ev
)]
else
:
return
[
ev
]
def
infer_shape
(
self
,
node
,
shp
):
return
shp
def
c_headers
(
self
):
return
[
'<cuda_ndarray.cuh>'
,
'<gpuarray/extension.h>'
,
'<cuda.h>'
]
def
c_header_dirs
(
self
):
import
cuda_ndarray
ret
=
[
os
.
path
.
dirname
(
cuda_ndarray
.
__file__
)]
cuda_root
=
config
.
cuda
.
root
if
cuda_root
:
ret
.
append
(
os
.
path
.
join
(
cuda_root
,
'include'
))
return
ret
def
c_lib_dirs
(
self
):
import
cuda_ndarray
ret
=
[
os
.
path
.
dirname
(
cuda_ndarray
.
__file__
)]
cuda_root
=
config
.
cuda
.
root
if
cuda_root
:
ret
.
append
(
os
.
path
.
join
(
cuda_root
,
'lib'
))
return
ret
def
c_libraries
(
self
):
return
[
'cudart'
,
'cublas'
,
'cuda'
]
def
c_support_code
(
self
):
return
"""
CUcontext (*cuda_get_ctx)(void *ctx);
CUdeviceptr (*cuda_get_ptr)(gpudata *g);
"""
def
c_init_code
(
self
):
return
[
'cuda_get_ctx = (CUcontext (*)(void *ctx))gpuarray_get_extension("cuda_get_ctx");'
,
'cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))gpuarray_get_extension("cuda_get_ptr");'
]
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
return
"""
int
%(name)
serr = 0,
%(name)
si;
CUcontext
%(name)
scur;
cuCtxGetCurrent(&
%(name)
scur);
if (
%(name)
scur != cuda_get_ctx(pygpu_default_context()->ctx)) {
PyErr_SetString(PyExc_ValueError, "Ambient cuda context is not the same as output context.");
%(fail)
s
}
if (GpuArray_sync(&
%(inp)
s->ga) != GA_NO_ERROR) {
PyErr_SetString(PyExc_RuntimeError, "Could not sync GpuArray");
%(fail)
s
}
Py_XDECREF(
%(out)
s);
%(out)
s = (CudaNdarray *)CudaNdarray_new_nd(
%(inp)
s->ga.nd);
if (!
%(out)
s) {
%(fail)
s
}
for (
%(name)
si = 0;
%(name)
si <
%(inp)
s->ga.nd;
%(name)
si++) {
CudaNdarray_set_dim(
%(out)
s,
%(name)
si,
%(inp)
s->ga.dimensions[
%(name)
si]);
CudaNdarray_set_stride(
%(out)
s,
%(name)
si,
%(inp)
s->ga.strides[
%(name)
si]/4);
}
%(name)
serr = CudaNdarray_set_device_data(
%(out)
s,
(float *)(((char *)cuda_get_ptr(
%(inp)
s->ga.data))+
%(inp)
s->ga.offset),
(PyObject *)
%(inp)
s);
if (
%(name)
serr) {
%(fail)
s
}
"""
%
{
'name'
:
name
,
'inp'
:
inputs
[
0
],
'out'
:
outputs
[
0
],
'fail'
:
sub
[
'fail'
]}
def
c_code_cache_version
(
self
):
return
(
3
,)
cuda_from_gpu
=
CudaFromGpu
()
gpu_from_host
=
GpuFromHost
()
class
GpuAlloc
(
HideC
,
Alloc
):
...
...
@@ -572,7 +344,7 @@ class GpuAlloc(HideC, Alloc):
value is always 0, so the c code call memset as it is faster.
"""
__props__
=
(
'memset_0'
,)
_f16_ok
=
True
...
...
@@ -592,7 +364,7 @@ class GpuAlloc(HideC, Alloc):
sh
,
bcast
=
self
.
validate_shape
(
shape
)
if
value
.
ndim
>
len
(
sh
):
TypeError
(
"The GpuAlloc value to use has more dimensions "
"than the specified shape"
,
v
.
ndim
,
len
(
sh
))
"than the specified shape"
,
v
alue
.
ndim
,
len
(
sh
))
otype
=
value
.
type
.
clone
(
broadcastable
=
bcast
)
return
Apply
(
self
,
[
value
]
+
sh
,
[
otype
()])
...
...
@@ -686,29 +458,28 @@ class GpuAlloc(HideC, Alloc):
return
(
2
,)
def
do_constant_folding
(
self
,
node
):
from
.
import
subtensor
,
blas
for
client
in
node
.
outputs
[
0
]
.
clients
:
if
client
[
0
]
==
'output'
:
# If the output is a constant, it will have to be deepcopied
# each time the function is called. So we do not fold.
return
False
elif
(
# The following ops work inplace of their input id 0.
client
[
1
]
==
0
and
isinstance
(
client
[
0
]
.
op
,
(
# Ops that will work inplace on the Alloc. So if they
# get constant_folded, they would copy the
# constant and this is less efficients.
# Not doing the constant folding could also lower
# the peak memory usage, as we the "constant" won't
# always exists.
# theano.tensor.subtensor.AdvancedIncSubtensor,
theano
.
sandbox
.
gpuarray
.
subtensor
.
GpuIncSubtensor
,
theano
.
sandbox
.
gpuarray
.
subtensor
.
GpuAdvancedIncSubtensor1
,
theano
.
sandbox
.
gpuarray
.
subtensor
.
GpuAdvancedIncSubtensor1_dev20
,
theano
.
sandbox
.
gpuarray
.
blas
.
GpuGemm
,
theano
.
sandbox
.
gpuarray
.
blas
.
GpuGemv
,
theano
.
sandbox
.
gpuarray
.
blas
.
GpuGer
,
))):
# The following ops work inplace of their input id 0.
elif
(
client
[
1
]
==
0
and
# Ops that will work inplace on the Alloc. So if they
# get constant_folded, they would copy the
# constant and this is less efficients.
# Not doing the constant folding could also lower
# the peak memory usage, as we the "constant" won't
# always exists.
isinstance
(
client
[
0
]
.
op
,
(
subtensor
.
GpuIncSubtensor
,
subtensor
.
GpuAdvancedIncSubtensor1
,
subtensor
.
GpuAdvancedIncSubtensor1_dev20
,
blas
.
GpuGemm
,
blas
.
GpuGemv
,
blas
.
GpuGer
)
)):
return
False
# If the clients is a transfer, we don't want to fold. We
# let the moving opt finish before deciding what to do.
...
...
@@ -795,7 +566,7 @@ class GpuContiguous(Op):
"""
Always return a c contiguous output. Copy the input only if it is
not already c contiguous.
"""
__props__
=
()
...
...
@@ -980,7 +751,7 @@ class GpuJoin(HideC, Join):
node
=
Join
.
make_node
(
self
,
axis
,
*
tensors
)
return
Apply
(
self
,
[
node
.
inputs
[
0
]]
+
list
(
map
(
as_gpuarray_variable
,
tensors
)),
tensors
)),
[
GpuArrayType
(
broadcastable
=
node
.
outputs
[
0
]
.
broadcastable
,
dtype
=
node
.
outputs
[
0
]
.
dtype
)()])
...
...
@@ -1089,8 +860,7 @@ KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
code
=
code
,
name
=
"k"
,
params
=
[
gpuarray
.
GpuArray
,
gpuarray
.
SIZE
,
gpuarray
.
SIZE
],
flags
=
Kernel
.
get_flags
(
self
.
dtype
),
objvar
=
'k_eye_'
+
name
,
)]
objvar
=
'k_eye_'
+
name
)]
def
c_code
(
self
,
node
,
name
,
inp
,
out
,
sub
):
n
,
m
=
inp
...
...
theano/sandbox/gpuarray/conv.py
浏览文件 @
645557f9
...
...
@@ -5,17 +5,15 @@ import theano
from
theano
import
config
,
gof
try
:
import
pygpu
from
pygpu
import
gpuarray
except
ImportError
:
pass
from
six.moves
import
reduce
from
.comp
import
NVCC_compiler
from
.type
import
GpuArrayType
from
.basic_ops
import
(
as_gpuarray_variable
,
GpuKernelBase
,
Kernel
)
from
.basic_ops
import
as_gpuarray_variable
,
GpuKernelBase
,
Kernel
from
theano.gof
import
utils
class
GpuConv
(
GpuKernelBase
,
gof
.
Op
):
"""
Implement the batched and stacked 2d convolution on the gpu.
...
...
@@ -70,19 +68,19 @@ class GpuConv(GpuKernelBase, gof.Op):
raise
ValueError
(
mode
)
def
__init__
(
self
,
border_mode
,
subsample
=
(
1
,
1
),
logical_img_hw
=
None
,
logical_kern_hw
=
None
,
logical_kern_align_top
=
True
,
version
=-
1
,
direction_hint
=
None
,
verbose
=
0
,
kshp
=
None
,
imshp
=
None
,
max_threads_dim0
=
None
,
nkern
=
None
,
bsize
=
None
,
fft_opt
=
True
):
subsample
=
(
1
,
1
),
logical_img_hw
=
None
,
logical_kern_hw
=
None
,
logical_kern_align_top
=
True
,
version
=-
1
,
direction_hint
=
None
,
verbose
=
0
,
kshp
=
None
,
imshp
=
None
,
max_threads_dim0
=
None
,
nkern
=
None
,
bsize
=
None
,
fft_opt
=
True
):
self
.
border_mode
=
border_mode
self
.
subsample
=
subsample
if
logical_img_hw
is
not
None
:
...
...
@@ -182,7 +180,7 @@ class GpuConv(GpuKernelBase, gof.Op):
def
flops
(
self
,
inputs
,
outputs
):
"""
Useful with the hack in profilemode to print the MFlops.
"""
images
,
kerns
=
inputs
out
,
=
outputs
...
...
@@ -227,32 +225,14 @@ class GpuConv(GpuKernelBase, gof.Op):
nb
=
0
if
self
.
kshp
is
not
None
:
nb
=
self
.
kshp
[
1
]
return
[
'-DTHEANO_KERN_WID='
+
str
(
nb
)]
# ,'-g','-G']
return
[
'-DTHEANO_KERN_WID='
+
str
(
nb
)]
def
c_headers
(
self
):
if
pygpu
.
get_default_context
()
.
kind
==
'opencl'
:
raise
MethodNotDefined
(
'cuda only'
)
return
[
'<stdint.h>'
,
'<stdio.h>'
,
'cuda.h'
,
'<gpuarray/extension.h>'
,
'<numpy_compat.h>'
,
'<gpuarray/ext_cuda.h>'
,
'<gpuarray/types.h>'
]
def
c_header_dirs
(
self
):
if
pygpu
.
get_default_context
()
.
kind
==
'opencl'
:
raise
MethodNotDefined
(
'cuda only'
)
cuda_root
=
config
.
cuda
.
root
if
cuda_root
:
return
[
os
.
path
.
join
(
cuda_root
,
'include'
)]
else
:
return
[]
return
[
'<stdio.h>'
,
'<numpy_compat.h>'
,
'<gpuarray/types.h>'
]
def
c_code_cache_version
(
self
):
# raise this whenever modifying any of the support_code_files
return
(
0
,
21
)
def
c_init_code
(
self
):
if
pygpu
.
get_default_context
()
.
kind
==
'opencl'
:
raise
MethodNotDefined
(
'cuda only'
)
return
[
'setup_ext_cuda();'
]
return
(
0
,
22
)
def
c_code
(
self
,
node
,
nodename
,
inp
,
out_
,
sub
):
img
,
kern
=
inp
...
...
theano/sandbox/gpuarray/dnn.py
浏览文件 @
645557f9
...
...
@@ -26,10 +26,7 @@ from .conv import GpuConv
# GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
from
.nnet
import
GpuSoftmax
from
.opt
import
gpu_seqopt
,
register_opt
,
conv_groupopt
,
op_lifter
from
.opt_util
import
alpha_merge
,
output_merge
# We need to import this to define the flags.
from
theano.sandbox
import
dnn_flags
# noqa
from
.opt_util
import
alpha_merge
,
output_merge
,
inplace_allocempty
def
dnn_available
():
...
...
@@ -50,7 +47,6 @@ def dnn_available():
dnn_available
.
avail
=
False
preambule
=
"""
#include <stdio.h>
#include <cuda.h>
#include <cudnn.h>
#include <cudnn_helper.h>
"""
...
...
@@ -64,15 +60,18 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
return 1;
}
"""
params
=
[
"-l"
,
"cudnn"
,
"-I"
+
os
.
path
.
dirname
(
__file__
)]
if
config
.
dnn
.
include_path
:
params
.
append
(
"-I"
+
config
.
dnn
.
include_path
)
if
config
.
dnn
.
library_path
:
params
.
append
(
"-L"
+
config
.
dnn
.
library_path
)
# Do not run here the test program. It would run on the
# default gpu, not the one selected by the user. If mixed
# GPU are installed or if the GPUs are configured in
# exclusive mode, this cause bad detection.
comp
,
out
,
err
=
GCC_compiler
.
try_flags
(
[
"-l"
,
"cudnn"
,
"-I"
+
os
.
path
.
dirname
(
__file__
),
"-I"
+
config
.
dnn
.
include_path
,
"-L"
+
config
.
dnn
.
library_path
],
preambule
=
preambule
,
body
=
body
,
params
,
preambule
=
preambule
,
body
=
body
,
try_run
=
False
,
output
=
True
)
dnn_available
.
avail
=
comp
...
...
@@ -1242,86 +1241,62 @@ conv_groupopt.register('local_conv_dnn', local_conv_dnn, 20,
'conv_dnn'
,
'fast_compile'
,
'fast_run'
,
'cudnn'
)
@local_optimizer
([
GpuDnnConv
],
inplace
=
True
)
def
local_dnn_conv_inplace
(
node
):
if
type
(
node
.
op
)
!=
GpuDnnConv
or
node
.
op
.
inplace
:
return
inputs
=
list
(
node
.
inputs
)
dest
=
inputs
[
2
]
if
(
dest
.
owner
and
isinstance
(
dest
.
owner
.
op
,
GpuAllocEmpty
)
and
len
(
dest
.
clients
)
>
1
):
inputs
[
2
]
=
GpuAllocEmpty
(
dest
.
owner
.
op
.
dtype
)(
*
dest
.
owner
.
inputs
)
@inplace_allocempty
(
GpuDnnConv
,
2
)
def
local_dnn_conv_inplace
(
node
,
inputs
):
return
[
GpuDnnConv
(
algo
=
node
.
op
.
algo
,
inplace
=
True
)(
*
inputs
)]
@local_optimizer
([
GpuDnnConvGradW
],
inplace
=
True
)
def
local_dnn_convgw_inplace
(
node
):
if
type
(
node
.
op
)
!=
GpuDnnConvGradW
or
node
.
op
.
inplace
:
return
inputs
=
list
(
node
.
inputs
)
dest
=
inputs
[
2
]
if
(
dest
.
owner
and
isinstance
(
dest
.
owner
.
op
,
GpuAllocEmpty
)
and
len
(
dest
.
clients
)
>
1
):
inputs
[
2
]
=
GpuAllocEmpty
(
dest
.
owner
.
op
.
dtype
)(
*
dest
.
owner
.
inputs
)
@inplace_allocempty
(
GpuDnnConvGradW
,
2
)
def
local_dnn_convgw_inplace
(
node
,
inputs
):
return
[
GpuDnnConvGradW
(
algo
=
node
.
op
.
algo
,
inplace
=
True
)(
*
inputs
)]
@local_optimizer
([
GpuDnnConvGradI
],
inplace
=
True
)
def
local_dnn_convgi_inplace
(
node
):
if
type
(
node
.
op
)
!=
GpuDnnConvGradI
or
node
.
op
.
inplace
:
return
inputs
=
list
(
node
.
inputs
)
dest
=
inputs
[
2
]
if
(
dest
.
owner
and
isinstance
(
dest
.
owner
.
op
,
GpuAllocEmpty
)
and
len
(
dest
.
clients
)
>
1
):
inputs
[
2
]
=
GpuAllocEmpty
(
dest
.
owner
.
op
.
dtype
)(
*
dest
.
owner
.
inputs
)
@inplace_allocempty
(
GpuDnnConvGradI
,
2
)
def
local_dnn_convgi_inplace
(
node
,
inputs
):
return
[
GpuDnnConvGradI
(
algo
=
node
.
op
.
algo
,
inplace
=
True
)(
*
inputs
)]
optdb
.
register
(
'local_dnna_conv_inplace'
,
tensor
.
opt
.
in2out
(
local_dnn_conv_inplace
,
local_dnn_convgw_inplace
,
local_dnn_convgi_inplace
,
name
=
"local_dnn_conv_inplace"
),
name
=
"local_dnn
a
_conv_inplace"
),
70.0
,
'fast_run'
,
'inplace'
,
'gpuarray'
,
'cudnn'
)
@register_opt
(
'cudnn'
)
@alpha_merge
(
GpuDnnConv
,
alpha_in
=
4
,
beta_in
=
5
,
nd
=
4
)
@alpha_merge
(
GpuDnnConv
,
alpha_in
=
4
,
beta_in
=
5
)
def
local_dnn_conv_alpha_merge
(
node
,
*
inputs
):
return
[
GpuDnnConv
(
algo
=
node
.
op
.
algo
)(
*
inputs
)]
@register_opt
(
'cudnn'
)
@alpha_merge
(
GpuDnnConvGradW
,
alpha_in
=
4
,
beta_in
=
5
,
nd
=
4
)
@alpha_merge
(
GpuDnnConvGradW
,
alpha_in
=
4
,
beta_in
=
5
)
def
local_dnn_convw_alpha_merge
(
node
,
*
inputs
):
return
[
GpuDnnConvGradW
(
algo
=
node
.
op
.
algo
)(
*
inputs
)]
@register_opt
(
'cudnn'
)
@alpha_merge
(
GpuDnnConvGradI
,
alpha_in
=
4
,
beta_in
=
5
,
nd
=
4
)
@alpha_merge
(
GpuDnnConvGradI
,
alpha_in
=
4
,
beta_in
=
5
)
def
local_dnn_convi_alpha_merge
(
node
,
*
inputs
):
return
[
GpuDnnConvGradI
(
algo
=
node
.
op
.
algo
)(
*
inputs
)]
@register_opt
(
'cudnn'
)
@output_merge
(
GpuDnnConv
,
alpha_in
=
4
,
beta_in
=
5
,
out_in
=
2
,
nd
=
4
)
@output_merge
(
GpuDnnConv
,
alpha_in
=
4
,
beta_in
=
5
,
out_in
=
2
)
def
local_dnn_conv_output_merge
(
node
,
*
inputs
):
inputs
=
inputs
[
0
:
2
]
+
(
gpu_contiguous
(
inputs
[
2
]),)
+
inputs
[
3
:]
return
[
GpuDnnConv
(
algo
=
node
.
op
.
algo
)(
*
inputs
)]
@register_opt
(
'cudnn'
)
@output_merge
(
GpuDnnConvGradW
,
alpha_in
=
4
,
beta_in
=
5
,
out_in
=
2
,
nd
=
4
)
@output_merge
(
GpuDnnConvGradW
,
alpha_in
=
4
,
beta_in
=
5
,
out_in
=
2
)
def
local_dnn_convw_output_merge
(
node
,
*
inputs
):
inputs
=
inputs
[
0
:
2
]
+
(
gpu_contiguous
(
inputs
[
2
]),)
+
inputs
[
3
:]
return
[
GpuDnnConvGradW
(
algo
=
node
.
op
.
algo
)(
*
inputs
)]
@register_opt
(
'cudnn'
)
@output_merge
(
GpuDnnConvGradI
,
alpha_in
=
4
,
beta_in
=
5
,
out_in
=
2
,
nd
=
4
)
@output_merge
(
GpuDnnConvGradI
,
alpha_in
=
4
,
beta_in
=
5
,
out_in
=
2
)
def
local_dnn_convi_output_merge
(
node
,
*
inputs
):
inputs
=
inputs
[
0
:
2
]
+
(
gpu_contiguous
(
inputs
[
2
]),)
+
inputs
[
3
:]
return
[
GpuDnnConvGradI
(
algo
=
node
.
op
.
algo
)(
*
inputs
)]
...
...
theano/sandbox/gpuarray/elemwise.py
浏览文件 @
645557f9
from
__future__
import
print_function
import
copy
import
os
from
theano.compat
import
izip
import
numpy
import
theano
from
theano
import
Apply
,
scalar
,
config
from
theano
import
scalar
as
scal
from
six.moves
import
StringIO
,
xrange
from
theano.gof.utils
import
MethodNotDefined
from
theano.gof.cmodule
import
GCC_compiler
from
theano.scalar
import
Scalar
from
theano.tensor.elemwise
import
(
Elemwise
,
DimShuffle
,
CAReduceDtype
)
...
...
@@ -94,7 +91,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
try
:
support_code
=
self
.
scalar_op
.
c_support_code
()
if
(
support_code
.
strip
()
!=
"#define THEANO_MACRO_MOD(x,y) (x
%
y)"
and
support_code
.
strip
()
!=
""
):
support_code
.
strip
()
!=
""
):
# The macro is fine, the C++ struct is not.
raise
SupportCodeError
(
support_code
)
except
MethodNotDefined
:
...
...
@@ -108,7 +105,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
scal_v_ins
=
[
scalar
.
get_scalar_type
(
i
.
dtype
)
for
i
in
node
.
inputs
]
outs
=
[
make_argument
(
o
,
'o
%
d'
%
(
n
,))
for
n
,
o
in
enumerate
(
node
.
outputs
)
if
n
ot
n
in
self
.
inplace_pattern
]
enumerate
(
node
.
outputs
)
if
n
not
in
self
.
inplace_pattern
]
scal_v_outs
=
[
scalar
.
get_scalar_type
(
o
.
dtype
)
for
o
in
node
.
outputs
]
fake_node
=
Apply
(
self
.
scalar_op
,
[
i
()
for
i
in
scal_v_ins
],
...
...
@@ -132,7 +129,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
else
:
scal_out
.
append
(
arg
.
name
+
'[i]'
)
kop
=
self
.
scalar_op
.
c_code
(
fake_node
,
nodename
+
'_scalar'
,
kop
=
self
.
scalar_op
.
c_code
(
fake_node
,
nodename
+
'_scalar'
,
scal_in
,
scal_out
,
dict
(
fail
=
'return;'
))
...
...
@@ -169,27 +166,12 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
(
"npy_float16"
,
"ga_half"
),
(
"npy_float32"
,
"ga_float"
),
(
"npy_float64"
,
"ga_double"
),
]:
]:
kop
=
kop
.
replace
(
npy
,
ga
)
return
ElemwiseKernel
(
None
,
inps
+
outs
,
kop
,
preamble
=
support_code
)
def
c_header_dirs
(
self
):
if
pygpu
.
get_default_context
()
.
kind
==
'opencl'
:
raise
MethodNotDefined
(
'cuda only'
)
cuda_root
=
config
.
cuda
.
root
if
cuda_root
:
return
[
os
.
path
.
join
(
cuda_root
,
'include'
)]
else
:
return
[]
def
c_compiler
(
self
):
return
GCC_compiler
return
ElemwiseKernel
(
None
,
inps
+
outs
,
kop
,
preamble
=
support_code
)
def
c_headers
(
self
):
if
pygpu
.
get_default_context
()
.
kind
==
'opencl'
:
raise
MethodNotDefined
(
'cuda only'
)
return
[
'cuda.h'
,
'<gpuarray/extension.h>'
,
'<numpy_compat.h>'
,
'<gpuarray/ext_cuda.h>'
,
'<gpuarray/types.h>'
]
return
[
'<numpy_compat.h>'
,
'<gpuarray/types.h>'
]
def
c_support_code
(
self
):
return
self
.
scalar_op
.
c_support_code
()
...
...
@@ -231,11 +213,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
node
.
outputs
[
0
]
.
type
.
dtype
),
objvar
=
'elem_
%
d_
%
s'
%
(
nd
,
nodename
))]
def
c_init_code
(
self
):
if
pygpu
.
get_default_context
()
.
kind
==
'opencl'
:
raise
MethodNotDefined
(
'cuda only'
)
return
[
'setup_ext_cuda();'
]
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
if
pygpu
.
get_default_context
()
.
kind
==
'opencl'
:
raise
MethodNotDefined
(
'cuda only'
)
...
...
@@ -399,7 +376,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
param
.
append
(
"(void *)&
%(z)
s->ga.dimensions[
%(i)
d]"
%
dict
(
z
=
outputs
[
0
],
i
=
i
))
for
n
,
(
name
,
var
)
in
enumerate
(
zip
(
inputs
+
outputs
,
node
.
inputs
+
node
.
outputs
)):
node
.
inputs
+
node
.
outputs
)):
if
(
n
-
len
(
inputs
))
in
self
.
inplace_pattern
:
continue
dtype
=
dtype_to_ctype
(
var
.
dtype
)
...
...
@@ -417,7 +394,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
GpuKernel_error(&
%(kname)
s, err));
%(fail)
s;
}
"""
%
dict
(
kname
=
kname
,
fail
=
fail
)
"""
%
dict
(
kname
=
kname
,
fail
=
fail
)
if
config
.
gpuarray
.
sync
:
code
+=
"""
err = GpuArray_sync(&
%(z)
s->ga);
...
...
@@ -460,7 +437,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
def
c_code_cache_version
(
self
):
ver
=
self
.
scalar_op
.
c_code_cache_version
()
if
ver
:
return
(
3
,
ver
)
return
(
4
,
ver
)
else
:
return
ver
...
...
@@ -495,7 +472,7 @@ class GpuDimShuffle(HideC, DimShuffle):
res
=
input
res
=
res
.
transpose
(
self
.
shuffle
+
self
.
drop
)
res
=
res
.
transpose
(
self
.
shuffle
+
self
.
drop
)
shape
=
list
(
res
.
shape
[:
len
(
self
.
shuffle
)])
for
augm
in
self
.
augment
:
...
...
@@ -533,7 +510,7 @@ class GpuDimShuffle(HideC, DimShuffle):
Py_DECREF(tmp);
return res;
}
"""
%
dict
(
shuffle
=
', '
.
join
(
str
(
a
)
for
a
in
(
self
.
shuffle
+
self
.
drop
)),
"""
%
dict
(
shuffle
=
', '
.
join
(
str
(
a
)
for
a
in
(
self
.
shuffle
+
self
.
drop
)),
name
=
name
,
nd_out
=
len
(
self
.
new_order
),
copy_shape
=
copy_shape
(
len
(
self
.
new_order
)))
...
...
@@ -565,7 +542,7 @@ class GpuDimShuffle(HideC, DimShuffle):
return
process
def
c_code_cache_version
(
self
):
return
(
4
,)
return
(
5
,)
class
GpuCAReduceCuda
(
GpuKernelBase
,
HideC
,
CAReduceDtype
):
...
...
@@ -581,7 +558,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
pre_scalar_op
If present, must be a scalar op with only 1 input. We will execute it
on the input value before reduction.
Examples
--------
When scalar_op is a theano.scalar.basic.Add instance:
...
...
@@ -671,8 +648,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
if
self
.
pre_scalar_op
:
# Currently we only tested pre_scalar_op that don't cause
# upcast.
d1
=
self
.
__class__
(
scalar_op
=
self
.
scalar_op
)(
Elemwise
(
self
.
pre_scalar_op
)(
x
))
assert
d1
.
dtype
==
ret
.
outputs
[
0
]
.
dtype
assert
Elemwise
(
self
.
pre_scalar_op
)(
x
)
.
dtype
==
x
.
dtype
if
self
.
reduce_mask
is
None
:
if
self
.
axis
is
None
:
...
...
@@ -687,8 +662,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
if
(
x
.
type
.
ndim
!=
len
(
self
.
reduce_mask
)):
raise
TypeError
(
"x must have rank
%
i"
%
len
(
self
.
reduce_mask
))
if
(
"complex"
in
x
.
dtype
or
"complex"
in
ret
.
outputs
[
0
]
.
dtype
or
"complex"
in
self
.
_acc_dtype
(
x
.
dtype
)):
"complex"
in
ret
.
outputs
[
0
]
.
dtype
or
"complex"
in
self
.
_acc_dtype
(
x
.
dtype
)):
raise
NotImplementedError
(
"We don't support complex in gpu reduction"
)
return
Apply
(
self
,
[
x
],
[
GpuArrayType
(
ret
.
outputs
[
0
]
.
dtype
,
ret
.
outputs
[
0
]
.
type
.
broadcastable
)()])
...
...
@@ -732,17 +707,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
return
False
return
True
def
c_header_dirs
(
self
):
cuda_root
=
config
.
cuda
.
root
if
cuda_root
:
return
[
os
.
path
.
join
(
cuda_root
,
'include'
)]
def
c_headers
(
self
):
return
[
'cuda.h'
,
'<gpuarray/extension.h>'
,
'<numpy_compat.h>'
,
'<gpuarray/ext_cuda.h>'
,
'<gpuarray/types.h>'
]
def
c_init_code
(
self
):
return
[
'setup_ext_cuda();'
]
return
[
'<numpy_compat.h>'
,
'<gpuarray/types.h>'
]
def
c_code
(
self
,
node
,
name
,
inp
,
out
,
sub
):
x
,
=
inp
...
...
@@ -760,6 +726,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
sio
=
StringIO
()
fail
=
sub
[
'fail'
]
ctx
=
sub
[
'context'
]
# check input
print
(
"""
...
...
@@ -824,8 +791,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
Py_XDECREF(
%(z)
s);
%(z)
s = pygpu_empty(
%(nd_out)
s, new_dims,
%(out_typecode)
s, GA_C_ORDER,
pygpu_default_context(),
Py_None);
pygpu_default_context(), Py_None);
if (NULL ==
%(z)
s)
{
PyErr_Format(PyExc_RuntimeError, "Failed to allocate output");
...
...
@@ -863,14 +829,16 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
# check if the tensor is ccontiguous, if true, use the c_code_reduce_ccontig code.
# TODO: check if we are ccontiguous when we un-dimshuffle
# TODO: if only some dims are ccontiguous, call version with less dims.
print
(
'if(
%(x)
s->ga.flags & GA_C_CONTIGUOUS){'
%
locals
(),
file
=
sio
)
print
(
'if(
%(x)
s->ga.flags & GA_C_CONTIGUOUS){'
%
locals
(),
file
=
sio
)
self
.
c_code_reduce_ccontig
(
sio
,
node
,
name
,
x
,
z
,
fail
)
print
(
"}else{"
,
file
=
sio
)
getattr
(
self
,
'c_code_reduce_
%
s'
%
(
''
.
join
(
str
(
i
)
for
i
in
self
.
reduce_mask
)))(
sio
,
node
,
name
,
x
,
z
,
fail
)
getattr
(
self
,
'c_code_reduce_
%
s'
%
(
''
.
join
(
str
(
i
)
for
i
in
self
.
reduce_mask
)))(
sio
,
node
,
name
,
x
,
z
,
fail
)
print
(
"}"
,
file
=
sio
)
else
:
getattr
(
self
,
'c_code_reduce_
%
s'
%
(
''
.
join
(
getattr
(
self
,
'c_code_reduce_
%
s'
%
(
''
.
join
(
str
(
i
)
for
i
in
self
.
reduce_mask
)))(
sio
,
node
,
name
,
x
,
z
,
fail
)
# \end bracket the reduction ...
...
...
@@ -1094,8 +1062,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
else
:
assert
isinstance
(
self
.
scalar_op
,
(
scal
.
Maximum
,
scal
.
Minimum
))
if
self
.
pre_scalar_op
:
# TODO
, multi_dtype!
#dtype = node.inputs[0].dtype
if
self
.
pre_scalar_op
:
# TODO
: multiple dtypes
#
dtype = node.inputs[0].dtype
dtype
=
'float32'
dummy_var
=
scal
.
Scalar
(
dtype
=
dtype
)()
...
...
@@ -1171,7 +1139,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
Parameters
----------
node, name, sub
node, name, sub
These should be passed through from the original call to c_code.
"""
...
...
@@ -1411,7 +1379,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
def
c_code_reduce_01X
(
self
,
sio
,
node
,
name
,
x
,
z
,
fail
,
N
):
"""
Parameters
----------
N
...
...
@@ -1943,12 +1911,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
"""
%
locals
(),
file
=
sio
)
def
c_code_cache_version_apply
(
self
,
node
):
version
=
[
1
6
]
# the version corresponding to the c code in this Op
version
=
[
1
7
]
# the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend...
scalar_node
=
Apply
(
self
.
scalar_op
,
[
Scalar
(
dtype
=
input
.
type
.
dtype
)()
for
input
in
node
.
inputs
],
[
Scalar
(
dtype
=
output
.
type
.
dtype
)()
for
output
in
node
.
outputs
])
version
.
extend
(
self
.
scalar_op
.
c_code_cache_version
())
for
i
in
node
.
inputs
+
node
.
outputs
:
version
.
extend
(
Scalar
(
dtype
=
i
.
type
.
dtype
)
.
c_code_cache_version
())
...
...
@@ -1962,7 +1927,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
in_dtype
=
node
.
inputs
[
0
]
.
dtype
out_dtype
=
node
.
outputs
[
0
]
.
dtype
acc_dtype
=
self
.
_acc_dtype
(
node
.
inputs
[
0
]
.
dtype
)
flags
=
Kernel
.
get_flags
(
in_dtype
,
acc_dtype
,
out_dtype
)
flags
=
Kernel
.
get_flags
(
in_dtype
,
acc_dtype
,
out_dtype
)
in_type
=
gpuarray
.
dtype_to_ctype
(
in_dtype
)
out_type
=
gpuarray
.
dtype_to_ctype
(
out_dtype
)
acc_type
=
gpuarray
.
dtype_to_ctype
(
acc_dtype
)
...
...
@@ -2106,10 +2071,10 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
]
kernels
.
append
(
Kernel
(
code
=
sio
.
getvalue
(),
name
=
kname
,
params
=
params
,
flags
=
flags
,
objvar
=
k_var
))
#01, 011, 0111
#
01, 011, 0111
if
(
0
==
self
.
reduce_mask
[
0
]
and
all
(
self
.
reduce_mask
[
1
:])
and
nd_in
in
[
2
,
3
,
4
]):
all
(
self
.
reduce_mask
[
1
:])
and
nd_in
in
[
2
,
3
,
4
]):
# this kernel uses one block for each row.
# threads per block for each element per row.
...
...
@@ -2303,10 +2268,10 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
# this kernel uses one block for multiple column(up to 32TODO),
# threads per block for each element per column.
# thread.x = dim 2 contiguous
# thread.y = dim 1
# block.x = dim 0
# block.y = dim 1 rest
# thread.x = dim 2 contiguous
# thread.y = dim 1
# block.x = dim 0
# block.y = dim 1 rest
init
=
self
.
_k_init
(
node
,
nodename
)
decl
,
kname
,
params
,
k_var
=
self
.
_k_decl
(
node
,
nodename
,
pattern
=
"010_inner"
)
reducebuf
=
self
.
_k_reduce_buf_multiple
(
'Z[i0 * sZ0 + i2*sZ1]'
,
...
...
@@ -2515,7 +2480,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
kernels
.
append
(
Kernel
(
code
=
sio
.
getvalue
(),
name
=
kname
,
params
=
params
,
flags
=
flags
,
objvar
=
k_var
))
if
self
.
reduce_mask
==
(
0
,
0
,
1
,
1
):
# this kernel uses one block for each row,
# this kernel uses one block for each row,
# threads per block for each element per row.
reducebuf
=
self
.
_k_reduce_buf
(
'Z[i0 * sZ0 + i1 * sZ1]'
,
node
,
nodename
,
sub
=
{})
...
...
@@ -2625,7 +2590,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
{},
True
)
reduce_init
=
self
.
_assign_init
(
load_in
+
"(A[blockIdx.x * sA1])"
)
kname
=
"kernel_reduce_1011"
k_var
=
"kernel_reduce_1011_"
+
nodename
k_var
=
"kernel_reduce_1011_"
+
nodename
sio
=
StringIO
()
print
(
"""
KERNEL void
%(kname)
s(
...
...
@@ -2712,7 +2677,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
# cache the kernel object
self
.
get_kernel_cache
(
node
)
return
super
(
GpuCAReduceCPY
,
self
)
.
make_thunk
(
node
,
storage_map
,
compute_map
,
no_recycling
)
compute_map
,
no_recycling
)
def
get_kernel_cache
(
self
,
node
):
attr
=
'@cache_reduction_k'
...
...
@@ -2753,7 +2718,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
flags
=
Kernel
.
get_flags
(
node
.
inputs
[
0
]
.
type
.
dtype
,
acc_dtype
,
node
.
outputs
[
0
]
.
type
.
dtype
),
objvar
=
'k_reduk_'
+
name
)]
objvar
=
'k_reduk_'
+
name
)]
def
c_code
(
self
,
node
,
name
,
inp
,
out
,
sub
):
if
not
any
(
getattr
(
self
,
'redux'
,
[
node
.
inputs
[
0
]
.
ndim
!=
0
])):
...
...
@@ -2768,8 +2733,8 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
if (
%(sync)
d)
GpuArray_sync(&
%(out)
s->ga);
"""
%
dict
(
out
=
out
[
0
],
inp
=
inp
[
0
],
fail
=
sub
[
'fail'
],
sync
=
bool
(
config
.
gpuarray
.
sync
))
"""
%
dict
(
out
=
out
[
0
],
inp
=
inp
[
0
],
fail
=
sub
[
'fail'
],
sync
=
bool
(
config
.
gpuarray
.
sync
))
k
=
self
.
get_kernel_cache
(
node
)
_
,
src
,
_
,
ls
=
k
.
_get_basic_kernel
(
k
.
init_local_size
,
node
.
inputs
[
0
]
.
ndim
)
...
...
@@ -2816,8 +2781,8 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
%(fail)
s
}
}
"""
%
dict
(
output
=
output
,
nd_out
=
nd_out
,
fail
=
sub
[
'fail'
],
out_type
=
dtype_to_typecode
(
node
.
outputs
[
0
]
.
type
.
dtype
))
"""
%
dict
(
output
=
output
,
nd_out
=
nd_out
,
fail
=
sub
[
'fail'
],
out_type
=
dtype_to_typecode
(
node
.
outputs
[
0
]
.
type
.
dtype
))
else
:
code
+=
"""
if (
%(output)
s == NULL ||
%(output)
s->ga.nd != 0) {
...
...
@@ -2828,8 +2793,8 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
%(fail)
s
}
}
"""
%
dict
(
output
=
output
,
fail
=
sub
[
'fail'
],
out_type
=
dtype_to_typecode
(
node
.
outputs
[
0
]
.
type
.
dtype
))
"""
%
dict
(
output
=
output
,
fail
=
sub
[
'fail'
],
out_type
=
dtype_to_typecode
(
node
.
outputs
[
0
]
.
type
.
dtype
))
if
acc_dtype
!=
node
.
outputs
[
0
]
.
type
.
dtype
:
code
+=
"""
...
...
@@ -2837,12 +2802,13 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
%(acc_type)
s, GA_C_ORDER, pygpu_default_context(),
Py_None);
if (!tmp)
%(fail)
s
"""
%
dict
(
output
=
output
,
fail
=
sub
[
'fail'
],
acc_type
=
dtype_to_typecode
(
acc_dtype
))
"""
%
dict
(
output
=
output
,
fail
=
sub
[
'fail'
],
acc_type
=
dtype_to_typecode
(
acc_dtype
))
else
:
code
+=
"""
tmp =
%(output)
s;
Py_INCREF(tmp);
"""
%
dict
(
output
=
output
)
"""
%
dict
(
output
=
output
)
# We need the proxies since we are passing a pointer to the
# data into the call and therefore we need a real copy of the
...
...
@@ -2850,7 +2816,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
code
+=
"""
args[0] = &n;
args[1] = tmp->ga.data;
"""
%
dict
(
output
=
output
)
"""
%
dict
(
output
=
output
)
p
=
2
for
i
in
range
(
node
.
inputs
[
0
]
.
ndim
):
...
...
@@ -2858,7 +2824,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
proxy_dim[
%(i)
s] =
%(input)
s->ga.dimensions[
%(i)
s];
args[
%(p)
s] = &proxy_dim[
%(i)
s];
n *=
%(input)
s->ga.dimensions[
%(i)
s];
"""
%
dict
(
i
=
i
,
p
=
p
,
input
=
input
)
"""
%
dict
(
i
=
i
,
p
=
p
,
input
=
input
)
p
+=
1
if
not
redux
[
i
]:
code
+=
"gs *=
%(input)
s->ga.dimensions[
%(i)
s];"
%
dict
(
input
=
input
,
i
=
i
)
...
...
@@ -2867,14 +2833,14 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
args[
%(p)
s] =
%(input)
s->ga.data;
proxy_off =
%(input)
s->ga.offset;
args[
%(p)
s+1] = &proxy_off;
"""
%
dict
(
p
=
p
,
input
=
input
)
"""
%
dict
(
p
=
p
,
input
=
input
)
p
+=
2
for
i
in
range
(
node
.
inputs
[
0
]
.
ndim
):
code
+=
"""
proxy_str[
%(i)
s] =
%(input)
s->ga.strides[
%(i)
s];
args[
%(p)
s] = &proxy_str[
%(i)
s];
"""
%
dict
(
p
=
p
,
i
=
i
,
input
=
input
)
"""
%
dict
(
p
=
p
,
i
=
i
,
input
=
input
)
p
+=
1
code
+=
"""
...
...
@@ -2911,14 +2877,14 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
%(fail)
s
}
}
"""
%
dict
(
k_var
=
'k_reduk_'
+
name
,
sync
=
bool
(
config
.
gpuarray
.
sync
),
ls
=
ls
,
fail
=
sub
[
'fail'
],
output
=
output
,
input
=
input
,
cast_out
=
bool
(
acc_dtype
!=
node
.
outputs
[
0
]
.
type
.
dtype
))
"""
%
dict
(
k_var
=
'k_reduk_'
+
name
,
sync
=
bool
(
config
.
gpuarray
.
sync
),
ls
=
ls
,
fail
=
sub
[
'fail'
],
output
=
output
,
input
=
input
,
cast_out
=
bool
(
acc_dtype
!=
node
.
outputs
[
0
]
.
type
.
dtype
))
return
code
def
c_code_cache_version
(
self
):
return
(
1
,
self
.
GpuKernelBase_version
)
return
(
2
,
self
.
GpuKernelBase_version
)
def
generate_kernel
(
self
,
node
,
odtype
,
redux
):
if
isinstance
(
self
.
scalar_op
,
scalar
.
basic
.
Add
):
...
...
@@ -2942,8 +2908,8 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
redux
=
self
.
redux
if
any
(
redux
):
output
[
0
]
=
self
.
get_kernel_cache
(
node
)(
input
)
.
astype
(
copy
=
False
,
dtype
=
node
.
outputs
[
0
]
.
type
.
dtype
)
output
[
0
]
=
self
.
get_kernel_cache
(
node
)(
input
)
.
astype
(
copy
=
False
,
dtype
=
node
.
outputs
[
0
]
.
type
.
dtype
)
else
:
output
[
0
]
=
pygpu
.
gpuarray
.
array
(
input
,
copy
=
True
,
dtype
=
node
.
outputs
[
0
]
.
type
.
dtype
)
...
...
theano/sandbox/gpuarray/kernel_codegen.py
浏览文件 @
645557f9
...
...
@@ -4,11 +4,11 @@ Helper routines for generating gpu kernels for nvcc.
"""
try
:
import
pygpu
from
pygpu
import
gpuarray
except
ImportError
:
pass
def
nvcc_kernel
(
name
,
params
,
body
):
"""
Return the c code of a kernel function.
...
...
@@ -174,16 +174,15 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
"""
ctype
=
gpuarray
.
dtype_to_ctype
(
dtype
)
return
[
# get max of buf (trashing all but buf[0])
inline_reduce_max
(
N
,
buf
,
threadPos
,
threadCount
),
# get max of buf (trashing all but buf[0])
return
[
inline_reduce_max
(
N
,
buf
,
threadPos
,
threadCount
),
'__syncthreads()'
,
(
'
%
s row_max = '
+
buf
+
'[0]'
)
%
ctype
,
'__syncthreads()'
,
'for(int __i='
+
threadPos
+
'; __i<'
+
N
+
'; __i+='
+
threadCount
+
'){'
,
buf
+
'[__i] = exp('
+
buf2
+
'[__i] - row_max)'
,
buf2
+
'[__i] = '
+
buf
+
'[__i]'
,
'; __i+='
+
threadCount
+
'){'
,
buf
+
'[__i] = exp('
+
buf2
+
'[__i] - row_max)'
,
buf2
+
'[__i] = '
+
buf
+
'[__i]'
,
'}'
,
'__syncthreads()'
,
inline_reduce_sum
(
N
,
buf
,
threadPos
,
threadCount
),
...
...
@@ -192,8 +191,8 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
'__syncthreads()'
,
# divide each exp() result by the sum to complete the job.
'for(int __i='
+
threadPos
+
'; __i<'
+
N
+
'; __i+='
+
threadCount
+
'){'
,
buf
+
'[__i] = '
+
buf2
+
'[__i] / row_sum'
,
'; __i+='
+
threadCount
+
'){'
,
buf
+
'[__i] = '
+
buf2
+
'[__i] / row_sum'
,
'}'
,
'__syncthreads()'
,
]
...
...
@@ -232,7 +231,7 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
Optional, the dtype of the output.
manner_fn
A function that accepts strings of arguments a and b, and returns c code
for their reduction.
for their reduction.
Example: return "
%(a)
s +
%(b)
s" for a sum reduction.
manner_init
A function that accepts strings of arguments a and return c code for its
...
...
@@ -259,7 +258,7 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
loop_line
=
manner_fn
(
"red"
,
manner_init
(
"
%(load_x)
s(
%(x)
s[i *
%(stride_x)
s])"
%
locals
()))
loop_line2
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[i]"
%
buf
)
"
%
s[i]"
%
buf
)
r_16
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+16]"
%
(
buf
,
pos
))
r_8
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+8]"
%
(
buf
,
pos
))
r_4
=
manner_fn
(
"
%
s[
%
s]"
%
(
buf
,
pos
),
"
%
s[
%
s+4]"
%
(
buf
,
pos
))
...
...
@@ -324,7 +323,7 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x,
Parameters
----------
N
N
Length of the buffer, atleast waprSize(32).
buf
A shared memory buffer of size warpSize * sizeof(dtype).
...
...
theano/sandbox/gpuarray/neighbours.py
浏览文件 @
645557f9
import
os
import
numpy
from
theano
import
Op
,
Apply
,
config
from
theano.gof
import
local_optimizer
from
theano.tensor.nnet.neighbours
import
Images2Neibs
import
theano.tensor
as
T
try
:
import
pygpu
from
pygpu
import
gpuarray
,
elemwise
from
pygpu
import
gpuarray
except
ImportError
:
pass
from
.basic_ops
import
(
as_gpuarray_variable
,
host_from_gpu
,
gpu_from_host
,
GpuKernelBase
,
Kernel
)
from
.basic_ops
import
as_gpuarray_variable
,
GpuKernelBase
,
Kernel
from
.opt
import
register_opt
as
register_gpu_opt
,
op_lifter
from
.type
import
GpuArrayType
from
.comp
import
NVCC_compiler
class
GpuImages2Neibs
(
GpuKernelBase
,
Images2Neibs
,
Op
):
...
...
@@ -45,27 +40,10 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
dtype
=
ten4
.
type
.
dtype
)()])
def
c_code_cache_version
(
self
):
return
(
1
0
,
1
)
return
(
1
1
,
)
def
c_headers
(
self
):
if
pygpu
.
get_default_context
()
.
kind
==
'opencl'
:
raise
MethodNotDefined
(
'cuda only'
)
return
[
'cuda.h'
,
'<gpuarray/extension.h>'
,
'<numpy_compat.h>'
,
'<gpuarray/ext_cuda.h>'
,
'<gpuarray/types.h>'
]
def
c_header_dirs
(
self
):
if
pygpu
.
get_default_context
()
.
kind
==
'opencl'
:
raise
MethodNotDefined
(
'cuda only'
)
cuda_root
=
config
.
cuda
.
root
if
cuda_root
:
return
[
os
.
path
.
join
(
cuda_root
,
'include'
)]
else
:
return
[]
def
c_init_code
(
self
):
if
pygpu
.
get_default_context
()
.
kind
==
'opencl'
:
raise
MethodNotDefined
(
'cuda only'
)
return
[
'setup_ext_cuda();'
]
return
[
'<numpy_compat.h>'
,
'<gpuarray/types.h>'
]
def
gpu_kernels
(
self
,
node
,
nodename
):
dtype_ten4
=
node
.
inputs
[
0
]
.
dtype
...
...
theano/sandbox/gpuarray/nerv.py
浏览文件 @
645557f9
...
...
@@ -176,13 +176,13 @@ def local_dot_to_gemm16(node):
@opt.register_opt
()
@alpha_merge
(
Gemm16
,
alpha_in
=
1
,
beta_in
=
4
,
nd
=
2
)
@alpha_merge
(
Gemm16
,
alpha_in
=
1
,
beta_in
=
4
)
def
local_gemm16_alpha_merge
(
node
,
*
inputs
):
return
[
Gemm16
(
relu
=
node
.
op
.
relu
)(
*
inputs
)]
@opt.register_opt
()
@output_merge
(
Gemm16
,
alpha_in
=
1
,
beta_in
=
4
,
out_in
=
0
,
nd
=
2
)
@output_merge
(
Gemm16
,
alpha_in
=
1
,
beta_in
=
4
,
out_in
=
0
)
def
local_gemm16_output_merge
(
node
,
*
inputs
):
return
[
Gemm16
(
relu
=
node
.
op
.
relu
)(
*
inputs
)]
...
...
theano/sandbox/gpuarray/nnet.py
浏览文件 @
645557f9
from
__future__
import
print_function
import
numpy
import
os
from
theano
import
Op
,
Apply
,
config
from
six
import
StringIO
try
:
import
pygpu
from
pygpu
import
gpuarray
,
elemwise
from
pygpu
import
gpuarray
except
ImportError
:
pass
from
.basic_ops
import
(
as_gpuarray_variable
,
GpuKernelBase
,
Kernel
)
from
.type
import
GpuArrayType
from
.kernel_codegen
import
(
nvcc_kernel
,
inline_softmax
,
inline_softmax_fixed_shared
)
inline_softmax
,
inline_softmax_fixed_shared
)
from
.fp16_help
import
work_dtype
,
load_w
,
write_w
...
...
@@ -41,16 +40,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
am
=
y_idx
.
type
()
return
Apply
(
self
,
[
x
,
b
,
y_idx
],
[
nll
,
sm
,
am
])
def
c_header_dirs
(
self
):
if
pygpu
.
get_default_context
()
.
kind
==
'opencl'
:
raise
MethodNotDefined
(
'cuda only'
)
cuda_root
=
config
.
cuda
.
root
if
cuda_root
:
return
[
os
.
path
.
join
(
cuda_root
,
'include'
)]
def
c_headers
(
self
):
return
[
'cuda.h'
,
'<gpuarray/extension.h>'
,
'<numpy_compat.h>'
,
'<gpuarray/types.h>'
]
return
[
'<numpy_compat.h>'
,
'<gpuarray/types.h>'
]
def
gpu_kernels
(
self
,
node
,
nodename
):
dtype_x
=
node
.
inputs
[
0
]
.
dtype
...
...
@@ -302,7 +293,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
return
sio
.
getvalue
()
def
c_code_cache_version
(
self
):
return
(
7
,)
return
(
8
,)
gpu_crossentropy_softmax_argmax_1hot_with_bias
=
GpuCrossentropySoftmaxArgmax1HotWithBias
()
...
...
@@ -328,18 +319,10 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
return
Apply
(
self
,
[
dnll
,
sm
,
y_idx
],
[
sm
.
type
()])
def
c_code_cache_version
(
self
):
return
(
10
,)
def
c_header_dirs
(
self
):
if
pygpu
.
get_default_context
()
.
kind
==
'opencl'
:
raise
MethodNotDefined
(
'cuda only'
)
cuda_root
=
config
.
cuda
.
root
if
cuda_root
:
return
[
os
.
path
.
join
(
cuda_root
,
'include'
)]
return
(
11
,)
def
c_headers
(
self
):
return
[
'cuda.h'
,
'<gpuarray/extension.h>'
,
'<numpy_compat.h>'
,
'<gpuarray/types.h>'
]
return
[
'<numpy_compat.h>'
,
'<gpuarray/types.h>'
]
def
c_code
(
self
,
node
,
nodename
,
inp
,
out
,
sub
):
typecode_dx
=
pygpu
.
gpuarray
.
dtype_to_typecode
(
node
.
outputs
[
0
]
.
dtype
)
...
...
@@ -541,21 +524,10 @@ class GpuSoftmax(GpuKernelBase, Op):
return
shape
def
c_code_cache_version
(
self
):
return
(
14
,)
+
inline_softmax
.
code_version
def
c_header_dirs
(
self
):
if
pygpu
.
get_default_context
()
.
kind
==
'opencl'
:
raise
MethodNotDefined
(
'cuda only'
)
cuda_root
=
config
.
cuda
.
root
if
cuda_root
:
return
[
os
.
path
.
join
(
cuda_root
,
'include'
)]
return
(
15
,)
+
inline_softmax
.
code_version
def
c_headers
(
self
):
return
[
'cuda.h'
,
'<gpuarray/extension.h>'
,
'<numpy_compat.h>'
,
'<gpuarray/ext_cuda.h>'
,
'<gpuarray/types.h>'
]
def
c_init_code
(
self
):
return
[
'setup_ext_cuda();'
]
return
[
'<numpy_compat.h>'
,
'<gpuarray/types.h>'
]
def
c_code
(
self
,
node
,
nodename
,
inp
,
out
,
sub
):
dtype_x
=
node
.
inputs
[
0
]
.
dtype
...
...
@@ -665,60 +637,60 @@ class GpuSoftmax(GpuKernelBase, Op):
]
kernels
=
[]
kname
=
"kSoftmax"
k_var
=
"kSoftmax_"
+
nodename
code
=
nvcc_kernel
(
kname
,
params
=
[
'const ga_size M'
,
'const ga_size N'
,
k_var
=
"kSoftmax_"
+
nodename
code
=
nvcc_kernel
(
kname
,
params
=
[
'const ga_size M'
,
'const ga_size N'
,
'const
%
s * x'
%
type_x
,
'const ga_size offset_x'
,
'const ga_ssize sx0'
,
'const ga_ssize sx1'
,
'
%
s * sm'
%
type_sm
,
'const ga_size offset_sm'
,
'const ga_ssize sm_s0'
,
'const ga_ssize sm_s1'
],
body
=
[
"extern __shared__
%
s buf[]"
%
type_acc
,
"
%
s * buf2 = buf + N"
%
type_acc
,
"x = (const
%
s *)(((char *)x)+offset_x)"
%
type_x
,
"sm = (
%
s *)(((char *)sm)+offset_sm)"
%
type_sm
,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){"
,
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){"
,
"buf[tx] =
%
s(x[blockIDX * sx0 + tx * sx1])"
%
load_x
,
"buf2[tx] = buf[tx]"
,
"}"
,
"__syncthreads()"
,
inline_softmax
(
'N'
,
'buf'
,
'buf2'
,
'threadIdx.x'
,
'blockDim.x'
,
dtype
=
work_sm
),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){"
,
# This set all value correctly
"sm[blockIDX * sm_s0 + tx * sm_s1] =
%
s(buf[tx])"
%
write_sm
,
"}"
,
"__syncthreads()"
,
"}"
,
])
body
=
[
"extern __shared__
%
s buf[]"
%
type_acc
,
"
%
s * buf2 = buf + N"
%
type_acc
,
"x = (const
%
s *)(((char *)x)+offset_x)"
%
type_x
,
"sm = (
%
s *)(((char *)sm)+offset_sm)"
%
type_sm
,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){"
,
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){"
,
"buf[tx] =
%
s(x[blockIDX * sx0 + tx * sx1])"
%
load_x
,
"buf2[tx] = buf[tx]"
,
"}"
,
"__syncthreads()"
,
inline_softmax
(
'N'
,
'buf'
,
'buf2'
,
'threadIdx.x'
,
'blockDim.x'
,
dtype
=
work_sm
),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){"
,
# This set all value correctly
"sm[blockIDX * sm_s0 + tx * sm_s1] =
%
s(buf[tx])"
%
write_sm
,
"}"
,
"__syncthreads()"
,
"}"
,
])
kernels
.
append
(
Kernel
(
code
=
code
,
name
=
kname
,
params
=
params
,
flags
=
flags
,
objvar
=
k_var
))
kname
=
"kSoftmax_fixed_shared"
k_var
=
"kSoftmax_fixed_shared"
+
nodename
code
=
nvcc_kernel
(
kname
,
params
=
[
'const ga_size M'
,
'const ga_size N'
,
k_var
=
"kSoftmax_fixed_shared"
+
nodename
code
=
nvcc_kernel
(
kname
,
params
=
[
'const ga_size M'
,
'const ga_size N'
,
'const
%
s * x'
%
type_x
,
'const ga_size offset_x'
,
'const ga_ssize sx0'
,
'const ga_ssize sx1'
,
'
%
s * sm'
%
type_sm
,
'const ga_size offset_sm'
,
'const ga_ssize sm_s0'
,
'const ga_ssize sm_s1'
],
body
=
[
"extern __shared__
%
s buf[]"
%
type_acc
,
"x = (const
%
s *)(((char *)x)+offset_x)"
%
type_x
,
"sm = (
%
s *)(((char *)sm)+offset_sm)"
%
type_sm
,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){"
,
"const
%
s *x_ptr = &x[blockIDX * sx0]"
%
type_x
,
"
%
s *sm_ptr = &sm[blockIDX * sm_s0]"
%
type_sm
,
inline_softmax_fixed_shared
(
'N'
,
'buf'
,
'x_ptr'
,
'sx1'
,
load_x
,
'sm_ptr'
,
'sm_s1'
,
write_sm
,
'threadIdx.x'
,
'blockDim.x'
,
dtype
=
work_sm
),
"__syncthreads()"
,
"}"
,
])
body
=
[
"extern __shared__
%
s buf[]"
%
type_acc
,
"x = (const
%
s *)(((char *)x)+offset_x)"
%
type_x
,
"sm = (
%
s *)(((char *)sm)+offset_sm)"
%
type_sm
,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){"
,
"const
%
s *x_ptr = &x[blockIDX * sx0]"
%
type_x
,
"
%
s *sm_ptr = &sm[blockIDX * sm_s0]"
%
type_sm
,
inline_softmax_fixed_shared
(
'N'
,
'buf'
,
'x_ptr'
,
'sx1'
,
load_x
,
'sm_ptr'
,
'sm_s1'
,
write_sm
,
'threadIdx.x'
,
'blockDim.x'
,
dtype
=
work_sm
),
"__syncthreads()"
,
"}"
,
])
kernels
.
append
(
Kernel
(
code
=
code
,
name
=
kname
,
params
=
params
,
flags
=
flags
,
objvar
=
k_var
))
return
kernels
...
...
@@ -743,26 +715,13 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
return
Apply
(
self
,
[
x
,
b
],
[
x
.
type
()])
def
infer_shape
(
self
,
node
,
shape
):
return
[
shape
[
0
]]
return
[
shape
[
0
]]
def
c_code_cache_version
(
self
):
return
(
13
,)
+
inline_softmax
.
code_version
def
c_header_dirs
(
self
):
if
pygpu
.
get_default_context
()
.
kind
==
'opencl'
:
raise
MethodNotDefined
(
'cuda only'
)
cuda_root
=
config
.
cuda
.
root
if
cuda_root
:
return
[
os
.
path
.
join
(
cuda_root
,
'include'
)]
else
:
return
[]
return
(
14
,)
+
inline_softmax
.
code_version
def
c_headers
(
self
):
return
[
'cuda.h'
,
'<gpuarray/extension.h>'
,
'<numpy_compat.h>'
,
'<gpuarray/ext_cuda.h>'
,
'<gpuarray/types.h>'
]
def
c_init_code
(
self
):
return
[
'setup_ext_cuda();'
]
return
[
'<numpy_compat.h>'
,
'<gpuarray/types.h>'
]
def
c_code
(
self
,
node
,
nodename
,
inp
,
out
,
sub
):
dtype_x
=
node
.
inputs
[
0
]
.
dtype
...
...
@@ -892,65 +851,65 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
kernels
=
[]
kname
=
"kSoftmaxWithBias"
k_var
=
"kSoftmaxWithBias_"
+
nodename
code
=
nvcc_kernel
(
kname
,
params
=
[
'const ga_size M'
,
'const ga_size N'
,
'const
%
s * x'
%
type_x
,
'const ga_size offset_x
'
,
'const ga_ssize sx0'
,
'const ga_ssize sx1
'
,
'const
%
s * b'
%
type_b
,
'const ga_size offset_b
'
,
'const ga_ssize sb0
'
,
'
%
s * sm'
%
type_sm
,
'const ga_size offset_sm
'
,
'const ga_ssize sm_s0'
,
'const ga_ssize sm_s1'
]
,
body
=
[
"extern __shared__
%
s buf[]"
%
type_acc
,
"
%
s * buf2 = buf + N"
%
type_acc
,
"x = (const
%
s *)(((char *)x)+offset_x)"
%
type_x
,
"b = (const
%
s *)(((char *)b)+offset_b)"
%
type_b
,
"sm = (
%
s *)(((char *)sm)+offset_sm)"
%
type_sm
,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){"
,
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){"
,
"buf[tx] =
%
s(x[blockIDX * sx0 + tx * sx1])"
%
load_x
,
"buf[tx] +=
%
s(b[tx * sb0])"
%
load_b
,
"buf2[tx] = buf[tx]"
,
"}"
,
"__syncthreads()"
,
inline_softmax
(
'N'
,
'buf'
,
'buf2'
,
'threadIdx.x'
,
'blockDim.x'
,
work_sm
),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){"
,
"sm[blockIDX * sm_s0 + tx * sm_s1] =
%
s(buf[tx])"
%
write_sm
,
"}"
,
"__syncthreads()"
,
"}"
,
])
code
=
nvcc_kernel
(
kname
,
params
=
[
'const ga_size M'
,
'const ga_size N
'
,
'const
%
s * x'
%
type_x
,
'const ga_size offset_x
'
,
'const ga_ssize sx0'
,
'const ga_ssize sx1
'
,
'const
%
s * b'
%
type_b
,
'const ga_size offset_b
'
,
'const ga_ssize sb0
'
,
'
%
s * sm'
%
type_sm
,
'const ga_size offset_sm'
,
'const ga_ssize sm_s0'
,
'const ga_ssize sm_s1'
],
body
=
[
"extern __shared__
%
s buf[]"
%
type_acc
,
"
%
s * buf2 = buf + N"
%
type_acc
,
"x = (const
%
s *)(((char *)x)+offset_x)"
%
type_x
,
"b = (const
%
s *)(((char *)b)+offset_b)"
%
type_b
,
"sm = (
%
s *)(((char *)sm)+offset_sm)"
%
type_sm
,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){"
,
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){"
,
"buf[tx] =
%
s(x[blockIDX * sx0 + tx * sx1])"
%
load_x
,
"buf[tx] +=
%
s(b[tx * sb0])"
%
load_b
,
"buf2[tx] = buf[tx]"
,
"}"
,
"__syncthreads()"
,
inline_softmax
(
'N'
,
'buf'
,
'buf2'
,
'threadIdx.x'
,
'blockDim.x'
,
work_sm
),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){"
,
"sm[blockIDX * sm_s0 + tx * sm_s1] =
%
s(buf[tx])"
%
write_sm
,
"}"
,
"__syncthreads()"
,
"}"
,
])
kernels
.
append
(
Kernel
(
code
=
code
,
name
=
kname
,
params
=
params
,
flags
=
flags
,
objvar
=
k_var
))
kname
=
"kSoftmaxWithBias_fixed_shared"
k_var
=
"kSoftmaxWithBias_fixed_shared"
+
nodename
code
=
nvcc_kernel
(
kname
,
params
=
[
'const ga_size M'
,
'const ga_size N'
,
'const
%
s * x'
%
type_x
,
'const ga_size offset_x
'
,
'const ga_ssize sx0'
,
'const ga_ssize sx1
'
,
'const
%
s * b'
%
type_b
,
'const ga_size offset_b
'
,
'const ga_ssize sb0
'
,
'
%
s * sm'
%
type_sm
,
'const ga_size offset_sm
'
,
'const ga_ssize sm_s0'
,
'const ga_ssize sm_s1'
]
,
body
=
[
"extern __shared__
%
s buf[]"
%
type_acc
,
"x = (const
%
s *)(((char *)x)+offset_x)"
%
type_x
,
"b = (const
%
s *)(((char *)b)+offset_b)"
%
type_b
,
"sm = (
%
s *)(((char *)sm)+offset_sm)"
%
type_sm
,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){"
,
"const
%
s *x_ptr = &x[blockIDX * sx0]"
%
type_x
,
"
%
s *sm_ptr = &sm[blockIDX * sm_s0]"
%
type_sm
,
inline_softmax_fixed_shared
(
'N'
,
'buf'
,
'x_ptr'
,
'sx1'
,
load_x
,
'sm_ptr'
,
'sm_s1'
,
write_sm
,
'threadIdx.x'
,
'blockDim.x'
,
'b'
,
'sb0'
,
load_b
,
work_sm
),
"__syncthreads()"
,
"}"
,
])
code
=
nvcc_kernel
(
kname
,
params
=
[
'const ga_size M'
,
'const ga_size N
'
,
'const
%
s * x'
%
type_x
,
'const ga_size offset_x
'
,
'const ga_ssize sx0'
,
'const ga_ssize sx1
'
,
'const
%
s * b'
%
type_b
,
'const ga_size offset_b
'
,
'const ga_ssize sb0
'
,
'
%
s * sm'
%
type_sm
,
'const ga_size offset_sm'
,
'const ga_ssize sm_s0'
,
'const ga_ssize sm_s1'
],
body
=
[
"extern __shared__
%
s buf[]"
%
type_acc
,
"x = (const
%
s *)(((char *)x)+offset_x)"
%
type_x
,
"b = (const
%
s *)(((char *)b)+offset_b)"
%
type_b
,
"sm = (
%
s *)(((char *)sm)+offset_sm)"
%
type_sm
,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){"
,
"const
%
s *x_ptr = &x[blockIDX * sx0]"
%
type_x
,
"
%
s *sm_ptr = &sm[blockIDX * sm_s0]"
%
type_sm
,
inline_softmax_fixed_shared
(
'N'
,
'buf'
,
'x_ptr'
,
'sx1'
,
load_x
,
'sm_ptr'
,
'sm_s1'
,
write_sm
,
'threadIdx.x'
,
'blockDim.x'
,
'b'
,
'sb0'
,
load_b
,
work_sm
),
"__syncthreads()"
,
"}"
,
])
kernels
.
append
(
Kernel
(
code
=
code
,
name
=
kname
,
params
=
params
,
flags
=
flags
,
objvar
=
k_var
))
return
kernels
...
...
theano/sandbox/gpuarray/opt.py
浏览文件 @
645557f9
...
...
@@ -645,13 +645,13 @@ def local_gpua_hgemm(node):
@register_opt
()
@alpha_merge
(
GpuGemm
,
alpha_in
=
1
,
beta_in
=
4
,
nd
=
2
)
@alpha_merge
(
GpuGemm
,
alpha_in
=
1
,
beta_in
=
4
)
def
local_gpuagemm_alpha_merge
(
node
,
*
inputs
):
return
[
gpugemm_no_inplace
(
*
inputs
)]
@register_opt
()
@output_merge
(
GpuGemm
,
alpha_in
=
1
,
beta_in
=
4
,
out_in
=
0
,
nd
=
2
)
@output_merge
(
GpuGemm
,
alpha_in
=
1
,
beta_in
=
4
,
out_in
=
0
)
def
local_gpuagemm_output_merge
(
node
,
*
inputs
):
return
[
gpugemm_no_inplace
(
*
inputs
)]
...
...
theano/sandbox/gpuarray/opt_util.py
浏览文件 @
645557f9
...
...
@@ -7,23 +7,35 @@ from theano.gof import local_optimizer
from
theano.tensor
import
(
DimShuffle
,
get_scalar_constant_value
,
NotScalarConstantError
)
from
.basic_ops
import
GpuFromHost
,
HostFromGpu
from
.basic_ops
import
GpuFromHost
,
HostFromGpu
,
GpuAllocEmpty
from
.elemwise
import
GpuDimShuffle
,
GpuElemwise
_one
=
scal
.
constant
(
numpy
.
asarray
(
1.0
,
dtype
=
'float64'
))
def
grab_cpu_scalar
(
v
,
nd
):
"""
Get a scalar variable value from the tree at `v`.
This function will dig through transfers and dimshuffles to get
the constant value. If no such constant is found, it returns None.
Parameters
----------
v : variable
Theano variable to extract the constant value from.
nd : int
Expected number of dimensions for the variable (for
broadcasted constants).
"""
if
v
.
owner
is
not
None
:
n
=
v
.
owner
if
(
isinstance
(
n
.
op
,
GpuDimShuffle
)
and
if
(
isinstance
(
n
.
op
,
(
GpuDimShuffle
,
DimShuffle
)
)
and
n
.
op
.
new_order
==
(
'x'
,)
*
nd
):
return
grab_cpu_scalar
(
n
.
inputs
[
0
])
elif
(
isinstance
(
n
.
op
,
DimShuffle
)
and
n
.
op
.
new_order
==
(
'x'
,)
*
nd
):
return
grab_cpu_scalar
(
n
.
inputs
[
0
])
elif
isinstance
(
n
.
op
,
GpuFromHost
):
return
grab_cpu_scalar
(
n
.
inputs
[
0
],
nd
=
nd
)
return
grab_cpu_scalar
(
n
.
inputs
[
0
],
n
.
inputs
[
0
]
.
ndim
)
elif
isinstance
(
n
.
op
,
(
GpuFromHost
,
HostFromGpu
)):
return
grab_cpu_scalar
(
n
.
inputs
[
0
],
nd
)
else
:
return
None
else
:
...
...
@@ -33,10 +45,24 @@ def grab_cpu_scalar(v, nd):
def
find_node
(
v
,
cls
,
ignore_clients
=
False
):
# This digs through possibly redundant transfers to for the node
# that has the op class specified. If ignore_clients is False (the
# default) it will only dig through nodes that have a single
# client.
"""
Find the node that has an op of type `cls` in `v`.
This digs through possibly redundant transfers to look for the node
that has the type `cls`. If `ignore_clients` is False (the
default) it will only dig through nodes that have a single client
to avoid duplicating computations.
Parameters
----------
v : variable
The variable to dig through
cls : Op class
The type of the node we are looking for
ignore_clients : bool, optional
Whether to ignore multiple clients or not.
"""
if
v
.
owner
is
not
None
and
(
ignore_clients
or
len
(
v
.
clients
)
==
1
):
if
isinstance
(
v
.
owner
.
op
,
cls
):
return
v
.
owner
...
...
@@ -50,8 +76,20 @@ def find_node(v, cls, ignore_clients=False):
def
is_equal
(
var
,
val
):
# Returns True if var is always equal to val (python value), False
# otherwise (including if var is not constant)
"""
Returns True if `var` is always equal to `val`.
This will only return True if the variable will always be equal to
the value. If it might not be true in some cases then it returns False.
Parameters
----------
var : variable
Variable to compare
val : value
Python value
"""
try
:
v
=
get_scalar_constant_value
(
var
)
return
v
==
val
...
...
@@ -59,7 +97,57 @@ def is_equal(var, val):
return
False
def
alpha_merge
(
cls
,
alpha_in
,
beta_in
,
nd
):
def
alpha_merge
(
cls
,
alpha_in
,
beta_in
):
"""
Decorator to merge multiplication by a scalar on the output.
This will find a pattern of scal * <yourop>(some, params, alpha,
beta) and update it so that the scalar multiplication happens as
part of your op.
The op needs to accept an alpha and a beta scalar which act this way:
out = Op() * alpha + out_like * beta
Where out_like is a buffer that has the same size as the output
and gets added to the "real" output of the operation. An example
of an operation that respects this pattern is GEMM from blas.
The decorated function must have this signature:
maker(node, *inputs)
The `node` argument you receive is the original apply node that
contains your op. You should use it to grab relevant properties
for your op so that the new version performs the same computation.
The `*inputs` parameter contains the new inputs for your op. You
MUST use those inputs instead of the ones on `node`. Note that
this function can be as simple as:
def maker(node, *inputs):
return node.op(*inputs)
Parameters
----------
cls : op class
The class of the op you want to merge
alpha_in : int
The input index for the alpha scalar for your op (in node.inputs).
beta_in : int
The input index for the beta scalar for your op (in node.inputs).
Returns
-------
This returns an unregistered local optimizer that has the same
name as the decorated function.
Notes
-----
This was factored out since the code to deal with intervening
transfers and correctness in the presence of different values of
alpha and beta scaling factors is not trivial.
"""
def
wrapper
(
maker
):
@local_optimizer
([
GpuElemwise
])
@wraps
(
maker
)
...
...
@@ -70,11 +158,14 @@ def alpha_merge(cls, alpha_in, beta_in, nd):
targ
=
find_node
(
node
.
inputs
[
0
],
cls
)
if
targ
is
None
:
targ
=
find_node
(
node
.
inputs
[
1
],
cls
)
lr
=
grab_cpu_scalar
(
node
.
inputs
[
0
],
nd
=
nd
)
if
targ
is
None
:
return
lr
=
grab_cpu_scalar
(
node
.
inputs
[
0
],
nd
=
targ
.
outputs
[
0
]
.
ndim
)
else
:
lr
=
grab_cpu_scalar
(
node
.
inputs
[
1
],
nd
=
nd
)
if
(
lr
is
None
or
targ
is
None
or
lr
.
dtype
!=
targ
.
outputs
[
0
]
.
dtype
)
:
lr
=
grab_cpu_scalar
(
node
.
inputs
[
1
],
nd
=
targ
.
outputs
[
0
]
.
ndim
)
if
lr
is
None
or
lr
.
dtype
!=
targ
.
outputs
[
0
]
.
dtype
:
return
None
inputs
=
list
(
targ
.
inputs
)
try
:
...
...
@@ -96,7 +187,62 @@ def alpha_merge(cls, alpha_in, beta_in, nd):
return
wrapper
def
output_merge
(
cls
,
alpha_in
,
beta_in
,
out_in
,
nd
):
def
output_merge
(
cls
,
alpha_in
,
beta_in
,
out_in
):
"""
Decorator to merge addition by a value on the output.
This will find a pattern of val + <yourop>(some, params, alpha,
beta, out_like) and update it so that the addition happens as
part of your op.
The op needs to accept an alpha and a beta scalar which act this way:
out = Op() * alpha + out_like * beta
Where out_like is a buffer that has the same size as the output
and gets added to the "real" output of the operation. An example
of an operation that respects this pattern is GEMM from blas.
The decorated function must have this signature:
maker(node, *inputs)
The `node` argument you receive is the original apply node that
contains your op. You should use it to grab relevant properties
for your op so that the new version performs the same computation.
The `*inputs` parameter contains the new inputs for your op. You
MUST use those inputs instead of the ones on `node`. Note that
this function can be as simple as:
def maker(node, *inputs):
return node.op(*inputs)
Parameters
----------
cls : op class
The class of the op you want to merge
alpha_in : int
The input index for the alpha scalar for your op (in node.inputs).
beta_in : int
The input index for the beta scalar for your op (in node.inputs).
out_in : int
The input index for the out_like input for your op (in node.inputs).
Returns
-------
This returns an unregistered local optimizer that has the same
name as the decorated function.
Notes
-----
This was factored out since the code to deal with intervening
transfers and correctness in the presence of different values of
alpha and beta scaling factors is not trivial.
This also correctly handles the case where the added value is
broadcasted (by not performing the replacement).
"""
def
wrapper
(
maker
):
@local_optimizer
([
GpuElemwise
])
@wraps
(
maker
)
...
...
@@ -126,3 +272,56 @@ def output_merge(cls, alpha_in, beta_in, out_in, nd):
return
maker
(
targ
,
*
inputs
)
return
opt
return
wrapper
def inplace_allocempty(op, idx):
    """
    Build an inplace local optimizer that copes with a shared AllocEmpty.

    When the input at position `idx` is produced by a `GpuAllocEmpty`
    and that variable has more than one client, a fresh `GpuAllocEmpty`
    (same dtype, same inputs) is substituted for it so that the op can
    work on its own buffer inplace.

    The decorated function must have this signature:

        maker(node, inputs)

    `node` is the original apply node that contains your op; use it to
    grab the properties needed so that the new version performs the
    same computation, and switch the op to its inplace variant.
    `inputs` is the list of inputs to use for the new op. You MUST use
    those inputs instead of the ones on `node`. This can be as simple
    as:

        def maker(node, inputs):
            return node.op.__class__(inplace=True)(*inputs)

    Parameters
    ----------
    op : op class
        The op class to look for to make inplace.
    idx : int
        The index of the (possibly) AllocEmpty input (in node.inputs).

    Returns
    -------
    A decorator. Applied to `maker`, it yields an unregistered inplace
    local optimizer that has the same name as the decorated function.

    Notes
    -----
    The optimizer returns None (no replacement) when the node's op is
    not exactly of type `op` or is already inplace.

    """
    def wrapper(maker):
        @local_optimizer([op], inplace=True)
        @wraps(maker)
        def opt(node):
            # Exact type match only, and nothing to do for an op that
            # already works inplace.
            if type(node.op) != op:
                return
            if node.op.inplace:
                return

            new_inputs = list(node.inputs)
            candidate = new_inputs[idx]
            producer = candidate.owner
            shared_empty = (producer and
                            isinstance(producer.op, GpuAllocEmpty) and
                            len(candidate.clients) > 1)
            if shared_empty:
                # Give this node a private buffer instead of the one
                # that other clients also use.
                private_alloc = GpuAllocEmpty(producer.op.dtype)
                new_inputs[idx] = private_alloc(*producer.inputs)
            return maker(node, new_inputs)
        return opt
    return wrapper
theano/sandbox/gpuarray/subtensor.py
浏览文件 @
645557f9
...
...
@@ -180,19 +180,9 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
def
_f16_ok
(
self
):
return
self
.
iadd_node
.
op
.
_f16_ok
def
c_header_dirs
(
self
):
cuda_root
=
config
.
cuda
.
root
if
cuda_root
:
return
[
os
.
path
.
join
(
cuda_root
,
'include'
)]
else
:
return
[]
def
c_headers
(
self
):
return
self
.
iadd_node
.
op
.
c_headers
()
def
c_compiler
(
self
):
return
self
.
iadd_node
.
op
.
c_compiler
()
def
c_init_code
(
self
):
return
self
.
iadd_node
.
op
.
c_init_code
()
...
...
@@ -404,7 +394,7 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
elemwise_version
=
self
.
iadd_node
.
c_code_cache_version
()
if
not
parent_version
or
not
elemwise_version
:
return
return
parent_version
+
elemwise_version
+
(
2
,)
return
parent_version
+
elemwise_version
+
(
3
,)
class
GpuAdvancedSubtensor1
(
HideC
,
tensor
.
AdvancedSubtensor1
):
...
...
theano/sandbox/gpuarray/tests/test_basic_ops.py
浏览文件 @
645557f9
import
unittest
from
theano.compat
import
izip
from
copy
import
copy
,
deepcopy
from
six
import
iteritems
...
...
@@ -13,16 +12,31 @@ from theano.tensor.basic import alloc
# Don't import test classes otherwise they get tested as part of the file
from
theano.tensor.tests
import
test_basic
from
theano.tensor.tests.test_basic
import
rand
,
safe_make_node
from
theano.tests
import
unittest_tools
as
utt
from
theano.tests.unittest_tools
import
SkipTest
import
theano.sandbox.gpuarray
from
..type
import
(
GpuArrayType
,
gpuarray_shared_constructor
)
from
..basic_ops
import
(
host_from_gpu
,
gpu_from_host
,
HostFromGpu
,
GpuFromHost
,
GpuReshape
,
gpu_alloc
,
GpuAlloc
,
GpuAllocEmpty
,
GpuContiguous
,
gpu_join
,
GpuJoin
,
GpuSplit
,
GpuEye
,
gpu_contiguous
)
from
..subtensor
import
GpuSubtensor
import
theano.sandbox.cuda
as
cuda_ndarray
try
:
from
pygpu
import
gpuarray
except
:
pass
if
theano
.
sandbox
.
gpuarray
.
pygpu
is
None
:
raise
SkipTest
(
"pygpu not installed"
)
# If you are writing a new test file, don't copy this code, but rather
# import stuff from this file (like mode_with_gpu) to reuse it.
import
theano.sandbox.cuda
as
cuda_ndarray
if
cuda_ndarray
.
cuda_available
and
not
theano
.
sandbox
.
gpuarray
.
pygpu_activated
:
if
not
cuda_ndarray
.
use
.
device_number
:
# We should not enable all the use like the flag device=gpu,
...
...
@@ -36,25 +50,9 @@ if cuda_ndarray.cuda_available and not theano.sandbox.gpuarray.pygpu_activated:
if
not
theano
.
sandbox
.
gpuarray
.
pygpu_activated
:
raise
SkipTest
(
"pygpu disabled"
)
from
..type
import
(
GpuArrayType
,
gpuarray_shared_constructor
)
from
..basic_ops
import
(
host_from_gpu
,
gpu_from_host
,
gpu_alloc
,
GpuAlloc
,
GpuAllocEmpty
,
gpu_from_cuda
,
cuda_from_gpu
,
HostFromGpu
,
GpuContiguous
,
GpuFromHost
,
GpuReshape
,
gpu_join
,
GpuJoin
,
GpuSplit
,
GpuEye
,
gpu_contiguous
)
from
..subtensor
import
GpuSubtensor
from
theano.tests
import
unittest_tools
as
utt
utt
.
seed_rng
()
rng
=
numpy
.
random
.
RandomState
(
seed
=
utt
.
fetch_seed
())
from
pygpu
import
gpuarray
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
mode_with_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
including
(
'gpuarray'
)
.
excluding
(
'gpu'
)
mode_without_gpu
=
theano
.
compile
.
mode
.
get_mode
(
'FAST_RUN'
)
.
excluding
(
'gpuarray'
)
...
...
@@ -63,22 +61,6 @@ else:
mode_without_gpu
=
theano
.
compile
.
mode
.
get_default_mode
()
.
excluding
(
'gpuarray'
)
def may_fail(msg, EClass):
    """
    Decorator factory for tests that only work under specific conditions.

    The wrapped test is called with no arguments. If it raises an
    exception (derived from `Exception`) that is an instance of
    `EClass`, a `SkipTest(msg, exc)` is raised in its place; any other
    exception propagates unchanged. The wrapper takes the name of the
    wrapped function and always returns None.

    Parameters
    ----------
    msg : str
        Message passed as first argument to `SkipTest`.
    EClass : exception class (or tuple of classes)
        The exception type(s) to turn into a skip.

    """
    def test_decorator(f):
        def wrapper():
            try:
                f()
            except Exception as err:
                if not isinstance(err, EClass):
                    # Not the masked kind of failure: let it through.
                    raise
                raise SkipTest(msg, err)
        wrapper.__name__ = f.__name__
        return wrapper
    return test_decorator
def
inplace_func
(
inputs
,
outputs
,
mode
=
None
,
allow_input_downcast
=
False
,
on_unused_input
=
'raise'
,
name
=
None
):
if
mode
is
None
:
...
...
@@ -183,9 +165,9 @@ def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu,
else
:
err_msg
=
(
"Test
%
s::
%
s: exception raised during test "
"call was not the same as the reference "
"call (got:
%
s, expected
%
s)"
)
%
\
"call (got:
%
s, expected
%
s)"
%
(
self
.
gpu_op
,
testname
,
type
(
exc
),
type
(
ref_e
))
type
(
ref_e
))
)
exc
.
args
+=
(
err_msg
,)
raise
...
...
@@ -197,9 +179,9 @@ def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu,
expected
):
self
.
fail
((
"Test
%
s::
%
s: Output
%
s gave the wrong "
"value. With inputs
%
s, expected
%
s "
"(dtype
%
s), got
%
s (dtype
%
s)."
)
%
(
self
.
op
,
testname
,
i
,
inputs
,
expected
,
expected
.
dtype
,
variable
,
variable
.
dtype
))
"(dtype
%
s), got
%
s (dtype
%
s)."
%
(
self
.
op
,
testname
,
i
,
inputs
,
expected
,
expected
.
dtype
,
variable
,
variable
.
dtype
)
))
for
description
,
check
in
iteritems
(
self
.
checks
):
if
not
check
(
inputs
,
variables
):
...
...
@@ -250,36 +232,6 @@ def test_transfer_strided():
assert
numpy
.
all
(
fv
==
av
)
@may_fail("Op fails if both contexts are not the same and it's rare "
          "that the tests will be run this way", ValueError)
def test_transfer_cuda_gpu():
    """
    Check transfers between CudaNdarray and gpuarray values, both ways.

    Each direction is exercised with a plain value and with a strided
    view (`[:, ::-2]`) of it. The test is skipped when cuda is not
    available.
    """
    import theano.sandbox.cuda as cuda_ndarray
    if cuda_ndarray.cuda_available is False:
        raise SkipTest("Can't test interaction with cuda if cuda not present")

    cuda_type = cuda_ndarray.CudaNdarrayType
    ga_var = GpuArrayType(dtype='float32', broadcastable=(False, False))('g')
    cuda_var = cuda_type((False, False))('c')

    # Same host data uploaded through both back-ends.
    host_val = theano._asarray(rng.rand(5, 4), dtype='float32')
    ga_val = gpuarray.array(host_val)
    cuda_val = cuda_ndarray.CudaNdarray(host_val)
    ga_strided = ga_val[:, ::-2]
    cuda_strided = cuda_val[:, ::-2]

    # cuda -> gpuarray
    to_gpuarray = theano.function([cuda_var], gpu_from_cuda(cuda_var))
    for given, wanted in ((cuda_val, ga_val), (cuda_strided, ga_strided)):
        assert GpuArrayType.values_eq_approx(to_gpuarray(given), wanted)

    # gpuarray -> cuda
    to_cuda = theano.function([ga_var], cuda_from_gpu(ga_var))
    for given, wanted in ((ga_val, cuda_val), (ga_strided, cuda_strided)):
        assert cuda_type.values_eq_approx(to_cuda(given), wanted)
def
gpu_alloc_expected
(
x
,
*
shp
):
g
=
gpuarray
.
empty
(
shp
,
dtype
=
x
.
dtype
)
g
[:]
=
x
...
...
@@ -291,8 +243,8 @@ GpuAllocTester = makeTester(
gpu_op
=
gpu_alloc
,
cases
=
dict
(
correct01
=
(
rand
(),
numpy
.
int32
(
7
)),
# just gives a DeepCopyOp with possibly wrong results on the CPU
#
correct01_bcast=(rand(1), numpy.int32(7)),
# just gives a DeepCopyOp with possibly wrong results on the CPU
#
correct01_bcast=(rand(1), numpy.int32(7)),
correct02
=
(
rand
(),
numpy
.
int32
(
4
),
numpy
.
int32
(
7
)),
correct12
=
(
rand
(
7
),
numpy
.
int32
(
4
),
numpy
.
int32
(
7
)),
correct13
=
(
rand
(
7
),
numpy
.
int32
(
2
),
numpy
.
int32
(
4
),
...
...
@@ -486,8 +438,6 @@ def test_hostfromgpu_shape_i():
cv
=
gpuarray
.
asarray
(
numpy
.
random
.
rand
(
5
,
4
),
dtype
=
'float32'
)
gpu_from_host
=
theano
.
sandbox
.
gpuarray
.
basic_ops
.
gpu_from_host
host_from_gpu
=
theano
.
sandbox
.
gpuarray
.
basic_ops
.
host_from_gpu
f
=
theano
.
function
([
a
],
gpu_from_host
(
a
),
mode
=
m
)
assert
gpu_from_host
in
[
x
.
op
for
x
in
f
.
maker
.
fgraph
.
toposort
()]
...
...
theano/sandbox/gpuarray/tests/test_blas.py
浏览文件 @
645557f9
...
...
@@ -6,8 +6,7 @@ import numpy
import
theano
from
theano
import
tensor
from
theano.tests
import
unittest_tools
as
utt
from
theano.tensor.blas
import
(
gemv_inplace
,
gemm_inplace
,
ger_destructive
,
_dot22
)
from
theano.tensor.blas
import
gemv_inplace
,
gemm_inplace
,
_dot22
from
theano.tensor.tests.test_blas
import
TestGer
,
BaseGemv
from
..
import
gpuarray_shared_constructor
...
...
@@ -15,22 +14,22 @@ from .test_basic_ops import (makeTester, rand,
mode_with_gpu
)
from
..blas
import
(
gpugemv_inplace
,
gpugemv_no_inplace
,
gpugemm_inplace
,
gpugemm_no_inplace
,
gpugemm_inplace
,
gpuger_inplace
,
gpuger_no_inplace
,
GpuGer
,
gpu_dot22
,
GpuGemm
)
GpuGemvTester
=
makeTester
(
'GpuGemvTester'
,
op
=
gemv_inplace
,
gpu_op
=
gpugemv_inplace
,
cases
=
dict
(
dot_vv
=
[
rand
(
1
),
1
,
rand
(
1
,
2
),
rand
(
2
),
0
],
dot_vm
=
[
rand
(
3
),
1
,
rand
(
3
,
2
),
rand
(
2
),
0
],
#
test_02=[rand(0), 1, rand(0, 2), rand(2), 0],
#
test_30=[rand(3), 1, rand(3, 0), rand(0), 0],
#
test_00=[rand(0), 1, rand(0, 0), rand(0), 0],
test_stride
=
[
rand
(
3
)[::
-
1
],
1
,
rand
(
3
,
2
)[::
-
1
],
rand
(
2
)[::
-
1
],
0
],
)
)
GpuGemvTester
=
makeTester
(
'GpuGemvTester'
,
op
=
gemv_inplace
,
gpu_op
=
gpugemv_inplace
,
cases
=
dict
(
dot_vv
=
[
rand
(
1
),
1
,
rand
(
1
,
2
),
rand
(
2
),
0
],
dot_vm
=
[
rand
(
3
),
1
,
rand
(
3
,
2
),
rand
(
2
),
0
],
#
test_02=[rand(0), 1, rand(0, 2), rand(2), 0],
#
test_30=[rand(3), 1, rand(3, 0), rand(0), 0],
#
test_00=[rand(0), 1, rand(0, 0), rand(0), 0],
test_stride
=
[
rand
(
3
)[::
-
1
],
1
,
rand
(
3
,
2
)[::
-
1
],
rand
(
2
)[::
-
1
],
0
],
)
)
class
TestGpuSgemv
(
TestCase
,
BaseGemv
,
utt
.
TestOptimizationMixin
):
...
...
@@ -48,24 +47,24 @@ class TestGpuSgemv(TestCase, BaseGemv, utt.TestOptimizationMixin):
return
theano
.
shared
(
val
)
GpuGemmTester
=
makeTester
(
'GpuGemmTester'
,
op
=
gemm_inplace
,
gpu_op
=
gpugemm_inplace
,
cases
=
dict
(
test1
=
[
rand
(
3
,
4
),
1.0
,
rand
(
3
,
5
),
rand
(
5
,
4
),
0.0
],
test2
=
[
rand
(
3
,
4
),
1.0
,
rand
(
3
,
5
),
rand
(
5
,
4
),
1.0
],
test3
=
[
rand
(
3
,
4
),
1.0
,
rand
(
3
,
5
),
rand
(
5
,
4
),
-
1.0
],
test4
=
[
rand
(
3
,
4
),
0.0
,
rand
(
3
,
5
),
rand
(
5
,
4
),
0.0
],
test5
=
[
rand
(
3
,
4
),
0.0
,
rand
(
3
,
5
),
rand
(
5
,
4
),
0.6
],
test6
=
[
rand
(
3
,
4
),
0.0
,
rand
(
3
,
5
),
rand
(
5
,
4
),
-
1.0
],
test7
=
[
rand
(
3
,
4
),
-
1.0
,
rand
(
3
,
5
),
rand
(
5
,
4
),
0.0
],
test8
=
[
rand
(
3
,
4
),
-
1.0
,
rand
(
3
,
5
),
rand
(
5
,
4
),
1.1
],
test9
=
[
rand
(
3
,
4
),
-
1.0
,
rand
(
3
,
5
),
rand
(
5
,
4
),
-
1.1
],
#
test10=[rand(0, 4), -1.0, rand(0, 5), rand(5, 4), 0.0],
#
test11=[rand(3, 0), -1.0, rand(3, 5), rand(5, 0), 1.1],
#
test12=[rand(3, 4), -1.0, rand(3, 0), rand(0, 4), -1.1],
#
test13=[rand(0, 0), -1.0, rand(0, 0), rand(0, 0), -1.1],
)
)
GpuGemmTester
=
makeTester
(
'GpuGemmTester'
,
op
=
gemm_inplace
,
gpu_op
=
gpugemm_inplace
,
cases
=
dict
(
test1
=
[
rand
(
3
,
4
),
1.0
,
rand
(
3
,
5
),
rand
(
5
,
4
),
0.0
],
test2
=
[
rand
(
3
,
4
),
1.0
,
rand
(
3
,
5
),
rand
(
5
,
4
),
1.0
],
test3
=
[
rand
(
3
,
4
),
1.0
,
rand
(
3
,
5
),
rand
(
5
,
4
),
-
1.0
],
test4
=
[
rand
(
3
,
4
),
0.0
,
rand
(
3
,
5
),
rand
(
5
,
4
),
0.0
],
test5
=
[
rand
(
3
,
4
),
0.0
,
rand
(
3
,
5
),
rand
(
5
,
4
),
0.6
],
test6
=
[
rand
(
3
,
4
),
0.0
,
rand
(
3
,
5
),
rand
(
5
,
4
),
-
1.0
],
test7
=
[
rand
(
3
,
4
),
-
1.0
,
rand
(
3
,
5
),
rand
(
5
,
4
),
0.0
],
test8
=
[
rand
(
3
,
4
),
-
1.0
,
rand
(
3
,
5
),
rand
(
5
,
4
),
1.1
],
test9
=
[
rand
(
3
,
4
),
-
1.0
,
rand
(
3
,
5
),
rand
(
5
,
4
),
-
1.1
],
#
test10=[rand(0, 4), -1.0, rand(0, 5), rand(5, 4), 0.0],
#
test11=[rand(3, 0), -1.0, rand(3, 5), rand(5, 0), 1.1],
#
test12=[rand(3, 4), -1.0, rand(3, 0), rand(0, 4), -1.1],
#
test13=[rand(0, 0), -1.0, rand(0, 0), rand(0, 0), -1.1],
)
)
class
TestGpuSger
(
TestGer
):
...
...
@@ -84,8 +83,10 @@ class TestGpuSger(TestGer):
def
test_f32_0_0
(
self
):
raise
SkipTest
(
'0-sized objects not supported'
)
def
test_f32_1_0
(
self
):
raise
SkipTest
(
'0-sized objects not supported'
)
def
test_f32_0_1
(
self
):
raise
SkipTest
(
'0-sized objects not supported'
)
...
...
@@ -103,21 +104,22 @@ class TestGpuGer_OpContract(TestCase, utt.T_OpContractMixin):
GpuDot22Tester
=
makeTester
(
'Gpu
Gemm
Tester'
,
'Gpu
Dot22
Tester'
,
op
=
_dot22
,
gpu_op
=
gpu_dot22
,
cases
=
dict
(
test1
=
[
rand
(
3
,
4
),
rand
(
4
,
5
)],
test2
=
[
rand
(
1
,
4
),
rand
(
4
,
5
)],
test3
=
[
rand
(
3
,
1
),
rand
(
1
,
5
)],
test4
=
[
rand
(
3
,
4
),
rand
(
4
,
1
)],
#
test5=[rand(0, 4), rand(4, 5)],
#
test6=[rand(3, 0), rand(0, 5)],
#
test7=[rand(3, 4), rand(4, 0)],
#
test8=[rand(0, 4), rand(4, 0)],
#
test9=[rand(0, 0), rand(0, 0)],
#
test5=[rand(0, 4), rand(4, 5)],
#
test6=[rand(3, 0), rand(0, 5)],
#
test7=[rand(3, 4), rand(4, 0)],
#
test8=[rand(0, 4), rand(4, 0)],
#
test9=[rand(0, 0), rand(0, 0)],
)
)
def
test_hgemm_swap
():
from
theano.sandbox.cuda
import
nvcc_compiler
if
nvcc_compiler
.
nvcc_version
<
'7.5'
:
...
...
@@ -149,6 +151,7 @@ def test_hgemm_swap():
utt
.
assert_allclose
(
of
,
on
)
def
test_hgemm_alpha_output_merge
():
from
theano.sandbox.cuda
import
nvcc_compiler
if
nvcc_compiler
.
nvcc_version
<
'7.5'
:
...
...
theano/sandbox/gpuarray/tests/test_conv_cuda_ndarray.py
浏览文件 @
645557f9
...
...
@@ -6,37 +6,36 @@ import sys
import
time
import
unittest
import
numpy
from
six.moves
import
xrange
from
nose.plugins.skip
import
SkipTest
imported_scipy_convolve2d
=
False
try
:
from
scipy.signal
import
convolve2d
imported_scipy_convolve2d
=
True
except
ImportError
:
pass
import
theano
from
theano
import
tensor
from
theano.tests.unittest_tools
import
seed_rng
# We let that import do the init of the back-end if needed.
from
.test_basic_ops
import
(
mode_with_gpu
,
mode_without_gpu
)
from
.test_basic_ops
import
mode_with_gpu
from
..type
import
GpuArrayType
from
..conv
import
GpuConv
from
theano.sandbox.gpuarray
import
dnn
import
pygpu
imported_scipy_convolve2d
=
False
try
:
from
scipy.signal
import
convolve2d
imported_scipy_convolve2d
=
True
except
ImportError
:
pass
gftensor4
=
GpuArrayType
(
'float32'
,
[
False
]
*
4
)
def
py_conv_valid_numpy
(
img
,
kern
):
assert
img
.
shape
[
1
]
==
kern
.
shape
[
1
]
outshp
=
(
img
.
shape
[
0
],
kern
.
shape
[
0
],
img
.
shape
[
2
]
-
kern
.
shape
[
2
]
+
1
,
img
.
shape
[
3
]
-
kern
.
shape
[
3
]
+
1
)
img
.
shape
[
2
]
-
kern
.
shape
[
2
]
+
1
,
img
.
shape
[
3
]
-
kern
.
shape
[
3
]
+
1
)
out
=
numpy
.
zeros
(
outshp
,
dtype
=
'float32'
)
for
b
in
xrange
(
out
.
shape
[
0
]):
for
k
in
xrange
(
out
.
shape
[
1
]):
...
...
@@ -60,7 +59,7 @@ def py_conv_full_numpy(img, kern):
padded_img
=
numpy
.
zeros
((
img
.
shape
[
0
],
img
.
shape
[
1
],
pad_rows
,
pad_cols
),
dtype
=
img
.
dtype
)
padded_img
[:,
:,
kern
.
shape
[
2
]
-
1
:
kern
.
shape
[
2
]
-
1
+
img
.
shape
[
2
],
kern
.
shape
[
3
]
-
1
:
kern
.
shape
[
3
]
-
1
+
img
.
shape
[
3
]]
=
img
kern
.
shape
[
3
]
-
1
:
kern
.
shape
[
3
]
-
1
+
img
.
shape
[
3
]]
=
img
return
py_conv_valid_numpy
(
padded_img
,
kern
)
...
...
@@ -73,10 +72,10 @@ def py_conv(img, kern, mode, subsample):
return
py_conv_scipy
(
img
,
kern
,
mode
,
subsample
)
elif
mode
==
'valid'
:
return
py_conv_valid_numpy
(
img
,
kern
)[:,
:,
::
subsample
[
0
],
::
subsample
[
1
]]
::
subsample
[
1
]]
elif
mode
==
'full'
:
return
py_conv_full_numpy
(
img
,
kern
)[:,
:,
::
subsample
[
0
],
::
subsample
[
1
]]
::
subsample
[
1
]]
else
:
raise
Exception
(
"Can't execute this kernel."
)
...
...
@@ -85,12 +84,12 @@ def py_conv_scipy(img, kern, mode, subsample):
assert
img
.
shape
[
1
]
==
kern
.
shape
[
1
]
if
mode
==
'valid'
:
outshp
=
(
img
.
shape
[
0
],
kern
.
shape
[
0
],
img
.
shape
[
2
]
-
kern
.
shape
[
2
]
+
1
,
img
.
shape
[
3
]
-
kern
.
shape
[
3
]
+
1
)
img
.
shape
[
2
]
-
kern
.
shape
[
2
]
+
1
,
img
.
shape
[
3
]
-
kern
.
shape
[
3
]
+
1
)
else
:
outshp
=
(
img
.
shape
[
0
],
kern
.
shape
[
0
],
img
.
shape
[
2
]
+
kern
.
shape
[
2
]
-
1
,
img
.
shape
[
3
]
+
kern
.
shape
[
3
]
-
1
)
img
.
shape
[
2
]
+
kern
.
shape
[
2
]
-
1
,
img
.
shape
[
3
]
+
kern
.
shape
[
3
]
-
1
)
out
=
numpy
.
zeros
(
outshp
,
dtype
=
'float32'
)
for
b
in
xrange
(
out
.
shape
[
0
]):
for
k
in
xrange
(
out
.
shape
[
1
]):
...
...
@@ -133,9 +132,9 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
dtype
=
'float32'
)
else
:
npy_img
=
theano
.
_asarray
(
numpy
.
arange
(
numpy
.
prod
(
ishape
))
.
reshape
(
ishape
),
dtype
=
'float32'
)
+
1
numpy
.
prod
(
ishape
))
.
reshape
(
ishape
),
dtype
=
'float32'
)
+
1
npy_kern
=
-
(
theano
.
_asarray
(
numpy
.
arange
(
numpy
.
prod
(
kshape
))
.
reshape
(
kshape
),
dtype
=
'float32'
)
+
1
)
numpy
.
prod
(
kshape
))
.
reshape
(
kshape
),
dtype
=
'float32'
)
+
1
)
img
=
pygpu
.
array
(
npy_img
)
kern
=
pygpu
.
array
(
npy_kern
)
...
...
@@ -191,15 +190,17 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
cpu_mflops
=
approx_fp
/
(
t1
-
t0
)
gpu_mflops
=
approx_fp
/
(
t2
-
t1
)
if
verbose
>
0
:
print
(
'
%15
s'
%
str
(
ishape
),
'
%15
s'
%
str
(
kshape
),
end
=
' '
,
file
=
sys
.
stdout
)
print
(
'
%12.5
f
%7.2
f
%7.2
f
%7.1
f'
%
(
approx_fp
,
cpu_mflops
,
gpu_mflops
,
(
t1
-
t0
)
/
(
t2
-
t1
)),
file
=
sys
.
stdout
)
print
(
'
%15
s'
%
str
(
ishape
),
'
%15
s'
%
str
(
kshape
),
end
=
' '
,
file
=
sys
.
stdout
)
print
(
'
%12.5
f
%7.2
f
%7.2
f
%7.1
f'
%
(
approx_fp
,
cpu_mflops
,
gpu_mflops
,
(
t1
-
t0
)
/
(
t2
-
t1
)),
file
=
sys
.
stdout
)
if
not
rval
:
print
(
(
'test_'
+
mode
+
' id='
+
str
(
id
)
+
' FAILED for ishape, kshape, mode, subsample,'
+
' img_stride, kern_stride, version'
,
ishape
,
kshape
,
mode
,
subsample
,
img_stride
,
kern_stride
,
version
)
,
file
=
sys
.
stdout
)
print
(
'test_'
+
mode
+
' id='
+
str
(
id
)
+
' FAILED for ishape, kshape, mode, subsample,'
+
' img_stride, kern_stride, version'
,
ishape
,
kshape
,
mode
,
subsample
,
img_stride
,
kern_stride
,
version
,
file
=
sys
.
stdout
)
diff
=
cpuval
-
gpuval
diffabs
=
numpy
.
absolute
(
diff
)
pr_diff
=
diffabs
/
numpy
.
absolute
(
cpuval
)
...
...
@@ -210,7 +211,7 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
nb_close
,
"/"
,
diff
.
size
))
print
(
"max relatif diff:"
,
(
pr_diff
.
max
(),
"avg rel diff:"
,
numpy
.
average
(
pr_diff
)))
if
not
rval
and
print_
!=
False
:
if
not
rval
and
print_
is
not
False
:
if
npy_img
.
shape
[
0
]
>
5
:
print
(
"img"
,
npy_img
[
0
])
print
(
"kern"
,
npy_kern
[
0
])
...
...
@@ -242,19 +243,20 @@ def exec_conv(version, shapes, verbose, random, mode,
istride
,
kstride
)
in
enumerate
(
shapes
):
ret
=
False
try
:
ret
=
_params_allgood
(
ishape
,
kshape
,
mode
,
subsample
=
subshape
,
img_stride
=
istride
,
kern_stride
=
kstride
,
version
=
ver
,
verbose
=
verbose
,
random
=
random
,
id
=
id
,
print_
=
print_
,
rtol
=
rtol
,
ones
=
ones
)
ret
=
_params_allgood
(
ishape
,
kshape
,
mode
,
subsample
=
subshape
,
img_stride
=
istride
,
kern_stride
=
kstride
,
version
=
ver
,
verbose
=
verbose
,
random
=
random
,
id
=
id
,
print_
=
print_
,
rtol
=
rtol
,
ones
=
ones
)
except
Exception
as
e
:
print
(
ver
,
id
,
(
ishape
,
kshape
,
subshape
,
istride
,
kstride
))
print
(
e
)
...
...
@@ -273,11 +275,11 @@ def exec_conv(version, shapes, verbose, random, mode,
def
get_basic_shapes
():
# basic test of image and kernel shape
# basic test of image and kernel shape
return
[((
1
,
1
,
1
,
1
),
(
1
,
1
,
1
,
1
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
((
1
,
1
,
2
,
2
),
(
1
,
1
,
2
,
2
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
((
1
,
1
,
3
,
3
),
(
1
,
1
,
2
,
2
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# basic test for unsquare kernel and image
# basic test for unsquare kernel and image
((
1
,
1
,
2
,
4
),
(
1
,
1
,
2
,
2
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
((
1
,
1
,
3
,
4
),
(
1
,
1
,
2
,
2
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
((
1
,
1
,
4
,
3
),
(
1
,
1
,
2
,
2
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
...
...
@@ -297,17 +299,17 @@ def get_shapes(imshp=(1, 1), kshp=(1, 1), subsample=(1, 1),
((
3
,
1
)
+
imshp
,
(
1
,
1
)
+
kshp
,
subsample
,
img_stride
,
kern_stride
),
# nkern only
((
1
,
1
)
+
imshp
,
(
2
,
1
)
+
kshp
,
subsample
,
img_stride
,
kern_stride
),
#batch and nkern
#
batch and nkern
((
3
,
1
)
+
imshp
,
(
2
,
1
)
+
kshp
,
subsample
,
img_stride
,
kern_stride
),
#batch and stack
#
batch and stack
((
3
,
2
)
+
imshp
,
(
1
,
2
)
+
kshp
,
subsample
,
img_stride
,
kern_stride
),
#stack and nkern
#
stack and nkern
((
1
,
2
)
+
imshp
,
(
2
,
2
)
+
kshp
,
subsample
,
img_stride
,
kern_stride
),
#batch, nkern and stack
#
batch, nkern and stack
((
2
,
2
)
+
imshp
,
(
2
,
2
)
+
kshp
,
subsample
,
img_stride
,
kern_stride
),
#batch, nkern and stack
#
batch, nkern and stack
((
3
,
2
)
+
imshp
,
(
4
,
2
)
+
kshp
,
subsample
,
img_stride
,
kern_stride
)
]
]
def
get_shapes2
(
scales_img
=
(
1
,
1
),
scales_kern
=
(
1
,
1
),
subsample
=
(
1
,
1
),
...
...
@@ -344,7 +346,6 @@ def get_shapes2(scales_img=(1, 1), scales_kern=(1, 1), subsample=(1, 1),
def
get_valid_shapes
():
# img shape, kern shape, subsample shape
shapes
=
get_basic_shapes
()
...
...
@@ -360,39 +361,36 @@ def get_valid_shapes():
# test subsample done in a separate fct
shapes
+=
[
# other test
((
2
,
1
,
2
,
2
),
(
1
,
1
,
2
,
2
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
,
((
3
,
2
,
4
,
4
),
(
4
,
2
,
4
,
4
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
,
((
4
,
1
,
10
,
10
),
(
1
,
1
,
2
,
2
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
,
((
1
,
1
,
4
,
4
),
(
1
,
1
,
2
,
3
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
,
((
4
,
1
,
10
,
10
),
(
1
,
1
,
2
,
3
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
,
((
4
,
1
,
10
,
10
),
(
1
,
1
,
2
,
10
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
,
((
4
,
1
,
20
,
10
),
(
1
,
1
,
2
,
10
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
,
((
3
,
2
,
8
,
8
),
(
4
,
2
,
4
,
4
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# stack, nkern, bsize
,
((
3
,
2
,
8
,
6
),
(
4
,
2
,
4
,
4
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# stack, nkern, bsize, non-square image
,
((
3
,
2
,
8
,
6
),
(
4
,
2
,
4
,
3
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# stack, nkern, bsize, non-square image, non-square kern
,
((
3
,
2
,
8
,
6
),
(
4
,
2
,
4
,
6
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# stack, nkern, bsize ,non-square image, non-square kern, kernsize==imgsize on one dim
,
((
16
,
5
,
64
,
64
),
(
8
,
5
,
8
,
8
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# a big one
,
((
16
,
1
,
28
,
28
),
(
20
,
1
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# MNIST LeNET layer 1
,
((
20
,
16
,
32
,
32
),
(
1
,
16
,
28
,
28
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# layer 1 backprop to weights
,
((
60
,
20
,
28
,
28
),
(
10
,
20
,
5
,
5
),
(
1
,
1
),
(
2
,
2
),
(
1
,
1
))
# added a test case that fail from test_nnet.py.test_conv_nnet2
,
((
10
,
5
,
28
,
28
),
(
10
,
5
,
5
,
5
),
(
1
,
1
),
(
2
,
2
),
(
1
,
1
))
# test precedent but reduced that triger the error
# Test more than maxThreadsDim0
,
((
2
,
4
,
13
,
1050
),
(
3
,
4
,
10
,
11
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
,
((
2
,
4
,
1050
,
13
),
(
3
,
4
,
10
,
11
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
]
shapes
+=
[
((
60
,
1
,
28
,
28
),
(
20
,
1
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# test_lenet_28 1 layers
,
((
60
,
20
,
12
,
12
),
(
30
,
20
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# test_lenet_28 2 layers
,
((
60
,
30
,
8
,
8
),
(
20
,
30
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# test_lenet_28 bprop 1 full
,
((
20
,
60
,
12
,
12
),
(
30
,
60
,
8
,
8
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# test_lenet_28 bprop 2 valid
# , ((1,60,28,28),(20,60,24,24), (1, 1), (1, 1), (1, 1))#test_lenet_28 bprop 2 valid
,
((
10
,
1
,
64
,
64
),
(
20
,
1
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# test_lenet_64 1 layers
,
((
10
,
20
,
29
,
29
),
(
30
,
20
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# test_lenet_64 2 layers
,
((
10
,
30
,
23
,
23
),
(
20
,
30
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# test_lenet_64 full
# , ((20,10,29,29),(30,10,23,23), (1, 1), (1, 1), (1, 1))#test_lenet_64 bprop 1
# , ((1,10,64,64),(20,10,58,58), (1, 1), (1, 1), (1, 1))#test_lenet_64 bprop 2
]
# other test
((
2
,
1
,
2
,
2
),
(
1
,
1
,
2
,
2
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
((
3
,
2
,
4
,
4
),
(
4
,
2
,
4
,
4
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
((
4
,
1
,
10
,
10
),
(
1
,
1
,
2
,
2
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
((
1
,
1
,
4
,
4
),
(
1
,
1
,
2
,
3
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
((
4
,
1
,
10
,
10
),
(
1
,
1
,
2
,
3
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
((
4
,
1
,
10
,
10
),
(
1
,
1
,
2
,
10
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
((
4
,
1
,
20
,
10
),
(
1
,
1
,
2
,
10
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
((
3
,
2
,
8
,
8
),
(
4
,
2
,
4
,
4
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# stack, nkern, bsize
((
3
,
2
,
8
,
6
),
(
4
,
2
,
4
,
4
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# stack, nkern, bsize, non-square image
((
3
,
2
,
8
,
6
),
(
4
,
2
,
4
,
3
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# stack, nkern, bsize, non-square image, non-square kern
((
3
,
2
,
8
,
6
),
(
4
,
2
,
4
,
6
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# stack, nkern, bsize ,non-square image, non-square kern, kernsize==imgsize on one dim
((
16
,
5
,
64
,
64
),
(
8
,
5
,
8
,
8
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# a big one
((
16
,
1
,
28
,
28
),
(
20
,
1
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# MNIST LeNET layer 1
((
20
,
16
,
32
,
32
),
(
1
,
16
,
28
,
28
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# layer 1 backprop to weights
((
60
,
20
,
28
,
28
),
(
10
,
20
,
5
,
5
),
(
1
,
1
),
(
2
,
2
),
(
1
,
1
)),
# added a test case that fail from test_nnet.py.test_conv_nnet2
((
10
,
5
,
28
,
28
),
(
10
,
5
,
5
,
5
),
(
1
,
1
),
(
2
,
2
),
(
1
,
1
)),
# test precedent but reduced that triger the error
# Test more than maxThreadsDim0
((
2
,
4
,
13
,
1050
),
(
3
,
4
,
10
,
11
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
((
2
,
4
,
1050
,
13
),
(
3
,
4
,
10
,
11
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
]
shapes
+=
[((
60
,
1
,
28
,
28
),
(
20
,
1
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# test_lenet_28 1 layers
((
60
,
20
,
12
,
12
),
(
30
,
20
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# test_lenet_28 2 layers
((
60
,
30
,
8
,
8
),
(
20
,
30
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# test_lenet_28 bprop 1 full
((
20
,
60
,
12
,
12
),
(
30
,
60
,
8
,
8
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# test_lenet_28 bprop 2 valid
((
10
,
1
,
64
,
64
),
(
20
,
1
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# test_lenet_64 1 layers
((
10
,
20
,
29
,
29
),
(
30
,
20
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# test_lenet_64 2 layers
((
10
,
30
,
23
,
23
),
(
20
,
30
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# test_lenet_64 full
]
return
shapes
...
...
@@ -428,43 +426,35 @@ def test_full():
shapes
+=
[
# other test
((
2
,
1
,
2
,
2
),
(
1
,
1
,
2
,
2
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
,
((
3
,
2
,
4
,
4
),
(
4
,
2
,
4
,
4
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
,
((
4
,
1
,
10
,
10
),
(
1
,
1
,
2
,
2
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
,
((
1
,
1
,
4
,
4
),
(
1
,
1
,
2
,
3
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
,
((
4
,
1
,
10
,
10
),
(
1
,
1
,
2
,
3
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
,
((
4
,
1
,
10
,
10
),
(
1
,
1
,
2
,
10
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
,
((
4
,
1
,
20
,
10
),
(
1
,
1
,
2
,
10
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
,
((
3
,
2
,
8
,
8
),
(
4
,
2
,
4
,
4
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# stack, nkern, bsize
,
((
3
,
2
,
8
,
6
),
(
4
,
2
,
4
,
4
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# stack, nkern, bsize, non-square image
,
((
3
,
2
,
8
,
6
),
(
4
,
2
,
4
,
3
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# stack, nkern, bsize, non-square image, non-square kern
,
((
3
,
2
,
8
,
6
),
(
4
,
2
,
4
,
6
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# stack, nkern, bsize ,non-square image, non-square kern, kernsize==imgsize on one dim
,
((
16
,
5
,
64
,
64
),
(
8
,
5
,
8
,
8
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# a big one
,
((
16
,
1
,
28
,
28
),
(
20
,
1
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# MNIST LeNET layer 1
,
((
20
,
16
,
32
,
32
),
(
1
,
16
,
28
,
28
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# layer 1 backprop to weights
((
2
,
1
,
2
,
2
),
(
1
,
1
,
2
,
2
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
((
3
,
2
,
4
,
4
),
(
4
,
2
,
4
,
4
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
((
4
,
1
,
10
,
10
),
(
1
,
1
,
2
,
2
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
((
1
,
1
,
4
,
4
),
(
1
,
1
,
2
,
3
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
((
4
,
1
,
10
,
10
),
(
1
,
1
,
2
,
3
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
((
4
,
1
,
10
,
10
),
(
1
,
1
,
2
,
10
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
((
4
,
1
,
20
,
10
),
(
1
,
1
,
2
,
10
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
((
3
,
2
,
8
,
8
),
(
4
,
2
,
4
,
4
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# stack, nkern, bsize
((
3
,
2
,
8
,
6
),
(
4
,
2
,
4
,
4
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# stack, nkern, bsize, non-square image
((
3
,
2
,
8
,
6
),
(
4
,
2
,
4
,
3
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# stack, nkern, bsize, non-square image, non-square kern
((
3
,
2
,
8
,
6
),
(
4
,
2
,
4
,
6
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# stack, nkern, bsize ,non-square image, non-square kern, kernsize==imgsize on one dim
((
16
,
5
,
64
,
64
),
(
8
,
5
,
8
,
8
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# a big one
((
16
,
1
,
28
,
28
),
(
20
,
1
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# MNIST LeNET layer 1
((
20
,
16
,
32
,
32
),
(
1
,
16
,
28
,
28
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# layer 1 backprop to weights
# other test
,
((
3
,
1
,
1
,
1
),
(
2
,
1
,
5
,
3
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# kernel bigger then image
,
((
3
,
2
,
1
,
1
),
(
4
,
2
,
1
,
1
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
,
((
3
,
2
,
4
,
4
),
(
4
,
2
,
2
,
6
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
,
((
3
,
2
,
4
,
4
),
(
4
,
2
,
8
,
6
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# kernel bigger then image
,
((
4
,
2
,
10
,
10
),
(
3
,
2
,
2
,
12
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
]
((
3
,
1
,
1
,
1
),
(
2
,
1
,
5
,
3
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# kernel bigger then image
((
3
,
2
,
1
,
1
),
(
4
,
2
,
1
,
1
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
((
3
,
2
,
4
,
4
),
(
4
,
2
,
2
,
6
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
((
3
,
2
,
4
,
4
),
(
4
,
2
,
8
,
6
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# kernel bigger then image
((
4
,
2
,
10
,
10
),
(
3
,
2
,
2
,
12
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
]
shapes
+=
[
# ((60,1,28,28),(20,1,5,5), (1, 1), (1, 1), (1, 1))#test_lenet_28 1 layers
# , ((60,20,12,12),(30,20,5,5), (1, 1), (1, 1), (1, 1))#test_lenet_28 2 layers
((
60
,
30
,
8
,
8
),
(
20
,
30
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# test_lenet_28 bprop 1 full
# , ((20,60,12,12),(30,60,8,8), (1, 1), (1, 1), (1, 1))#test_lenet_28 bprop 2 valid
# , ((1,60,28,28),(20,60,24,24), (1, 1), (1, 1), (1, 1))#test_lenet_28 bprop 2 valid
# , ((10,1,64,64),(20,1,7,7), (1, 1), (1, 1), (1, 1))#test_lenet_64 1 layers
# , ((10,20,29,29),(30,20,7,7), (1, 1), (1, 1), (1, 1))#test_lenet_64 2 layers
,
((
10
,
30
,
23
,
23
),
(
20
,
30
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# test_lenet_64 full
# , ((20,10,29,29),(30,10,23,23), (1, 1), (1, 1), (1, 1))#test_lenet_64 bprop 1
# , ((1,10,64,64),(20,10,58,58), (1, 1), (1, 1), (1, 1))#test_lenet_64 bprop 2
# Test more than maxThreadsDim0
,
((
2
,
4
,
13
,
1050
),
(
3
,
4
,
10
,
11
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
,
((
2
,
4
,
1050
,
13
),
(
3
,
4
,
10
,
11
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
]
((
60
,
30
,
8
,
8
),
(
20
,
30
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# test_lenet_28 bprop 1 full
((
10
,
30
,
23
,
23
),
(
20
,
30
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# test_lenet_64 full
# Test more than maxThreadsDim0
((
2
,
4
,
13
,
1050
),
(
3
,
4
,
10
,
11
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
((
2
,
4
,
1050
,
13
),
(
3
,
4
,
10
,
11
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
]
version
=
[
-
1
]
verbose
=
0
...
...
@@ -481,7 +471,7 @@ def test_subsample():
((
4
,
2
,
10
,
10
),
(
3
,
2
,
2
,
2
),
(
1
,
3
),
(
1
,
1
),
(
1
,
1
)),
((
4
,
2
,
10
,
10
),
(
3
,
2
,
2
,
2
),
(
3
,
3
),
(
1
,
1
),
(
1
,
1
)),
((
4
,
2
,
10
,
10
),
(
3
,
2
,
2
,
2
),
(
3
,
1
),
(
1
,
1
),
(
1
,
1
))
]
]
shapes
+=
get_shapes2
(
scales_img
=
(
2
,
2
),
subsample
=
(
1
,
1
))
shapes
+=
get_shapes2
(
scales_img
=
(
2
,
2
),
subsample
=
(
1
,
2
))
shapes
+=
get_shapes2
(
scales_img
=
(
2
,
2
),
subsample
=
(
2
,
1
))
...
...
@@ -562,7 +552,6 @@ class TestConv2DGPU(unittest.TestCase):
for
mode
in
[
'valid'
,
'full'
]:
for
shapes
in
[((
3
,
2
,
8
,
8
),
(
4
,
2
,
5
,
5
),
(
8
,
8
)),
((
3
,
2
,
8
,
8
),
(
4
,
2
,
5
,
5
),
(
5
,
8
)),
#((3, 2, 8, 8), (4, 2, 5, 5), (8, 5)),
# We use only the number of columns.
]:
...
...
@@ -580,47 +569,45 @@ def benchmark():
shapes_valid
=
[
# test_lenet_28 shape
((
20
,
60
,
12
,
12
),
(
30
,
60
,
8
,
8
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
,
((
60
,
20
,
12
,
12
),
(
30
,
20
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
,
((
60
,
1
,
28
,
28
),
(
20
,
1
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
,
((
1
,
60
,
28
,
28
),
(
20
,
60
,
24
,
24
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
((
20
,
60
,
12
,
12
),
(
30
,
60
,
8
,
8
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
,
# valid
((
60
,
20
,
12
,
12
),
(
30
,
20
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
((
60
,
1
,
28
,
28
),
(
20
,
1
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
((
1
,
60
,
28
,
28
),
(
20
,
60
,
24
,
24
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
# test_lenet_32 shape
,
((
20
,
60
,
14
,
14
),
(
30
,
60
,
10
,
10
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
,
((
60
,
20
,
14
,
14
),
(
30
,
20
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
,
((
60
,
1
,
32
,
32
),
(
20
,
1
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
,
((
1
,
60
,
32
,
32
),
(
20
,
60
,
28
,
28
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
((
20
,
60
,
14
,
14
),
(
30
,
60
,
10
,
10
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
((
60
,
20
,
14
,
14
),
(
30
,
20
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
((
60
,
1
,
32
,
32
),
(
20
,
1
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
((
1
,
60
,
32
,
32
),
(
20
,
60
,
28
,
28
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
# test_lenet_64 shape
,
((
10
,
20
,
29
,
29
),
(
30
,
20
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
,
((
20
,
10
,
29
,
29
),
(
30
,
10
,
23
,
23
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
,
((
10
,
1
,
64
,
64
),
(
20
,
1
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
,
((
1
,
10
,
64
,
64
),
(
20
,
10
,
58
,
58
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
((
10
,
20
,
29
,
29
),
(
30
,
20
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
((
20
,
10
,
29
,
29
),
(
30
,
10
,
23
,
23
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
((
10
,
1
,
64
,
64
),
(
20
,
1
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
((
1
,
10
,
64
,
64
),
(
20
,
10
,
58
,
58
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
# test_lenet_108 shape
,
((
10
,
20
,
51
,
51
),
(
30
,
20
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
,
((
20
,
10
,
51
,
51
),
(
30
,
10
,
45
,
45
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
,
((
10
,
1
,
108
,
108
),
(
20
,
1
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
,
((
1
,
10
,
108
,
108
),
(
20
,
10
,
102
,
102
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
((
10
,
20
,
51
,
51
),
(
30
,
20
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
((
20
,
10
,
51
,
51
),
(
30
,
10
,
45
,
45
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
((
10
,
1
,
108
,
108
),
(
20
,
1
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
((
1
,
10
,
108
,
108
),
(
20
,
10
,
102
,
102
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
# test_lenet_256 shape
,
((
2
,
20
,
124
,
124
),
(
30
,
20
,
9
,
9
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
,
((
20
,
2
,
124
,
124
),
(
30
,
2
,
116
,
116
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
,
((
2
,
1
,
256
,
256
),
(
20
,
1
,
9
,
9
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
,
((
1
,
2
,
256
,
256
),
(
20
,
2
,
248
,
248
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# valid
]
((
2
,
20
,
124
,
124
),
(
30
,
20
,
9
,
9
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
((
20
,
2
,
124
,
124
),
(
30
,
2
,
116
,
116
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
((
2
,
1
,
256
,
256
),
(
20
,
1
,
9
,
9
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
((
1
,
2
,
256
,
256
),
(
20
,
2
,
248
,
248
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# valid
]
shapes_full
=
[
# test_lenet_28 shape
((
60
,
30
,
8
,
8
),
(
20
,
30
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# full
((
60
,
30
,
8
,
8
),
(
20
,
30
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# full
# test_lenet_32 shape
,
((
60
,
30
,
10
,
10
),
(
20
,
30
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# full conv_full_patch_stack_padded' N=1
((
60
,
30
,
10
,
10
),
(
20
,
30
,
5
,
5
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# full conv_full_patch_stack_padded' N=1
# test_lenet_64 shape
,
((
10
,
30
,
23
,
23
),
(
20
,
30
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# full conv_full_patch_stack_padded' N=3
((
10
,
30
,
23
,
23
),
(
20
,
30
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# full conv_full_patch_stack_padded' N=3
# test_lenet_108 shape
,
((
10
,
30
,
45
,
45
),
(
20
,
30
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# full 'conv_full_patch_stack_padded' N=9
((
10
,
30
,
45
,
45
),
(
20
,
30
,
7
,
7
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# full 'conv_full_patch_stack_padded' N=9
# test_lenet_256 shape
,
((
2
,
30
,
116
,
116
),
(
20
,
30
,
9
,
9
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
# full conv_reference_full
]
((
2
,
30
,
116
,
116
),
(
20
,
30
,
9
,
9
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
# full conv_reference_full
]
# shapes_valid=shapes_valid[-1:]
# shapes_full=shapes_full[-1:]
version
=
[
-
1
]
verbose
=
1
random
=
True
...
...
@@ -640,6 +627,6 @@ def test_stack_rows_segfault_070312():
kern
=
theano
.
shared
(
numpy
.
random
.
rand
(
1
,
80
,
9
,
9
)
.
astype
(
'float32'
))
out
=
theano
.
shared
(
numpy
.
random
.
rand
(
1
,
2
,
2
,
3
)
.
astype
(
'float32'
))
op
=
theano
.
tensor
.
nnet
.
conv
.
ConvOp
(
imshp
=
(
80
,
96
,
96
),
kshp
=
(
9
,
9
),
nkern
=
1
,
bsize
=
1
)
nkern
=
1
,
bsize
=
1
)
f
=
theano
.
function
([],
[],
updates
=
[(
out
,
op
(
img
,
kern
))],
mode
=
mode_with_gpu
)
f
()
theano/sandbox/gpuarray/tests/test_neighbours.py
浏览文件 @
645557f9
import
unittest
from
theano.tensor.nnet.tests
import
test_neighbours
# We let that import do the init of the back-end if needed.
from
.test_basic_ops
import
(
mode_with_gpu
,
mode_without_gpu
)
from
.test_basic_ops
import
mode_with_gpu
from
..neighbours
import
GpuImages2Neibs
...
...
theano/sandbox/gpuarray/tests/test_nnet.py
浏览文件 @
645557f9
from
__future__
import
print_function
from
nose.plugins.skip
import
SkipTest
import
numpy
import
unittest
...
...
@@ -7,8 +7,6 @@ import theano
import
theano.tensor
as
T
import
theano.tests.unittest_tools
as
utt
from
theano.sandbox
import
gpuarray
# We let that import do the init of the back-end if needed.
from
.test_basic_ops
import
(
mode_with_gpu
,
mode_without_gpu
)
...
...
@@ -36,15 +34,13 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
n_in
=
4098
n_out
=
4099
x
=
T
.
fmatrix
(
'x'
)
y
=
T
.
lvector
(
'y'
)
b
=
T
.
fvector
(
'b'
)
#W = T.fmatrix('W')
# we precompute the dot with big shape before to allow the test of
# GpuCrossentropySoftmax1HotWithBiasDx to don't fail with the error
#(the launch timed out and was terminated) on GPU card not
#
(the launch timed out and was terminated) on GPU card not
# powerful enough. We need the big shape to check for corner
# case.
dot_result
=
T
.
fmatrix
(
'dot_result'
)
...
...
@@ -54,7 +50,6 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
xx
=
numpy
.
asarray
(
numpy
.
random
.
rand
(
batch_size
,
n_in
),
dtype
=
numpy
.
float32
)
#?????yy = numpy.ones((batch_size,),dtype='float32')
yy
=
numpy
.
ones
((
batch_size
,),
dtype
=
'int32'
)
b_values
=
numpy
.
zeros
((
n_out
,),
dtype
=
'float32'
)
W_values
=
numpy
.
asarray
(
numpy
.
random
.
rand
(
n_in
,
n_out
),
dtype
=
'float32'
)
...
...
@@ -71,8 +66,6 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
classify_gpu
=
theano
.
function
(
inputs
=
[
y
,
b
,
dot_result
],
outputs
=
[
loss
,
y_pred
,
dW
],
mode
=
mode_with_gpu
)
# theano.printing.debugprint(classify)
# theano.printing.debugprint(classify_gpu)
assert
any
([
isinstance
(
node
.
op
,
T
.
nnet
.
CrossentropySoftmaxArgmax1HotWithBias
)
...
...
@@ -97,12 +90,10 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx():
We check that we loop when their is too much threads
"""
n_in
=
1000
batch_size
=
4097
n_out
=
1250
if
not
isinstance
(
mode_with_gpu
,
theano
.
compile
.
DebugMode
):
n_in
=
4098
n_out
=
4099
# Seed numpy.random with config.unittests.rseed
...
...
@@ -137,25 +128,7 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx():
rtol
=
1e-5
atol
=
1e-6
if
not
numpy
.
allclose
(
cpu_out
,
gpu_out
,
rtol
=
rtol
,
atol
=
atol
):
abs_err
,
rel_err
=
T
.
numeric_grad
.
abs_rel_err
(
cpu_out
,
gpu_out
)
scaled_err
=
numpy
.
minimum
(
abs_err
/
atol
,
rel_err
/
rtol
)
max_i
=
scaled_err
.
argmax
()
print
(
'max err index:'
,
max_i
,
max_i
/
batch_size
,
end
=
' '
)
print
(
max_i
%
batch_size
,
max_i
/
n_out
,
max_i
&
n_out
)
print
(
'At that index:'
)
print
(
'err:'
,
scaled_err
.
flatten
()[
max_i
])
print
(
'absolute error:'
,
abs_err
.
flatten
()[
max_i
])
print
(
'relative error:'
,
rel_err
.
flatten
()[
max_i
])
print
(
'cpu_out:'
,
cpu_out
.
flatten
()[
max_i
])
print
(
'gpu_out:'
,
gpu_out
.
flatten
()[
max_i
])
print
(
'softmax_output_value:'
,
softmax_output_value
.
flatten
()[
max_i
])
print
(
'dnll_value:'
,
dnll_value
[
max_i
/
n_out
])
print
(
'y_idx_value:'
,
y_idx_value
[
max_i
/
n_out
])
assert
False
,
"numpy.allclose(cpu_out, gpu_out, rtol=
%
s, atol=
%
s)"
%
(
rtol
,
atol
)
utt
.
assert_allclose
(
cpu_out
,
gpu_out
,
rtol
=
rtol
,
atol
=
atol
)
def
test_softmax_with_bias_float16
():
...
...
@@ -166,6 +139,7 @@ def test_softmax_with_bias_float16():
softmax_with_bias_unittest_template
(
dtypeInput
=
'float32'
,
dtypeBias
=
'float16'
)
def
test_softmax_with_bias_float32
():
softmax_with_bias_unittest_template
(
dtypeInput
=
'float32'
,
dtypeBias
=
'float32'
)
...
...
@@ -188,6 +162,7 @@ def softmax_with_bias_unittest_template(dtypeInput, dtypeBias):
TODO: check that we loop when there are too many threads. (THIS IS
NOT IMPLEMENTED)
"""
x
=
T
.
matrix
(
'x'
,
dtype
=
dtypeInput
)
b
=
T
.
vector
(
'b'
,
dtype
=
dtypeBias
)
...
...
@@ -228,9 +203,11 @@ def softmax_with_bias_unittest_template(dtypeInput, dtypeBias):
def
test_softmax_float16
():
softmax_unittest_template
(
'float16'
)
def
test_softmax_float32
():
softmax_unittest_template
(
'float32'
)
def
test_softmax_float64
():
softmax_unittest_template
(
'float64'
)
...
...
theano/sandbox/gpuarray/tests/test_type.py
浏览文件 @
645557f9
import
operator
import
numpy
import
theano
...
...
@@ -25,7 +23,6 @@ def test_deep_copy():
def
test_values_eq_approx
():
a
=
rand_gpuarray
(
20
,
dtype
=
'float32'
)
g
=
GpuArrayType
(
dtype
=
'float32'
,
broadcastable
=
(
False
,))(
'g'
)
assert
GpuArrayType
.
values_eq_approx
(
a
,
a
)
b
=
a
.
copy
()
b
[
0
]
=
numpy
.
asarray
(
b
[
0
])
+
1.
...
...
theano/sandbox/gpuarray/type.py
浏览文件 @
645557f9
...
...
@@ -184,7 +184,7 @@ class GpuArrayType(Type):
@staticmethod
def
may_share_memory
(
a
,
b
):
if
(
not
isinstance
(
a
,
gpuarray
.
GpuArray
)
or
not
isinstance
(
b
,
gpuarray
.
GpuArray
)):
not
isinstance
(
b
,
gpuarray
.
GpuArray
)):
return
False
return
pygpu
.
gpuarray
.
may_share_memory
(
a
,
b
)
...
...
@@ -200,11 +200,12 @@ class GpuArrayType(Type):
self
.
broadcastable
==
other
.
broadcastable
)
def
convert_variable
(
self
,
var
):
if
(
type
(
self
)
==
type
(
var
.
type
)
and
self
.
typecode
==
var
.
type
.
typecode
and
self
.
ndim
==
var
.
type
.
ndim
and
vt
=
var
.
type
if
(
type
(
self
)
==
type
(
vt
)
and
self
.
typecode
==
vt
.
typecode
and
self
.
ndim
==
vt
.
ndim
and
all
(
sb
==
ob
or
ob
for
sb
,
ob
in
zip
(
self
.
broadcastable
,
v
ar
.
type
.
broadcastable
))):
v
t
.
broadcastable
))):
return
theano
.
tensor
.
patternbroadcast
(
var
,
self
.
broadcastable
)
def
__hash__
(
self
):
...
...
theano/tests/test_flake8.py
浏览文件 @
645557f9
...
...
@@ -157,24 +157,11 @@ whitelist_flake8 = [
"sandbox/linalg/ops.py"
,
"sandbox/linalg/__init__.py"
,
"sandbox/linalg/tests/test_linalg.py"
,
"sandbox/gpuarray/basic_ops.py"
,
"sandbox/gpuarray/nnet.py"
,
"sandbox/gpuarray/elemwise.py"
,
"sandbox/gpuarray/type.py"
,
"sandbox/gpuarray/__init__.py"
,
"sandbox/gpuarray/kernel_codegen.py"
,
"sandbox/gpuarray/conv.py"
,
"sandbox/gpuarray/neighbours.py"
,
"sandbox/gpuarray/tests/test_subtensor.py"
,
"sandbox/gpuarray/tests/test_scan.py"
,
"sandbox/gpuarray/tests/test_neighbours.py"
,
"sandbox/gpuarray/tests/test_conv_cuda_ndarray.py"
,
"sandbox/gpuarray/tests/test_type.py"
,
"sandbox/gpuarray/tests/test_opt.py"
,
"sandbox/gpuarray/tests/test_blas.py"
,
"sandbox/gpuarray/tests/test_elemwise.py"
,
"sandbox/gpuarray/tests/test_nnet.py"
,
"sandbox/gpuarray/tests/test_basic_ops.py"
,
"scan_module/scan_utils.py"
,
"scan_module/scan_views.py"
,
"scan_module/scan.py"
,
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论