Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
239b6d80
提交
239b6d80
authored
9月 23, 2014
作者:
Frédéric Bastien
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #2117 from abergeron/dnn
Add support for subsampling and gradient to GpuDnnConv
上级
025d484e
dedc323d
隐藏空白字符变更
内嵌
并排
正在显示
6 个修改的文件
包含
677 行增加
和
422 行删除
+677
-422
MANIFEST.in
MANIFEST.in
+1
-0
setup.py
setup.py
+1
-1
test_types.py
theano/gof/tests/test_types.py
+67
-1
type.py
theano/gof/type.py
+98
-2
dnn.py
theano/sandbox/cuda/dnn.py
+287
-140
test_conv_cuda_ndarray.py
theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
+223
-278
没有找到文件。
MANIFEST.in
浏览文件 @
239b6d80
...
@@ -2,6 +2,7 @@ global-include *.txt
...
@@ -2,6 +2,7 @@ global-include *.txt
global-include *.c
global-include *.c
global-include *.cu
global-include *.cu
global-include *.cuh
global-include *.cuh
global-include *.h
global-include *.sh
global-include *.sh
global-include *.pkl
global-include *.pkl
recursive-include docs
recursive-include docs
...
...
setup.py
浏览文件 @
239b6d80
...
@@ -193,7 +193,7 @@ def do_setup():
...
@@ -193,7 +193,7 @@ def do_setup():
install_requires
=
[
'numpy>=1.5.0'
,
'scipy>=0.7.2'
],
install_requires
=
[
'numpy>=1.5.0'
,
'scipy>=0.7.2'
],
package_data
=
{
package_data
=
{
''
:
[
'*.txt'
,
'*.rst'
,
'*.cu'
,
'*.cuh'
,
'*.c'
,
'*.sh'
,
'*.pkl'
,
''
:
[
'*.txt'
,
'*.rst'
,
'*.cu'
,
'*.cuh'
,
'*.c'
,
'*.sh'
,
'*.pkl'
,
'ChangeLog'
],
'
*.h'
,
'
ChangeLog'
],
'theano.misc'
:
[
'*.sh'
]
'theano.misc'
:
[
'*.sh'
]
},
},
scripts
=
[
'bin/theano-cache'
,
'bin/theano-nose'
,
'bin/theano-test'
],
scripts
=
[
'bin/theano-cache'
,
'bin/theano-nose'
,
'bin/theano-test'
],
...
...
theano/gof/tests/test_types.py
浏览文件 @
239b6d80
import
numpy
import
theano
from
theano.gof.type
import
*
from
theano
import
Op
,
Apply
from
theano.tensor
import
TensorType
from
theano.gof.type
import
CDataType
# todo: test generic
# todo: test generic
class
ProdOp
(
Op
):
__props__
=
()
def
make_node
(
self
,
i
):
return
Apply
(
self
,
[
i
],
[
CDataType
(
'void *'
,
'py_decref'
)()])
def
c_support_code
(
self
):
return
"""
void py_decref(void *p) {
Py_XDECREF((PyObject *)p);
}
"""
def
c_code
(
self
,
node
,
name
,
inps
,
outs
,
sub
):
return
"""
Py_XDECREF(
%(out)
s);
%(out)
s = (void *)
%(inp)
s;
Py_INCREF(
%(inp)
s);
"""
%
dict
(
out
=
outs
[
0
],
inp
=
inps
[
0
])
def
c_code_cache_version
(
self
):
return
(
0
,)
class
GetOp
(
Op
):
__props__
=
()
def
make_node
(
self
,
c
):
return
Apply
(
self
,
[
c
],
[
TensorType
(
'float32'
,
(
False
,))()])
def
c_support_code
(
self
):
return
"""
void py_decref(void *p) {
Py_XDECREF((PyObject *)p);
}
"""
def
c_code
(
self
,
node
,
name
,
inps
,
outs
,
sub
):
return
"""
Py_XDECREF(
%(out)
s);
%(out)
s = (PyArrayObject *)
%(inp)
s;
Py_INCREF(
%(out)
s);
"""
%
dict
(
out
=
outs
[
0
],
inp
=
inps
[
0
])
def
c_code_cache_version
(
self
):
return
(
0
,)
def
test_cdata
():
i
=
TensorType
(
'float32'
,
(
False
,))()
c
=
ProdOp
()(
i
)
i2
=
GetOp
()(
c
)
# This should be a passthrough function for vectors
f
=
theano
.
function
([
i
],
i2
)
v
=
numpy
.
random
.
randn
(
9
)
.
astype
(
'float32'
)
v2
=
f
(
v
)
assert
(
v2
==
v
)
.
all
()
theano/gof/type.py
浏览文件 @
239b6d80
...
@@ -2,6 +2,8 @@
...
@@ -2,6 +2,8 @@
__docformat__
=
"restructuredtext en"
__docformat__
=
"restructuredtext en"
from
theano.compat
import
PY3
from
theano.gof
import
utils
from
theano.gof
import
utils
from
theano.gof.utils
import
MethodNotDefined
,
object2
from
theano.gof.utils
import
MethodNotDefined
,
object2
from
theano.gof
import
graph
from
theano.gof
import
graph
...
@@ -158,7 +160,7 @@ class CLinkerType(CLinkerObject):
...
@@ -158,7 +160,7 @@ class CLinkerType(CLinkerObject):
c_extract_code
=
self
.
c_extract
(
name
,
sub
,
check_input
))
c_extract_code
=
self
.
c_extract
(
name
,
sub
,
check_input
))
def
c_cleanup
(
self
,
name
,
sub
):
def
c_cleanup
(
self
,
name
,
sub
):
"""
Optional:
Return c code to clean up after `c_extract`.
"""Return c code to clean up after `c_extract`.
This returns C code that should deallocate whatever `c_extract`
This returns C code that should deallocate whatever `c_extract`
allocated or decrease the reference counts. Do not decrease
allocated or decrease the reference counts. Do not decrease
...
@@ -250,7 +252,7 @@ class PureType(object):
...
@@ -250,7 +252,7 @@ class PureType(object):
# If filter_inplace is defined, it will be called instead of
# If filter_inplace is defined, it will be called instead of
# filter() This is to allow reusing the old allocated memory. As
# filter() This is to allow reusing the old allocated memory. As
# of this writing this is used only when we transfer new data to a
# of this writing this is used only when we transfer new data to a
# shared variable on the gpu.
# shared variable on the gpu.
#def filter_inplace(value, storage, strict=False, allow_downcast=None)
#def filter_inplace(value, storage, strict=False, allow_downcast=None)
...
@@ -470,3 +472,97 @@ class Generic(SingletonType):
...
@@ -470,3 +472,97 @@ class Generic(SingletonType):
return
self
.
__class__
.
__name__
return
self
.
__class__
.
__name__
generic
=
Generic
()
generic
=
Generic
()
class
CDataType
(
Type
):
"""
Represents opaque C data to be passed around. The intent is to
ease passing arbitrary data between ops C code.
"""
def
__init__
(
self
,
ctype
,
freefunc
=
None
):
"""
Build a type made to represent a C pointer in theano.
:param ctype: The type of the pointer (complete with the `*`)
:param freefunc: a function to call to free the pointer. This
function must have a `void` return and take a
single pointer argument.
"""
assert
isinstance
(
ctype
,
basestring
)
self
.
ctype
=
ctype
if
freefunc
is
not
None
:
assert
isinstance
(
freefunc
,
basestring
)
self
.
freefunc
=
freefunc
def
__eq__
(
self
,
other
):
return
(
type
(
self
)
==
type
(
other
)
and
self
.
ctype
==
other
.
ctype
,
self
.
freefunc
==
other
.
freefunc
)
def
__hash__
(
self
):
return
hash
((
type
(
self
),
self
.
ctype
,
self
.
freefunc
))
def
filter
(
self
,
data
,
strict
=
False
,
allow_downcast
=
None
):
if
data
is
not
None
:
raise
TypeError
(
"only None is valid"
)
def
is_valid_value
(
self
,
a
):
return
a
is
None
def
c_declare
(
self
,
name
,
sub
,
check_input
=
True
):
return
"""
%(ctype)
s
%(name)
s;
"""
%
dict
(
ctype
=
self
.
ctype
,
name
=
name
)
def
c_init
(
self
,
name
,
sub
):
return
"
%(name)
s = NULL;"
%
dict
(
name
=
name
)
def
c_extract
(
self
,
name
,
sub
,
check_input
=
True
):
if
PY3
:
s
=
"""
%(name)
s = (
%(ctype)
s)PyCapsule_GetPointer(py_
%(name)
s, NULL);
if (
%(name)
s == NULL)
%(fail)
s
"""
else
:
s
=
"""
%(name)
s = (
%(ctype)
s)PyCObject_AsVoidPtr(py_
%(name)
s);
"""
return
s
%
dict
(
name
=
name
,
ctype
=
self
.
ctype
,
fail
=
sub
[
'fail'
])
def
c_sync
(
self
,
name
,
sub
):
freefunc
=
self
.
freefunc
if
freefunc
is
None
:
freefunc
=
"NULL"
s
=
"""
Py_XDECREF(py_
%(name)
s);
if (
%(name)
s == NULL) {
py_
%(name)
s = Py_None;
Py_INCREF(py_
%(name)
s);
} else """
if
PY3
:
s
+=
"""{
py_
%(name)
s = PyCapsule_New((void *)
%(name)
s, NULL,
(void (*)(void *))
%(freefunc)
s);
}"""
else
:
s
+=
"""{
py_
%(name)
s = PyCObject_FromVoidPtr((void *)
%(name)
s,
(void (*)(void *))
%(freefunc)
s);
}"""
if
self
.
freefunc
is
not
None
:
s
+=
"""
if (py_
%(name)
s == NULL) {
%(freefunc)
s(
%(name)
s); }
"""
return
s
%
dict
(
name
=
name
,
freefunc
=
freefunc
)
def
c_cleanup
(
self
,
name
,
sub
):
# No need to do anything here since the CObject/Capsule will
# free the data for us when released.
return
""
def
c_code_cache_version
(
self
):
return
(
1
,)
def
__str__
(
self
):
return
"
%
s{
%
s}"
%
(
self
.
__class__
.
__name__
,
self
.
ctype
)
theano/sandbox/cuda/dnn.py
浏览文件 @
239b6d80
...
@@ -2,32 +2,53 @@ import copy
...
@@ -2,32 +2,53 @@ import copy
import
os
import
os
import
theano
import
theano
from
theano
import
Apply
from
theano
import
Apply
,
tensor
from
theano
import
tensor
from
theano
.gof.type
import
CDataType
from
theano.compat.six
import
StringIO
from
theano.compat.six
import
StringIO
from
theano.sandbox.cuda.type
import
CudaNdarrayType
from
theano.sandbox.cuda.type
import
CudaNdarrayType
from
theano.sandbox.cuda
import
GpuOp
from
theano.sandbox.cuda
import
GpuOp
from
theano.sandbox.cuda.basic_ops
import
(
as_cuda_ndarray_variable
,
from
theano.sandbox.cuda.basic_ops
import
(
as_cuda_ndarray_variable
,
gpu_contiguous
)
gpu_contiguous
)
from
theano.sandbox.cuda.blas
import
GpuConv
from
theano.sandbox.cuda.blas
import
GpuConv
from
theano.compat
import
PY3
class
GpuDnnConv
(
GpuOp
):
from
theano.sandbox.cuda.nvcc_compiler
import
NVCC_compiler
__props__
=
(
'border_mode'
,)
def
__init__
(
self
,
border_mode
):
class
DnnBase
(
GpuOp
):
self
.
border_mode
=
border_mode
"""
Creates a handle for cudnn and pulls in the cudnn libraries and headers.
"""
def
c_headers
(
self
):
return
[
'cudnn.h'
,
'cudnn_helper.h'
]
def
make_node
(
self
,
img
,
kern
):
def
c_header_dirs
(
self
):
if
img
.
type
.
ndim
!=
4
:
return
[
os
.
path
.
dirname
(
__file__
)]
raise
TypeError
(
'img must be 4D tensor'
)
if
kern
.
type
.
ndim
!=
4
:
raise
TypeError
(
'kern must be 4D tensor'
)
broadcastable
=
(
img
.
type
.
broadcastable
[
0
],
def
c_libraries
(
self
):
kern
.
type
.
broadcastable
[
0
],
return
[
'cudnn'
]
False
,
False
)
def
c_support_code
(
self
):
return
"""
cudnnHandle_t _handle = NULL;
"""
def
c_init_code
(
self
):
if
PY3
:
error_out
=
"NULL"
else
:
error_out
=
""
return
[
"""{
cudnnStatus_t err;
if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not create cudnn handle:
%%
s",
cudnnGetErrorString(err));
return
%
s;
}
}"""
%
(
error_out
,)]
return
Apply
(
self
,
[
img
,
kern
],
[
CudaNdarrayType
(
broadcastable
)()])
class
GpuDnnConvDesc
(
GpuOp
):
__props__
=
(
'border_mode'
,
'subsample'
,
'conv_mode'
)
def
c_headers
(
self
):
def
c_headers
(
self
):
return
[
'cudnn.h'
,
'cudnn_helper.h'
]
return
[
'cudnn.h'
,
'cudnn_helper.h'
]
...
@@ -38,28 +59,121 @@ class GpuDnnConv(GpuOp):
...
@@ -38,28 +59,121 @@ class GpuDnnConv(GpuOp):
def
c_libraries
(
self
):
def
c_libraries
(
self
):
return
[
'cudnn'
]
return
[
'cudnn'
]
def
c_compiler
(
self
):
return
NVCC_compiler
def
__init__
(
self
,
border_mode
,
subsample
=
(
1
,
1
),
conv_mode
=
'conv'
):
assert
border_mode
in
(
'valid'
,
'full'
)
self
.
border_mode
=
border_mode
assert
len
(
subsample
)
==
2
self
.
subsample
=
subsample
assert
conv_mode
in
(
'conv'
,
'cross'
)
self
.
conv_mode
=
conv_mode
def
make_node
(
self
,
img_shape
,
kern_shape
):
if
img_shape
.
type
.
ndim
!=
1
and
img_shape
.
type
.
dtype
!=
numpy
.
int64
:
raise
TypeError
(
'img must be 1D shape tensor'
)
if
kern_shape
.
type
.
ndim
!=
1
and
kern_shape
.
type
.
dtype
!=
numpy
.
int64
:
raise
TypeError
(
'kern must be 1D shape tensor'
)
return
Apply
(
self
,
[
img_shape
,
kern_shape
],
[
CDataType
(
"cudnnConvolutionDescriptor_t"
)()])
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
img_shape
,
kern_shape
=
inputs
desc
,
=
outputs
if
self
.
border_mode
==
"valid"
:
bmode
=
1
else
:
assert
self
.
border_mode
==
"full"
bmode
=
0
if
self
.
conv_mode
==
'conv'
:
conv_flag
=
'CUDNN_CONVOLUTION'
else
:
conv_flag
=
'CUDNN_CROSS_CORRELATION'
return
"""
{
cudnnStatus_t err;
int pad_h
%(name)
s;
int pad_w
%(name)
s;
if ((err = cudnnCreateConvolutionDescriptor(&
%(desc)
s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate convolution "
"descriptor:
%%
s", cudnnGetErrorString(err));
%(fail)
s
}
if (
%(bmode)
d == 1) {
pad_h
%(name)
s = 0;
pad_w
%(name)
s = 0;
} else if (
%(bmode)
d == 0) {
pad_h
%(name)
s = *(npy_int64 *)PyArray_GETPTR1(
%(kern_shape)
s, 2) - 1;
pad_w
%(name)
s = *(npy_int64 *)PyArray_GETPTR1(
%(kern_shape)
s, 3) - 1;
} else {
PyErr_SetString(PyExc_ValueError, "bad border mode");
%(fail)
s
}
err = cudnnSetConvolutionDescriptorEx(
%(desc)
s,
*(npy_int64 *)PyArray_GETPTR1(
%(img_shape)
s, 0),
*(npy_int64 *)PyArray_GETPTR1(
%(img_shape)
s, 1),
*(npy_int64 *)PyArray_GETPTR1(
%(img_shape)
s, 2),
*(npy_int64 *)PyArray_GETPTR1(
%(img_shape)
s, 3),
*(npy_int64 *)PyArray_GETPTR1(
%(kern_shape)
s, 0),
*(npy_int64 *)PyArray_GETPTR1(
%(kern_shape)
s, 2),
*(npy_int64 *)PyArray_GETPTR1(
%(kern_shape)
s, 3),
pad_h
%(name)
s,
pad_w
%(name)
s,
%(subsx)
d,
%(subsy)
d, 1, 1,
%(conv_flag)
s
);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor:
%%
s",
cudnnGetErrorString(err));
%(fail)
s
}
}
"""
%
dict
(
name
=
name
,
img_shape
=
img_shape
,
kern_shape
=
kern_shape
,
desc
=
desc
,
bmode
=
bmode
,
conv_flag
=
conv_flag
,
fail
=
sub
[
'fail'
],
subsx
=
self
.
subsample
[
0
],
subsy
=
self
.
subsample
[
1
])
def
c_code_cache_version
(
self
):
return
(
1
,)
class
GpuDnnConvBase
(
DnnBase
):
__props__
=
()
def
make_node
(
self
,
img
,
kern
,
desc
):
if
img
.
type
.
ndim
!=
4
:
raise
TypeError
(
'img must be 4D tensor'
)
if
kern
.
type
.
ndim
!=
4
:
raise
TypeError
(
'kern must be 4D tensor'
)
broadcastable
=
(
img
.
type
.
broadcastable
[
0
],
kern
.
type
.
broadcastable
[
0
],
False
,
False
)
return
Apply
(
self
,
[
img
,
kern
,
desc
],
[
CudaNdarrayType
(
broadcastable
)()])
def
c_support_code_struct
(
self
,
node
,
struct_id
):
def
c_support_code_struct
(
self
,
node
,
struct_id
):
return
"""
return
"""
cudnnHandle_t handle
%(id)
d;
cudnnTensor4dDescriptor_t input
%(id)
d;
cudnnTensor4dDescriptor_t input
%(id)
d;
cudnnTensor4dDescriptor_t output
%(id)
d;
cudnnTensor4dDescriptor_t output
%(id)
d;
cudnnFilterDescriptor_t kerns
%(id)
d;
cudnnFilterDescriptor_t kerns
%(id)
d;
cudnnConvolutionDescriptor_t op
%(id)
d;
"""
%
dict
(
id
=
struct_id
)
"""
%
dict
(
id
=
struct_id
)
def
c_init_code_struct
(
self
,
node
,
struct_id
,
sub
):
def
c_init_code_struct
(
self
,
node
,
struct_id
,
sub
):
return
"""
return
"""
handle
%(id)
d = NULL
;
cudnnStatus_t err
%(id)
d
;
input
%(id)
d = NULL;
input
%(id)
d = NULL;
output
%(id)
d = NULL;
output
%(id)
d = NULL;
kerns
%(id)
d = NULL;
kerns
%(id)
d = NULL;
op
%(id)
d = NULL;
cudnnStatus_t err
%(id)
d;
if ((err
%(id)
d = cudnnCreate(&handle
%(id)
d)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not create cudnn handle:
%%
s",
cudnnGetErrorString(err
%(id)
d));
%(fail)
s
}
if ((err
%(id)
d = cudnnCreateTensor4dDescriptor(&input
%(id)
d)) != CUDNN_STATUS_SUCCESS) {
if ((err
%(id)
d = cudnnCreateTensor4dDescriptor(&input
%(id)
d)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
"(inp):
%%
s", cudnnGetErrorString(err
%(id)
d));
"(inp):
%%
s", cudnnGetErrorString(err
%(id)
d));
...
@@ -75,11 +189,6 @@ if ((err%(id)d = cudnnCreateFilterDescriptor(&kerns%(id)d)) != CUDNN_STATUS_SUCC
...
@@ -75,11 +189,6 @@ if ((err%(id)d = cudnnCreateFilterDescriptor(&kerns%(id)d)) != CUDNN_STATUS_SUCC
cudnnGetErrorString(err
%(id)
d));
cudnnGetErrorString(err
%(id)
d));
%(fail)
s
%(fail)
s
}
}
if ((err
%(id)
d = cudnnCreateConvolutionDescriptor(&op
%(id)
d)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate convolution "
"descriptor:
%%
s", cudnnGetErrorString(err
%(id)
d));
%(fail)
s
}
"""
%
dict
(
id
=
struct_id
,
fail
=
sub
[
'fail'
])
"""
%
dict
(
id
=
struct_id
,
fail
=
sub
[
'fail'
])
def
c_cleanup_code_struct
(
self
,
node
,
struct_id
):
def
c_cleanup_code_struct
(
self
,
node
,
struct_id
):
...
@@ -87,123 +196,109 @@ if ((err%(id)d = cudnnCreateConvolutionDescriptor(&op%(id)d)) != CUDNN_STATUS_SU
...
@@ -87,123 +196,109 @@ if ((err%(id)d = cudnnCreateConvolutionDescriptor(&op%(id)d)) != CUDNN_STATUS_SU
cudnnDestroyTensor4dDescriptor(input
%(id)
d);
cudnnDestroyTensor4dDescriptor(input
%(id)
d);
cudnnDestroyTensor4dDescriptor(output
%(id)
d);
cudnnDestroyTensor4dDescriptor(output
%(id)
d);
cudnnDestroyFilterDescriptor(kerns
%(id)
d);
cudnnDestroyFilterDescriptor(kerns
%(id)
d);
cudnnDestroyConvolutionDescriptor(op
%(id)
d);
cudnnDestroy(handle
%(id)
d);
"""
%
dict
(
id
=
struct_id
)
"""
%
dict
(
id
=
struct_id
)
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
def
c_set_tensor4d
(
self
,
var
,
desc
,
err
,
fail
):
img
,
kern
=
inputs
out
,
=
outputs
if
self
.
border_mode
==
"valid"
:
bmode
=
1
else
:
assert
self
.
border_mode
==
"full"
bmode
=
0
return
"""
return
"""
cudnnStatus_t err
%(name)
s;
%(err)
s = cudnnSetTensor4dDescriptorEx(
int pad_w
%(name)
s;
%(desc)
s, CUDNN_DATA_FLOAT,
int pad_h
%(name)
s;
CudaNdarray_HOST_DIMS(
%(var)
s)[0],
CudaNdarray_HOST_DIMS(
%(var)
s)[1],
if (!CudaNdarray_is_c_contiguous(
%(img)
s)) {
CudaNdarray_HOST_DIMS(
%(var)
s)[2],
PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
CudaNdarray_HOST_DIMS(
%(var)
s)[3],
%(fail)
s
CudaNdarray_HOST_STRIDES(
%(var)
s)[0]?CudaNdarray_HOST_STRIDES(
%(var)
s)[0]:CudaNdarray_HOST_DIMS(
%(var)
s)[2]*CudaNdarray_HOST_DIMS(
%(var)
s)[3]*CudaNdarray_HOST_DIMS(
%(var)
s)[1],
}
CudaNdarray_HOST_STRIDES(
%(var)
s)[1]?CudaNdarray_HOST_STRIDES(
%(var)
s)[1]:CudaNdarray_HOST_DIMS(
%(var)
s)[2]*CudaNdarray_HOST_DIMS(
%(var)
s)[3],
CudaNdarray_HOST_STRIDES(
%(var)
s)[2]?CudaNdarray_HOST_STRIDES(
%(var)
s)[2]:CudaNdarray_HOST_DIMS(
%(var)
s)[3],
if (!CudaNdarray_is_c_contiguous(
%(kerns)
s)) {
CudaNdarray_HOST_STRIDES(
%(var)
s)[3]?CudaNdarray_HOST_STRIDES(
%(var)
s)[3]:1
PyErr_SetString(PyExc_ValueError, "Only contiguous filters are supported.");
%(fail)
s
}
err
%(name)
s = cudnnSetTensor4dDescriptorEx(
input
%(id)
d, CUDNN_DATA_FLOAT,
CudaNdarray_HOST_DIMS(
%(img)
s)[0],
CudaNdarray_HOST_DIMS(
%(img)
s)[1],
CudaNdarray_HOST_DIMS(
%(img)
s)[2],
CudaNdarray_HOST_DIMS(
%(img)
s)[3],
CudaNdarray_HOST_STRIDES(
%(img)
s)[0],
CudaNdarray_HOST_STRIDES(
%(img)
s)[1],
CudaNdarray_HOST_STRIDES(
%(img)
s)[2],
CudaNdarray_HOST_STRIDES(
%(img)
s)[3]
);
);
if (
err
%(name
)
s != CUDNN_STATUS_SUCCESS) {
if (
%(err
)
s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set tensor4d descriptor:
%%
s",
PyErr_Format(PyExc_RuntimeError, "could not set tensor4d descriptor:
%%
s",
cudnnGetErrorString(
err
%(name
)
s));
cudnnGetErrorString(
%(err
)
s));
%(fail)
s
%(fail)
s
}
}
err
%(name)
s = cudnnSetFilterDescriptor(
"""
%
dict
(
var
=
var
,
err
=
err
,
desc
=
desc
,
fail
=
fail
)
kerns
%(id)
d, CUDNN_DATA_FLOAT,
CudaNdarray_HOST_DIMS(
%(kerns)
s)[0],
def
c_set_filter
(
self
,
var
,
desc
,
err
,
fail
):
CudaNdarray_HOST_DIMS(
%(kerns)
s)[1],
return
"""
CudaNdarray_HOST_DIMS(
%(kerns)
s)[2],
%(err)
s = cudnnSetFilterDescriptor(
CudaNdarray_HOST_DIMS(
%(kerns)
s)[3]
%(desc)
s, CUDNN_DATA_FLOAT,
CudaNdarray_HOST_DIMS(
%(var)
s)[0],
CudaNdarray_HOST_DIMS(
%(var)
s)[1],
CudaNdarray_HOST_DIMS(
%(var)
s)[2],
CudaNdarray_HOST_DIMS(
%(var)
s)[3]
);
);
if (
err
%(name
)
s != CUDNN_STATUS_SUCCESS) {
if (
%(err
)
s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set filter descriptor:
%%
s",
PyErr_Format(PyExc_RuntimeError, "could not set filter descriptor:
%%
s",
cudnnGetErrorString(
err
%(name
)
s));
cudnnGetErrorString(
%(err
)
s));
%(fail)
s
%(fail)
s
}
}
if (
%(bmode)
d == 1) {
"""
%
dict
(
var
=
var
,
desc
=
desc
,
err
=
err
,
fail
=
fail
)
pad_h
%(name)
s = 0;
pad_w
%(name)
s = 0;
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
} else if (
%(bmode)
d == 0) {
desc
=
inputs
[
2
]
pad_h
%(name)
s = CudaNdarray_HOST_DIMS(
%(kerns)
s)[2] - 1;
out
,
=
outputs
pad_w
%(name)
s = CudaNdarray_HOST_DIMS(
%(kerns)
s)[3] - 1;
} else {
checks
=
[]
PyErr_SetString(PyExc_ValueError, "bad border mode");
for
v
in
inputs
[:
2
]:
%(fail)
s
checks
.
append
(
"""
}
if (!CudaNdarray_is_c_contiguous(
%
s)) {
err
%(name)
s = cudnnSetConvolutionDescriptor(
PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
op
%(id)
d, input
%(id)
d, kerns
%(id)
d,
%
s
pad_h
%(name)
s,
pad_w
%(name)
s,
1, 1, 1, 1,
CUDNN_CONVOLUTION
);
if (err
%(name)
s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor:
%%
s",
cudnnGetErrorString(err
%(name)
s));
%(fail)
s
}
}
"""
%
(
v
,
sub
[
'fail'
]))
sets
=
[]
for
p
,
v
,
d
in
zip
(
inputs
[:
2
],
self
.
conv_inputs
,
self
.
conv_types
[:
2
]):
sets
.
append
(
getattr
(
self
,
'c_set_'
+
d
)(
p
,
v
+
str
(
sub
[
'struct_id'
]),
'err'
+
name
,
sub
[
'fail'
]))
set_out
=
getattr
(
self
,
'c_set_'
+
self
.
conv_types
[
2
])(
out
,
self
.
conv_output
+
str
(
sub
[
'struct_id'
]),
'err'
+
name
,
sub
[
'fail'
])
return
"""
cudnnStatus_t err
%(name)
s;
%(checks)
s
%(sets)
s
{
{
int out_dims[4];
int out_dims[4];
err
%(name)
s = cudnnGetOutputTensor4dDim(
err
%(name)
s = cudnnGetOutputTensor4dDim(
op
%(id)
d, CUDNN_CONVOLUTION_FWD,
%(desc)
s,
%(path)
s,
&out_dims[0], &out_dims[1],
&out_dims[0], &out_dims[1],
&out_dims[2], &out_dims[3]
&out_dims[2], &out_dims[3]
);
);
if (err
%(name)
s != CUDNN_STATUS_SUCCESS) {
if (err
%(name)
s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor:
%%
s",
PyErr_Format(PyExc_RuntimeError, "could not get output sizes:
%%
s",
cudnnGetErrorString(err
%(name)
s));
cudnnGetErrorString(err
%(name)
s));
%(fail)
s
%(fail)
s
}
}
if (CudaNdarray_prep_output(&
%(out)
s, 4, out_dims) != 0) {
// workaround for cudnn R1 bug
%(fail)
s
if (
%(path)
s == CUDNN_CONVOLUTION_WEIGHT_GRAD &&
(out_dims[0] != CudaNdarray_HOST_DIMS(
%(input2)
s)[1] ||
out_dims[1] != CudaNdarray_HOST_DIMS(
%(input1)
s)[1])) {
out_dims[0] = CudaNdarray_HOST_DIMS(
%(input2)
s)[1];
out_dims[1] = CudaNdarray_HOST_DIMS(
%(input1)
s)[1];
// This is a horrible hack that is unfortulately necessary
int *dd = (int *)
%(desc)
s;
out_dims[2] = dd[5];
out_dims[3] = dd[6];
}
if (CudaNdarray_prep_output(&
%(out)
s, 4, out_dims) != 0) {
%(fail)
s
}
}
}
}
err
%(name)
s = cudnnSetTensor4dDescriptorEx(
%(set_out)
s
output
%(id)
d, CUDNN_DATA_FLOAT,
CudaNdarray_HOST_DIMS(
%(out)
s)[0],
err
%(name)
s =
%(method)
s(
CudaNdarray_HOST_DIMS(
%(out)
s)[1],
_handle,
CudaNdarray_HOST_DIMS(
%(out)
s)[2],
%(input1_desc)
s, CudaNdarray_DEV_DATA(
%(input1)
s),
CudaNdarray_HOST_DIMS(
%(out)
s)[3],
%(input2_desc)
s, CudaNdarray_DEV_DATA(
%(input2)
s),
CudaNdarray_HOST_STRIDES(
%(out)
s)[0],
%(desc)
s,
CudaNdarray_HOST_STRIDES(
%(out)
s)[1],
%(output_desc)
s, CudaNdarray_DEV_DATA(
%(out)
s),
CudaNdarray_HOST_STRIDES(
%(out)
s)[2],
CudaNdarray_HOST_STRIDES(
%(out)
s)[3]
);
if (err
%(name)
s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set out descriptor:
%%
s",
cudnnGetErrorString(err
%(name)
s));
%(fail)
s
}
err
%(name)
s = cudnnConvolutionForward(
handle
%(id)
d,
input
%(id)
d, CudaNdarray_DEV_DATA(
%(img)
s),
kerns
%(id)
d, CudaNdarray_DEV_DATA(
%(kerns)
s),
op
%(id)
d,
output
%(id)
d, CudaNdarray_DEV_DATA(
%(out)
s),
CUDNN_RESULT_NO_ACCUMULATE
CUDNN_RESULT_NO_ACCUMULATE
);
);
if (err
%(name)
s != CUDNN_STATUS_SUCCESS) {
if (err
%(name)
s != CUDNN_STATUS_SUCCESS) {
...
@@ -211,25 +306,77 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
...
@@ -211,25 +306,77 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
cudnnGetErrorString(err
%(name)
s));
cudnnGetErrorString(err
%(name)
s));
%(fail)
s
%(fail)
s
}
}
"""
%
dict
(
img
=
img
,
kerns
=
kern
,
out
=
out
,
bmode
=
bmode
,
"""
%
dict
(
out
=
out
,
desc
=
desc
,
fail
=
sub
[
'fail'
],
id
=
sub
[
'struct_id'
],
fail
=
sub
[
'fail'
],
id
=
sub
[
'struct_id'
],
name
=
name
)
name
=
name
,
checks
=
'
\n
'
.
join
(
checks
),
sets
=
'
\n
'
.
join
(
sets
),
set_out
=
set_out
,
input1
=
inputs
[
0
],
input2
=
inputs
[
1
],
input1_desc
=
self
.
conv_inputs
[
0
]
+
str
(
sub
[
'struct_id'
]),
input2_desc
=
self
.
conv_inputs
[
1
]
+
str
(
sub
[
'struct_id'
]),
output_desc
=
self
.
conv_output
+
str
(
sub
[
'struct_id'
]),
method
=
self
.
conv_op
,
path
=
self
.
path_flag
)
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
return
(
4
,)
return
(
7
,)
class
GpuDnnConv
(
GpuDnnConvBase
):
conv_inputs
=
'input'
,
'kerns'
conv_output
=
'output'
conv_types
=
'tensor4d'
,
'filter'
,
'tensor4d'
conv_op
=
'cudnnConvolutionForward'
path_flag
=
'CUDNN_CONVOLUTION_FWD'
def
grad
(
self
,
inp
,
grads
):
img
,
kerns
,
desc
=
inp
top
,
=
grads
top
=
gpu_contiguous
(
top
)
d_img
=
GpuDnnConvGradI
()(
kerns
,
top
,
desc
)
d_kerns
=
GpuDnnConvGradW
()(
img
,
top
,
desc
)
return
d_img
,
d_kerns
,
theano
.
gradient
.
DisconnectedType
()()
def
connection_pattern
(
self
,
node
):
# not connected to desc
return
[[
1
],
[
1
],
[
0
]]
class
GpuDnnConvGradW
(
GpuDnnConvBase
):
conv_inputs
=
'input'
,
'output'
,
conv_output
=
'kerns'
conv_types
=
'tensor4d'
,
'tensor4d'
,
'filter'
path_flag
=
'CUDNN_CONVOLUTION_WEIGHT_GRAD'
conv_op
=
'cudnnConvolutionBackwardFilter'
class
GpuDnnConvGradI
(
GpuDnnConvBase
):
conv_inputs
=
'kerns'
,
'output'
,
conv_output
=
'input'
conv_types
=
'filter'
,
'tensor4d'
,
'tensor4d'
path_flag
=
'CUDNN_CONVOLUTION_DATA_GRAD'
conv_op
=
'cudnnConvolutionBackwardData'
from
theano.sandbox.cuda.opt
import
(
local_optimizer
,
gpu_contiguous
,
from
theano.sandbox.cuda.opt
import
(
local_optimizer
,
gpu_contiguous
,
gpu_optimizer
)
gpu_optimizer
)
def
dnn_conv
(
img
,
kerns
,
border_mode
=
'valid'
,
subsample
=
(
1
,
1
),
conv_mode
=
'conv'
):
img
=
gpu_contiguous
(
img
)
kerns
=
gpu_contiguous
(
kerns
)
desc
=
GpuDnnConvDesc
(
border_mode
=
border_mode
,
subsample
=
subsample
,
conv_mode
=
conv_mode
)(
img
.
shape
,
kerns
.
shape
)
return
GpuDnnConv
()(
img
,
kerns
,
desc
)
@local_optimizer
([
GpuConv
])
@local_optimizer
([
GpuConv
])
def
local_conv_dnn
(
node
):
def
local_conv_dnn
(
node
):
if
isinstance
(
node
.
op
,
GpuConv
):
if
isinstance
(
node
.
op
,
GpuConv
):
if
(
node
.
op
.
subsample
!=
(
1
,
1
)
or
if
node
.
op
.
border_mode
not
in
[
'full'
,
'valid'
]:
node
.
op
.
border_mode
not
in
[
'full'
,
'valid'
]):
return
return
img
,
kern
=
node
.
inputs
img
,
kern
=
node
.
inputs
border_mode
=
node
.
op
.
border_mode
border_mode
=
node
.
op
.
border_mode
return
[
GpuDnnConv
(
border_mode
)(
gpu_contiguous
(
img
),
subsample
=
node
.
op
.
subsample
gpu_contiguous
(
kern
))]
return
[
dnn_conv
(
gpu_contiguous
(
img
),
gpu_contiguous
(
kern
),
border_mode
=
border_mode
,
subsample
=
subsample
)]
gpu_optimizer
.
register
(
"conv_cudnn"
,
local_conv_dnn
,
'cudnn'
)
gpu_optimizer
.
register
(
"conv_cudnn"
,
local_conv_dnn
,
'cudnn'
)
theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
浏览文件 @
239b6d80
...
@@ -4,7 +4,7 @@ Tests for GPU convolution
...
@@ -4,7 +4,7 @@ Tests for GPU convolution
import
sys
import
sys
import
time
import
time
import
unittest
import
unittest
import
traceback
import
numpy
import
numpy
...
@@ -19,14 +19,14 @@ except ImportError:
...
@@ -19,14 +19,14 @@ except ImportError:
import
theano
import
theano
from
theano
import
tensor
from
theano
import
tensor
from
theano.gof.python25
import
any
from
theano.gof.python25
import
any
from
theano.tests.unittest_tools
import
seed_rng
from
theano.tests.unittest_tools
import
seed_rng
,
assert_allclose
# Skip test if cuda is not available.
# Skip test if cuda is not available.
from
theano.sandbox
import
cuda
from
theano.sandbox
import
cuda
if
cuda
.
cuda_available
==
False
:
if
cuda
.
cuda_available
==
False
:
raise
SkipTest
(
'Optional package cuda disabled'
)
raise
SkipTest
(
'Optional package cuda disabled'
)
from
theano.sandbox.cuda.dnn
import
GpuDnnConv
from
theano.sandbox.cuda.dnn
import
GpuDnnConv
,
GpuDnnConvBase
,
dnn_conv
#needed as the gpu conv don't have a perform implementation.
#needed as the gpu conv don't have a perform implementation.
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
if
theano
.
config
.
mode
==
'FAST_COMPILE'
:
...
@@ -48,7 +48,7 @@ if device_id is None:
...
@@ -48,7 +48,7 @@ if device_id is None:
enable_cuda
=
False
,
enable_cuda
=
False
,
test_driver
=
True
)
test_driver
=
True
)
device_id
=
theano
.
sandbox
.
cuda
.
use
.
device_number
device_id
=
theano
.
sandbox
.
cuda
.
use
.
device_number
cuda_ndarray
=
theano
.
sandbox
.
cuda
.
cuda_ndarray
.
cuda_ndarray
cuda_ndarray
=
theano
.
sandbox
.
cuda
.
cuda_ndarray
.
cuda_ndarray
device_prop
=
cuda_ndarray
.
device_properties
(
device_id
)
device_prop
=
cuda_ndarray
.
device_properties
(
device_id
)
...
@@ -173,40 +173,30 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
...
@@ -173,40 +173,30 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
npy_kern
=
npy_kern
[:,
:,
::
kern_stride
[
0
],
::
kern_stride
[
1
]]
npy_kern
=
npy_kern
[:,
:,
::
kern_stride
[
0
],
::
kern_stride
[
1
]]
t2
=
None
t2
=
None
rval
=
True
try
:
t0
=
time
.
time
()
t0
=
time
.
time
()
cpuval
=
py_conv
(
npy_img
,
npy_kern
,
mode
,
subsample
)
cpuval
=
py_conv
(
npy_img
,
npy_kern
,
mode
,
subsample
)
t1
=
time
.
time
()
t1
=
time
.
time
()
i
=
cuda_tensor4
()
i
=
cuda_tensor4
()
k
=
cuda_tensor4
()
k
=
cuda_tensor4
()
op
=
theano
.
sandbox
.
cuda
.
blas
.
GpuConv
(
border_mode
=
mode
,
op
=
theano
.
sandbox
.
cuda
.
blas
.
GpuConv
(
border_mode
=
mode
,
subsample
=
subsample
,
subsample
=
subsample
,
version
=
version
,
version
=
version
,
verbose
=
verbose
,
verbose
=
verbose
,
kshp
=
compile_kshp
)(
i
,
k
)
kshp
=
compile_kshp
)(
i
,
k
)
f
=
theano
.
function
([
i
,
k
],
op
,
mode
=
theano_mode
)
f
=
theano
.
function
([
i
,
k
],
op
,
mode
=
theano_mode
)
if
cls
is
not
None
:
if
cls
is
not
None
:
assert
any
([
isinstance
(
node
.
op
,
cls
)
assert
any
([
isinstance
(
node
.
op
,
cls
)
for
node
in
f
.
maker
.
fgraph
.
toposort
()]),
"Cannot find class
%
r in
%
r"
%
(
cls
,
f
.
maker
.
fgraph
.
toposort
())
for
node
in
f
.
maker
.
fgraph
.
toposort
()]),
"Cannot find class
%
r in
%
r"
%
(
cls
,
f
.
maker
.
fgraph
.
toposort
())
gpuval
=
f
(
img
,
kern
)
gpuval
=
f
(
img
,
kern
)
t2
=
time
.
time
()
t2
=
time
.
time
()
for
i
in
range
(
nb_iter
):
for
i
in
range
(
nb_iter
):
gpuval2
=
f
(
img
,
kern
)
gpuval2
=
f
(
img
,
kern
)
assert
(
numpy
.
asarray
(
gpuval
)
==
numpy
.
asarray
(
gpuval2
))
.
all
()
assert
numpy
.
allclose
(
numpy
.
asarray
(
gpuval
),
gpuval
=
numpy
.
asarray
(
gpuval
)
numpy
.
asarray
(
gpuval2
))
assert
gpuval
.
shape
==
cpuval
.
shape
,
(
"shape mismatch"
,
gpuval
.
shape
,
cpuval
.
shape
)
assert
(
numpy
.
asarray
(
gpuval
)
==
numpy
.
asarray
(
gpuval2
))
.
all
()
assert_allclose
(
cpuval
,
gpuval
,
rtol
=
rtol
,
atol
=
atol
)
gpuval
=
numpy
.
asarray
(
gpuval
)
assert
numpy
.
all
(
numpy
.
isfinite
(
gpuval
)),
gpuval
if
gpuval
.
shape
!=
cpuval
.
shape
:
print
>>
sys
.
stdout
,
"ERROR: shape mismatch"
,
print
>>
sys
.
stdout
,
gpuval
.
shape
,
cpuval
.
shape
rval
=
False
if
rval
:
rval
=
numpy
.
allclose
(
cpuval
,
gpuval
,
rtol
=
rtol
)
assert
numpy
.
all
(
numpy
.
isfinite
(
gpuval
)),
gpuval
except
NotImplementedError
,
e
:
print
>>
sys
.
stdout
,
'_params_allgood Failed allclose'
,
e
rval
=
False
if
(
t2
is
not
None
):
if
(
t2
is
not
None
):
if
mode
==
'valid'
:
if
mode
==
'valid'
:
...
@@ -221,37 +211,6 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
...
@@ -221,37 +211,6 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
print
>>
sys
.
stdout
,
'
%15
s'
%
str
(
ishape
),
'
%15
s'
%
str
(
kshape
),
print
>>
sys
.
stdout
,
'
%15
s'
%
str
(
ishape
),
'
%15
s'
%
str
(
kshape
),
print
>>
sys
.
stdout
,
'
%12.5
f
%7.2
f
%7.2
f
%7.1
f'
%
(
approx_fp
,
print
>>
sys
.
stdout
,
'
%12.5
f
%7.2
f
%7.2
f
%7.1
f'
%
(
approx_fp
,
cpu_mflops
,
gpu_mflops
,
(
t1
-
t0
)
/
(
t2
-
t1
))
cpu_mflops
,
gpu_mflops
,
(
t1
-
t0
)
/
(
t2
-
t1
))
if
not
rval
:
print
>>
sys
.
stdout
,
(
'test_'
+
mode
+
' id='
+
str
(
id
)
+
' FAILED for ishape, kshape, mode, subsample,'
+
' img_stride, kern_stride, version'
,
ishape
,
kshape
,
mode
,
subsample
,
img_stride
,
kern_stride
,
version
)
diff
=
cpuval
-
gpuval
diffabs
=
numpy
.
absolute
(
diff
)
pr_diff
=
diffabs
/
numpy
.
absolute
(
cpuval
)
nb_close
=
(
diffabs
<=
(
atol
+
rtol
*
numpy
.
absolute
(
gpuval
)))
.
sum
()
print
"max absolute diff:"
,
(
diffabs
.
max
(),
"avg abs diff:"
,
numpy
.
average
(
diffabs
))
print
"median abs diff:"
,
(
numpy
.
median
(
diffabs
),
"nb close:"
,
nb_close
,
"/"
,
diff
.
size
)
print
"max relatif diff:"
,
(
pr_diff
.
max
(),
"avg rel diff:"
,
numpy
.
average
(
pr_diff
))
if
not
rval
and
print_
!=
False
:
if
npy_img
.
shape
[
0
]
>
5
:
print
"img"
,
npy_img
[
0
]
print
"kern"
,
npy_kern
[
0
]
print
"gpu"
,
gpuval
[
0
][
0
]
print
"cpu"
,
cpuval
[
0
][
0
]
print
"diff"
,
diff
[
0
][
0
]
else
:
print
"img"
,
npy_img
print
"kern"
,
npy_kern
print
"gpu"
,
gpuval
print
"cpu"
,
cpuval
print
"diff"
,
diff
return
rval
def
exec_conv
(
version
,
shapes
,
verbose
,
random
,
mode
,
def
exec_conv
(
version
,
shapes
,
verbose
,
random
,
mode
,
...
@@ -259,46 +218,13 @@ def exec_conv(version, shapes, verbose, random, mode,
...
@@ -259,46 +218,13 @@ def exec_conv(version, shapes, verbose, random, mode,
theano_mode
=
theano_mode
,
cls
=
None
):
theano_mode
=
theano_mode
,
cls
=
None
):
if
verbose
>
0
:
if
verbose
>
0
:
_params_allgood_header
()
_params_allgood_header
()
nb_failed
=
0
nb_tests
=
0
failed_version
=
set
()
failed_id
=
[]
for
ver
in
version
:
for
ver
in
version
:
for
id
,
(
ishape
,
kshape
,
subshape
,
for
id
,
(
ishape
,
kshape
,
subshape
,
istride
,
kstride
)
in
enumerate
(
shapes
):
istride
,
kstride
)
in
enumerate
(
shapes
):
ret
=
False
yield
(
_params_allgood
,
ishape
,
kshape
,
mode
,
subshape
,
try
:
istride
,
kstride
,
ver
,
verbose
,
random
,
print_
,
id
,
ret
=
_params_allgood
(
ishape
,
rtol
,
1e-8
,
0
,
ones
,
None
,
theano_mode
,
cls
)
kshape
,
mode
,
subsample
=
subshape
,
img_stride
=
istride
,
kern_stride
=
kstride
,
version
=
ver
,
verbose
=
verbose
,
random
=
random
,
id
=
id
,
print_
=
print_
,
rtol
=
rtol
,
ones
=
ones
,
theano_mode
=
theano_mode
,
cls
=
cls
)
except
Exception
,
e
:
print
ver
,
id
,
(
ishape
,
kshape
,
subshape
,
istride
,
kstride
)
print
"Exception"
,
type
(
e
),
e
pass
if
not
ret
:
failed_version
.
add
(
ver
)
failed_id
.
append
(
id
)
nb_failed
+=
1
nb_tests
+=
1
if
nb_failed
>
0
:
print
"nb_failed"
,
nb_failed
,
"on"
,
nb_tests
,
print
"failed_version"
,
failed_version
,
"failed_id"
,
failed_id
assert
nb_failed
==
0
,
nb_failed
else
:
print
'Executed'
,
nb_tests
,
'different shapes'
def
get_basic_shapes
():
def
get_basic_shapes
():
...
@@ -453,8 +379,9 @@ def test_valid_0_2():
...
@@ -453,8 +379,9 @@ def test_valid_0_2():
shapes2
.
append
((
ishape
,
kshape
,
subshape
,
istride
,
kstride
))
shapes2
.
append
((
ishape
,
kshape
,
subshape
,
istride
,
kstride
))
shapes
=
shapes2
shapes
=
shapes2
exec_conv
(
version
,
shapes
,
verbose
,
random
,
'valid'
,
for
t
in
exec_conv
(
version
,
shapes
,
verbose
,
random
,
'valid'
,
print_
=
print_
,
ones
=
ones
,
rtol
=
1.1e-5
)
print_
=
print_
,
ones
=
ones
,
rtol
=
1.1e-5
):
yield
t
def
test_valid_1_3_11_12
():
def
test_valid_1_3_11_12
():
...
@@ -483,8 +410,9 @@ def test_valid_1_3_11_12():
...
@@ -483,8 +410,9 @@ def test_valid_1_3_11_12():
shapes2
.
append
((
ishape
,
kshape
,
subshape
,
istride
,
kstride
))
shapes2
.
append
((
ishape
,
kshape
,
subshape
,
istride
,
kstride
))
shapes
=
shapes2
shapes
=
shapes2
exec_conv
(
version
,
shapes
,
verbose
,
random
,
'valid'
,
for
t
in
exec_conv
(
version
,
shapes
,
verbose
,
random
,
'valid'
,
print_
=
print_
,
ones
=
ones
,
rtol
=
1.1e-5
)
print_
=
print_
,
ones
=
ones
,
rtol
=
1.1e-5
):
yield
t
def
test_valid_4
():
def
test_valid_4
():
...
@@ -515,8 +443,9 @@ def test_valid_4():
...
@@ -515,8 +443,9 @@ def test_valid_4():
shapes2
.
append
((
ishape
,
kshape
,
subshape
,
istride
,
kstride
))
shapes2
.
append
((
ishape
,
kshape
,
subshape
,
istride
,
kstride
))
shapes
=
shapes2
shapes
=
shapes2
exec_conv
(
version
,
shapes
,
verbose
,
random
,
'valid'
,
for
t
in
exec_conv
(
version
,
shapes
,
verbose
,
random
,
'valid'
,
print_
=
print_
,
ones
=
ones
,
rtol
=
1.1e-5
)
print_
=
print_
,
ones
=
ones
,
rtol
=
1.1e-5
):
yield
t
def
test_valid_5
():
def
test_valid_5
():
...
@@ -532,7 +461,6 @@ def test_valid_5():
...
@@ -532,7 +461,6 @@ def test_valid_5():
random
=
False
random
=
False
shapes2
=
[]
shapes2
=
[]
# print len(shapes)
for
id
,
(
ishape
,
kshape
,
subshape
,
istride
,
kstride
)
in
enumerate
(
shapes
):
for
id
,
(
ishape
,
kshape
,
subshape
,
istride
,
kstride
)
in
enumerate
(
shapes
):
oshape
=
[
ishape
[
0
]]
+
[
kshape
[
0
]]
+
list
(
numpy
.
asarray
(
ishape
[
2
:])
-
oshape
=
[
ishape
[
0
]]
+
[
kshape
[
0
]]
+
list
(
numpy
.
asarray
(
ishape
[
2
:])
-
numpy
.
asarray
(
kshape
[
2
:])
+
numpy
.
asarray
(
kshape
[
2
:])
+
...
@@ -545,10 +473,10 @@ def test_valid_5():
...
@@ -545,10 +473,10 @@ def test_valid_5():
if
subshape
==
(
1
,
1
):
if
subshape
==
(
1
,
1
):
shapes2
.
append
((
ishape
,
kshape
,
subshape
,
istride
,
kstride
))
shapes2
.
append
((
ishape
,
kshape
,
subshape
,
istride
,
kstride
))
shapes
=
shapes2
shapes
=
shapes2
# print len(shapes2)
exec_conv
(
version
,
shapes
,
verbose
,
random
,
'valid'
,
for
t
in
exec_conv
(
version
,
shapes
,
verbose
,
random
,
'valid'
,
print_
=
print_
,
ones
=
ones
,
rtol
=
1.1e-5
)
print_
=
print_
,
ones
=
ones
,
rtol
=
1.1e-5
):
yield
t
def
test_valid_7_8_13
():
def
test_valid_7_8_13
():
...
@@ -567,7 +495,6 @@ def test_valid_7_8_13():
...
@@ -567,7 +495,6 @@ def test_valid_7_8_13():
random
=
False
random
=
False
shapes2
=
[]
shapes2
=
[]
# print len(shapes)
for
id
,
(
ishape
,
kshape
,
subshape
,
istride
,
kstride
)
in
enumerate
(
shapes
):
for
id
,
(
ishape
,
kshape
,
subshape
,
istride
,
kstride
)
in
enumerate
(
shapes
):
oshape
=
[
ishape
[
0
]]
+
[
kshape
[
0
]]
+
list
(
numpy
.
asarray
(
ishape
[
2
:])
-
oshape
=
[
ishape
[
0
]]
+
[
kshape
[
0
]]
+
list
(
numpy
.
asarray
(
ishape
[
2
:])
-
numpy
.
asarray
(
kshape
[
2
:])
+
numpy
.
asarray
(
kshape
[
2
:])
+
...
@@ -580,10 +507,10 @@ def test_valid_7_8_13():
...
@@ -580,10 +507,10 @@ def test_valid_7_8_13():
if
subshape
==
(
1
,
1
):
if
subshape
==
(
1
,
1
):
shapes2
.
append
((
ishape
,
kshape
,
subshape
,
istride
,
kstride
))
shapes2
.
append
((
ishape
,
kshape
,
subshape
,
istride
,
kstride
))
shapes
=
shapes2
shapes
=
shapes2
# print len(shapes2)
exec_conv
(
version
,
shapes
,
verbose
,
random
,
'valid'
,
for
t
in
exec_conv
(
version
,
shapes
,
verbose
,
random
,
'valid'
,
print_
=
print_
,
ones
=
ones
,
rtol
=
1.1e-5
)
print_
=
print_
,
ones
=
ones
,
rtol
=
1.1e-5
):
yield
t
def
test_valid_9_10
():
def
test_valid_9_10
():
...
@@ -599,7 +526,6 @@ def test_valid_9_10():
...
@@ -599,7 +526,6 @@ def test_valid_9_10():
random
=
False
random
=
False
shapes2
=
[]
shapes2
=
[]
# print len(shapes)
for
id
,
(
ishape
,
kshape
,
subshape
,
istride
,
kstride
)
in
enumerate
(
shapes
):
for
id
,
(
ishape
,
kshape
,
subshape
,
istride
,
kstride
)
in
enumerate
(
shapes
):
oshape
=
[
ishape
[
0
]]
+
[
kshape
[
0
]]
+
list
(
numpy
.
asarray
(
ishape
[
2
:])
-
oshape
=
[
ishape
[
0
]]
+
[
kshape
[
0
]]
+
list
(
numpy
.
asarray
(
ishape
[
2
:])
-
numpy
.
asarray
(
kshape
[
2
:])
+
numpy
.
asarray
(
kshape
[
2
:])
+
...
@@ -611,19 +537,16 @@ def test_valid_9_10():
...
@@ -611,19 +537,16 @@ def test_valid_9_10():
if
subshape
==
(
1
,
1
):
if
subshape
==
(
1
,
1
):
shapes2
.
append
((
ishape
,
kshape
,
subshape
,
istride
,
kstride
))
shapes2
.
append
((
ishape
,
kshape
,
subshape
,
istride
,
kstride
))
shapes
=
shapes2
shapes
=
shapes2
# print len(shapes2)
exec_conv
(
version
,
shapes
,
verbose
,
random
,
'valid'
,
for
t
in
exec_conv
(
version
,
shapes
,
verbose
,
random
,
'valid'
,
print_
=
print_
,
ones
=
ones
,
rtol
=
1.1e-5
)
print_
=
print_
,
ones
=
ones
,
rtol
=
1.1e-5
):
yield
t
def
_test_valid
(
cls
,
mode
=
None
,
extra_shapes
=
[],
version
=
[
-
1
]):
def
_test_valid
(
cls
,
mode
=
None
,
extra_shapes
=
[],
version
=
[
-
1
]):
seed_rng
()
seed_rng
()
shapes
=
get_valid_shapes
()
shapes
=
get_valid_shapes
()
#shapes=shapes[400:426]
# I put -1 in case we forget to add version in the test to.
# I put -2 to test the reference version.
verbose
=
0
verbose
=
0
random
=
True
random
=
True
...
@@ -634,26 +557,30 @@ def _test_valid(cls, mode=None, extra_shapes=[], version=[-1]):
...
@@ -634,26 +557,30 @@ def _test_valid(cls, mode=None, extra_shapes=[], version=[-1]):
shapes
+=
extra_shapes
shapes
+=
extra_shapes
exec_conv
(
version
,
shapes
,
verbose
,
random
,
'valid'
,
return
exec_conv
(
version
,
shapes
,
verbose
,
random
,
'valid'
,
print_
=
print_
,
ones
=
ones
,
rtol
=
1.1e-5
,
print_
=
print_
,
ones
=
ones
,
rtol
=
1.1e-5
,
theano_mode
=
mode
,
cls
=
cls
)
theano_mode
=
mode
,
cls
=
cls
)
def
test_valid
():
def
test_valid
():
_test_valid
(
None
,
version
=
[
-
2
,
-
1
,
6
])
for
t
in
_test_valid
(
None
,
version
=
[
-
2
,
-
1
,
6
]):
yield
t
def
test_gemm_valid
():
def
test_gemm_valid
():
extra_shapes
=
get_shapes2
(
scales_img
=
(
2
,
2
),
img_stride
=
(
2
,
2
))
extra_shapes
=
get_shapes2
(
scales_img
=
(
2
,
2
),
img_stride
=
(
2
,
2
))
extra_shapes
+=
get_shapes2
(
scales_kern
=
(
2
,
2
),
kern_stride
=
(
2
,
2
))
extra_shapes
+=
get_shapes2
(
scales_kern
=
(
2
,
2
),
kern_stride
=
(
2
,
2
))
_test_valid
(
cuda
.
blas
.
BaseGpuCorrMM
,
for
t
in
_test_valid
(
cuda
.
blas
.
BaseGpuCorrMM
,
mode
=
theano_mode
.
including
(
"conv_gemm"
),
mode
=
theano_mode
.
including
(
"conv_gemm"
),
extra_shapes
=
extra_shapes
)
extra_shapes
=
extra_shapes
):
yield
t
def
test_dnn_valid
():
def
test_dnn_valid
():
_test_valid
(
GpuDnnConv
,
mode
=
theano_mode
.
including
(
"cudnn"
))
for
t
in
_test_valid
(
GpuDnnConv
,
mode
=
theano_mode
.
including
(
"cudnn"
)):
yield
t
def
_test_full
(
cls
,
mode
=
None
,
version
=
[
-
1
],
extra_shapes
=
[]):
def
_test_full
(
cls
,
mode
=
None
,
version
=
[
-
1
],
extra_shapes
=
[]):
...
@@ -710,30 +637,32 @@ def _test_full(cls, mode=None, version=[-1], extra_shapes=[]):
...
@@ -710,30 +637,32 @@ def _test_full(cls, mode=None, version=[-1], extra_shapes=[]):
,
((
1
,
1
,
44800
,
1
),
(
6
,
1
,
1
,
1
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
#This caused crash
,
((
1
,
1
,
44800
,
1
),
(
6
,
1
,
1
,
1
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
#This caused crash
]
]
# shapes=shapes[:277]
verbose
=
0
verbose
=
0
random
=
True
random
=
True
shapes
+=
extra_shapes
shapes
+=
extra_shapes
exec_conv
(
version
,
shapes
,
verbose
,
random
,
'full'
,
return
exec_conv
(
version
,
shapes
,
verbose
,
random
,
'full'
,
theano_mode
=
mode
,
cls
=
cls
)
theano_mode
=
mode
,
cls
=
cls
)
def
test_full
():
def
test_full
():
_test_full
(
None
,
version
=
[
-
2
,
-
1
,
0
,
1
,
2
,
3
,
4
,
5
])
for
t
in
_test_full
(
None
,
version
=
[
-
2
,
-
1
,
0
,
1
,
2
,
3
,
4
,
5
]):
yield
t
def
test_gemm_full
():
def
test_gemm_full
():
_test_full
(
cuda
.
blas
.
BaseGpuCorrMM
,
for
t
in
_test_full
(
cuda
.
blas
.
BaseGpuCorrMM
,
mode
=
theano_mode
.
including
(
"conv_gemm"
))
mode
=
theano_mode
.
including
(
"conv_gemm"
)):
yield
t
def
test_dnn_full
():
def
test_dnn_full
():
_test_full
(
GpuDnnConv
,
mode
=
theano_mode
.
including
(
"cudnn"
))
for
t
in
_test_full
(
GpuDnnConv
,
mode
=
theano_mode
.
including
(
"cudnn"
)):
yield
t
def
test_subsample
(
conv_gemm
=
False
):
def
_test_subsample
(
cls
,
mode
,
version_valid
=
[
-
1
],
version_full
=
[
-
1
]
):
seed_rng
()
seed_rng
()
shapes
=
[((
1
,
1
,
1
,
1
),
(
1
,
1
,
1
,
1
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
shapes
=
[((
1
,
1
,
1
,
1
),
(
1
,
1
,
1
,
1
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
)),
((
1
,
1
,
1
,
1
),
(
1
,
1
,
1
,
1
),
(
2
,
2
),
(
1
,
1
),
(
1
,
1
)),
((
1
,
1
,
1
,
1
),
(
1
,
1
,
1
,
1
),
(
2
,
2
),
(
1
,
1
),
(
1
,
1
)),
...
@@ -748,8 +677,6 @@ def test_subsample(conv_gemm=False):
...
@@ -748,8 +677,6 @@ def test_subsample(conv_gemm=False):
# We put only the version that implement the subsample to make the
# We put only the version that implement the subsample to make the
# test faster.
# test faster.
version_valid
=
[
-
2
,
-
1
,
1
,
3
,
11
,
12
]
version_full
=
[
-
2
,
-
1
]
verbose
=
0
verbose
=
0
random
=
True
random
=
True
print_
=
False
print_
=
False
...
@@ -757,26 +684,32 @@ def test_subsample(conv_gemm=False):
...
@@ -757,26 +684,32 @@ def test_subsample(conv_gemm=False):
if
ones
:
if
ones
:
random
=
False
random
=
False
if
conv_gemm
:
for
t
in
exec_conv
(
version_valid
,
shapes
,
verbose
,
random
,
'valid'
,
# Test the GpuCorrMM version
print_
=
print_
,
ones
=
ones
,
mode
=
theano_mode
.
including
(
"conv_gemm"
)
theano_mode
=
mode
,
cls
=
cls
):
cls
=
cuda
.
blas
.
BaseGpuCorrMM
yield
t
# dummy version; not used by GpuCorrMM so one version is enough
for
t
in
exec_conv
(
version_full
,
shapes
,
verbose
,
random
,
'full'
,
version_valid
=
version_full
=
[
-
1
]
print_
=
print_
,
ones
=
ones
,
else
:
theano_mode
=
mode
,
cls
=
cls
):
mode
=
theano_mode
yield
t
cls
=
None
exec_conv
(
version_valid
,
shapes
,
verbose
,
random
,
'valid'
,
print_
=
print_
,
ones
=
ones
,
def
test_subsample
():
theano_mode
=
mode
,
cls
=
cls
)
for
t
in
_test_subsample
(
None
,
theano_mode
,
exec_conv
(
version_full
,
shapes
,
verbose
,
random
,
'full'
,
version_valid
=
[
-
2
,
-
1
,
1
,
3
,
11
,
12
]
,
print_
=
print_
,
ones
=
ones
,
version_full
=
[
-
2
,
-
1
]):
theano_mode
=
mode
,
cls
=
cls
)
yield
t
def
test_gemm_subsample
():
def
test_gemm_subsample
():
test_subsample
(
conv_gemm
=
True
)
for
t
in
_test_subsample
(
cuda
.
blas
.
BaseGpuCorrMM
,
theano_mode
.
including
(
"conv_gemm"
)):
yield
t
def
test_dnn_subsample
():
for
t
in
_test_subsample
(
GpuDnnConv
,
theano_mode
.
including
(
'cudnn'
)):
yield
t
class
TestConv2DGPU
(
unittest
.
TestCase
):
class
TestConv2DGPU
(
unittest
.
TestCase
):
...
@@ -850,58 +783,127 @@ class TestConv2DGPU(unittest.TestCase):
...
@@ -850,58 +783,127 @@ class TestConv2DGPU(unittest.TestCase):
theano_mode
=
theano_mode_orig
theano_mode
=
theano_mode_orig
def
gemm_directly
(
bs
,
ch
,
nf
,
rImg1
,
rImg2
,
rFlt1
,
rFlt2
,
subsx
,
subsy
,
direction
):
ishape
=
(
bs
,
ch
,
rImg1
,
rImg2
)
kshape
=
(
nf
,
ch
,
rFlt1
,
rFlt2
)
subsample
=
(
subsx
,
subsy
)
npy_img
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
ishape
),
dtype
=
'float32'
)
npy_kern
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
kshape
),
dtype
=
'float32'
)
i
=
cuda_tensor4
()
k
=
cuda_tensor4
()
if
direction
==
'fprop'
:
cpuval
=
py_conv
(
npy_img
,
npy_kern
,
'valid'
,
subsample
)
op
=
theano
.
sandbox
.
cuda
.
blas
.
GpuCorrMM
(
border_mode
=
'valid'
,
subsample
=
subsample
)(
i
,
k
)
f
=
theano
.
function
([
i
,
k
],
op
,
mode
=
theano_mode
)
gpuval
=
f
(
npy_img
,
npy_kern
[:,:,::
-
1
,::
-
1
])
elif
direction
==
'bprop img'
:
cpuval
=
py_conv
(
npy_img
,
npy_kern
,
'full'
,
subsample
)
op
=
theano
.
sandbox
.
cuda
.
blas
.
GpuCorrMM_gradInputs
(
border_mode
=
'valid'
,
subsample
=
subsample
)(
i
,
k
)
f
=
theano
.
function
([
i
,
k
],
op
,
mode
=
theano_mode
)
gpuval
=
f
(
npy_kern
.
transpose
(
1
,
0
,
2
,
3
),
npy_img
)
elif
direction
==
'bprop kern'
:
cpuval
=
py_conv
(
npy_img
,
npy_kern
,
'valid'
,
subsample
)
op
=
theano
.
sandbox
.
cuda
.
blas
.
GpuCorrMM_gradWeights
(
border_mode
=
'valid'
,
subsample
=
subsample
)(
i
,
k
)
f
=
theano
.
function
([
i
,
k
],
op
,
mode
=
theano_mode
)
gpuval
=
numpy
.
array
(
f
(
npy_img
.
transpose
(
1
,
0
,
2
,
3
),
npy_kern
.
transpose
(
1
,
0
,
2
,
3
)[:,:,::
-
1
,::
-
1
]))
.
transpose
(
1
,
0
,
2
,
3
)
assert_allclose
(
cpuval
,
gpuval
,
rtol
=
1e-4
)
def
test_gemm_directly
():
def
test_gemm_directly
():
for
direction
in
[
'fprop'
,
'bprop img'
,
'bprop kern'
]:
for
bs
in
range
(
1
,
5
):
print
'Testing direction: '
+
direction
for
ch
in
range
(
1
,
4
):
for
bs
in
range
(
1
,
5
):
for
nf
in
range
(
1
,
4
):
for
ch
in
range
(
1
,
4
):
for
rImg1
in
range
(
5
,
9
):
for
nf
in
range
(
1
,
4
):
for
rImg2
in
range
(
5
,
9
):
for
rImg1
in
range
(
5
,
9
):
for
rFlt1
in
range
(
2
,
4
):
for
rImg2
in
range
(
5
,
9
):
for
rFlt2
in
range
(
2
,
4
):
for
rFlt1
in
range
(
2
,
4
):
for
direction
in
[
'bprop img'
,
'bprop kern'
]:
for
rFlt2
in
range
(
2
,
4
):
yield
(
gemm_directly
,
bs
,
ch
,
nf
,
rImg1
,
for
subsx
in
range
(
1
,
3
)
if
direction
==
'fprop'
else
[
1
]:
rImg2
,
rFlt1
,
rFlt2
,
1
,
1
,
for
subsy
in
range
(
1
,
3
)
if
direction
==
'fprop'
else
[
1
]:
direction
)
ishape
=
(
bs
,
ch
,
rImg1
,
rImg2
)
kshape
=
(
nf
,
ch
,
rFlt1
,
rFlt2
)
for
subsx
in
range
(
1
,
3
):
subsample
=
(
subsx
,
subsy
)
for
subsy
in
range
(
1
,
3
):
yield
(
gemm_directly
,
bs
,
ch
,
nf
,
npy_img
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
ishape
),
dtype
=
'float32'
)
rImg1
,
rImg2
,
rFlt1
,
rFlt2
,
npy_kern
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
kshape
),
dtype
=
'float32'
)
subsx
,
subsy
,
'fprop'
)
i
=
cuda_tensor4
()
k
=
cuda_tensor4
()
def
gemm_op
(
mode
,
subsample
):
pad
=
'full'
if
mode
==
'full'
else
(
0
,
0
)
if
direction
==
'fprop'
:
return
theano
.
sandbox
.
cuda
.
blas
.
GpuCorrMM
(
'valid'
,
subsample
,
pad
)
cpuval
=
py_conv
(
npy_img
,
npy_kern
,
'valid'
,
subsample
)
op
=
theano
.
sandbox
.
cuda
.
blas
.
GpuCorrMM
(
border_mode
=
'valid'
,
subsample
=
subsample
)(
i
,
k
)
def
dnn_op
(
mode
,
subsample
):
f
=
theano
.
function
([
i
,
k
],
op
,
mode
=
theano_mode
)
def
f
(
img
,
kern
):
gpuval
=
f
(
npy_img
,
npy_kern
[:,:,::
-
1
,::
-
1
])
return
dnn_conv
(
img
,
kern
,
border_mode
=
mode
,
conv_mode
=
'cross'
,
elif
direction
==
'bprop img'
:
subsample
=
subsample
)
cpuval
=
py_conv
(
npy_img
,
npy_kern
,
'full'
,
subsample
)
return
f
op
=
theano
.
sandbox
.
cuda
.
blas
.
GpuCorrMM_gradInputs
(
border_mode
=
'valid'
,
subsample
=
subsample
)(
i
,
k
)
f
=
theano
.
function
([
i
,
k
],
op
,
mode
=
theano_mode
)
def
conv_grad
(
mode
,
bs
,
ch
,
nf
,
rImg1
,
rImg2
,
rFlt1
,
rFlt2
,
subsample
,
op
):
gpuval
=
f
(
npy_kern
.
transpose
(
1
,
0
,
2
,
3
),
npy_img
)
ishape
=
(
bs
,
ch
,
rImg1
,
rImg2
)
elif
direction
==
'bprop kern'
:
kshape
=
(
nf
,
ch
,
rFlt1
,
rFlt2
)
cpuval
=
py_conv
(
npy_img
,
npy_kern
,
'valid'
,
subsample
)
op
=
theano
.
sandbox
.
cuda
.
blas
.
GpuCorrMM_gradWeights
(
border_mode
=
'valid'
,
npy_img
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
ishape
),
dtype
=
'float32'
)
subsample
=
subsample
)(
i
,
k
)
npy_kern
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
kshape
),
dtype
=
'float32'
)
f
=
theano
.
function
([
i
,
k
],
op
,
mode
=
theano_mode
)
gpuval
=
numpy
.
array
(
f
(
npy_img
.
transpose
(
1
,
0
,
2
,
3
),
i
=
cuda_tensor4
()
npy_kern
.
transpose
(
1
,
0
,
2
,
3
)[:,:,::
-
1
,::
-
1
]))
.
transpose
(
1
,
0
,
2
,
3
)
k
=
cuda_tensor4
()
if
not
numpy
.
allclose
(
cpuval
,
gpuval
,
rtol
=
1e-4
):
# TODO: also test custom pad values
print
"Test failed for"
corr_op
=
op
(
mode
,
subsample
)(
i
,
k
)
print
"direction: "
,
direction
# try to compile reference implementation without shape,
print
"ishape: "
,
ishape
# so we don't have to compile hundreds of versions
print
"kshape: "
,
kshape
conv_op
=
tensor
.
nnet
.
conv2d
(
i
,
k
[:,:,::
-
1
,::
-
1
],
print
"subsample: "
,
subsample
border_mode
=
mode
,
subsample
=
subsample
)
assert
False
try
:
conv_op_di
=
theano
.
grad
(
conv_op
.
sum
(),
i
)
conv_op_dk
=
theano
.
grad
(
conv_op
.
sum
(),
k
)
def
test_gemm_grads
():
except
Exception
:
# compile with shape information only when needed
conv_op
=
tensor
.
nnet
.
conv2d
(
i
,
k
[:,:,::
-
1
,::
-
1
],
ishape
,
kshape
,
mode
,
subsample
)
conv_op_di
=
theano
.
grad
(
conv_op
.
sum
(),
i
)
conv_op_dk
=
theano
.
grad
(
conv_op
.
sum
(),
k
)
corr_op_di
=
theano
.
grad
(
corr_op
.
sum
(),
i
)
corr_op_dk
=
theano
.
grad
(
corr_op
.
sum
(),
k
)
outputs
=
[
corr_op
,
conv_op
,
corr_op_di
,
conv_op_di
,
corr_op_dk
,
conv_op_dk
]
try
:
conv_op_dik
=
theano
.
grad
(
conv_op_di
.
sum
(),
k
)
conv_op_dki
=
theano
.
grad
(
conv_op_dk
.
sum
(),
i
)
corr_op_dik
=
theano
.
grad
(
corr_op_di
.
sum
(),
k
)
corr_op_dki
=
theano
.
grad
(
corr_op_dk
.
sum
(),
i
)
outputs
.
extend
([
corr_op_dik
,
conv_op_dik
,
corr_op_dki
,
conv_op_dki
])
except
Exception
:
# skip if the reference implementation can't do it
pass
f
=
theano
.
function
([
i
,
k
],
outputs
,
mode
=
theano_mode
)
allvals
=
f
(
npy_img
,
npy_kern
)
for
a
,
b
,
p
in
zip
(
allvals
[::
2
],
allvals
[
1
::
2
],
(
'top'
,
'dtop/dbottom'
,
'dtop/dweight'
,
'dtop/dbottom/dweight'
,
'dtop/dweight/dbottom'
)):
assert_allclose
(
a
,
b
,
rtol
=
1e-4
)
def
test_conv_grads
():
for
mode
in
'valid'
,
'full'
:
for
mode
in
'valid'
,
'full'
:
for
bs
in
[
1
,
5
]:
for
bs
in
[
1
,
5
]:
for
ch
in
[
4
]:
for
ch
in
[
4
]:
...
@@ -910,68 +912,11 @@ def test_gemm_grads():
...
@@ -910,68 +912,11 @@ def test_gemm_grads():
for
rImg2
in
[
2
,
8
]:
for
rImg2
in
[
2
,
8
]:
for
rFlt1
in
[
1
,
2
]:
for
rFlt1
in
[
1
,
2
]:
for
rFlt2
in
[
1
,
2
]:
for
rFlt2
in
[
1
,
2
]:
for
subsx
in
[
1
,
2
]:
for
subsample
in
(
1
,
1
),
(
1
,
2
),
(
2
,
2
):
for
subsy
in
[
1
,
2
]
if
subsx
==
1
else
[
2
]:
for
op
in
[
gemm_op
,
dnn_op
]:
ishape
=
(
bs
,
ch
,
rImg1
,
rImg2
)
yield
(
conv_grad
,
mode
,
bs
,
ch
,
nf
,
kshape
=
(
nf
,
ch
,
rFlt1
,
rFlt2
)
rImg1
,
rImg2
,
rFlt1
,
rFlt2
,
subsample
=
(
subsx
,
subsy
)
subsample
,
op
)
npy_img
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
ishape
),
dtype
=
'float32'
)
npy_kern
=
theano
.
_asarray
(
numpy
.
random
.
rand
(
*
kshape
),
dtype
=
'float32'
)
i
=
cuda_tensor4
()
k
=
cuda_tensor4
()
pad
=
'full'
if
mode
==
'full'
else
(
0
,
0
)
# TODO: also test custom pad values
corr_op
=
theano
.
sandbox
.
cuda
.
blas
.
GpuCorrMM
(
'valid'
,
subsample
,
pad
)(
i
,
k
)
# try to compile reference implementation without shape,
# so we don't have to compile hundreds of versions
conv_op
=
tensor
.
nnet
.
conv2d
(
i
,
k
[:,:,::
-
1
,::
-
1
],
border_mode
=
mode
,
subsample
=
subsample
)
try
:
conv_op_di
=
theano
.
grad
(
conv_op
.
sum
(),
i
)
conv_op_dk
=
theano
.
grad
(
conv_op
.
sum
(),
k
)
except
Exception
:
# compile with shape information only when needed
conv_op
=
tensor
.
nnet
.
conv2d
(
i
,
k
[:,:,::
-
1
,::
-
1
],
ishape
,
kshape
,
mode
,
subsample
)
conv_op_di
=
theano
.
grad
(
conv_op
.
sum
(),
i
)
conv_op_dk
=
theano
.
grad
(
conv_op
.
sum
(),
k
)
corr_op_di
=
theano
.
grad
(
corr_op
.
sum
(),
i
)
corr_op_dk
=
theano
.
grad
(
corr_op
.
sum
(),
k
)
outputs
=
[
corr_op
,
conv_op
,
corr_op_di
,
conv_op_di
,
corr_op_dk
,
conv_op_dk
]
try
:
conv_op_dik
=
theano
.
grad
(
conv_op_di
.
sum
(),
k
)
conv_op_dki
=
theano
.
grad
(
conv_op_dk
.
sum
(),
i
)
except
Exception
:
# skip if the reference implementation can't do it
print
"."
,
else
:
corr_op_dik
=
theano
.
grad
(
corr_op_di
.
sum
(),
k
)
corr_op_dki
=
theano
.
grad
(
corr_op_dk
.
sum
(),
i
)
outputs
.
extend
([
corr_op_dik
,
conv_op_dik
,
corr_op_dki
,
conv_op_dki
])
print
":"
,
f
=
theano
.
function
([
i
,
k
],
outputs
,
mode
=
theano_mode
)
allvals
=
f
(
npy_img
,
npy_kern
)
for
a
,
b
,
p
in
zip
(
allvals
[::
2
],
allvals
[
1
::
2
],
(
'top'
,
'dtop/dbottom'
,
'dtop/dweight'
,
'dtop/dbottom/dweight'
,
'dtop/dweight/dbottom'
)):
if
(
a
.
shape
!=
b
.
shape
)
or
not
numpy
.
allclose
(
a
,
b
,
rtol
=
1e-4
):
print
"Test failed for"
,
p
print
"mode: "
,
mode
print
"ishape: "
,
ishape
print
"kshape: "
,
kshape
print
"subsample: "
,
subsample
assert
False
sys
.
stdout
.
flush
()
def
benchmark
():
def
benchmark
():
...
@@ -1017,15 +962,15 @@ def benchmark():
...
@@ -1017,15 +962,15 @@ def benchmark():
,((
2
,
30
,
116
,
116
),
(
20
,
30
,
9
,
9
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
#full conv_reference_full
,((
2
,
30
,
116
,
116
),
(
20
,
30
,
9
,
9
),
(
1
,
1
),
(
1
,
1
),
(
1
,
1
))
#full conv_reference_full
]
]
# shapes_valid=shapes_valid[-1:]
# shapes_full=shapes_full[-1:]
version
=
[
-
1
]
version
=
[
-
1
]
verbose
=
1
verbose
=
1
random
=
True
random
=
True
exec_conv
(
version
,
shapes_valid
,
verbose
,
random
,
'valid'
,
for
t
in
exec_conv
(
version
,
shapes_valid
,
verbose
,
random
,
'valid'
,
print_
=
None
,
rtol
=
1e-3
)
print_
=
None
,
rtol
=
1e-3
):
exec_conv
(
version
,
shapes_full
,
verbose
,
random
,
'full'
)
t
[
0
](
*
t
[
1
:])
for
t
in
exec_conv
(
version
,
shapes_full
,
verbose
,
random
,
'full'
):
t
[
0
](
*
t
[
1
:])
def
test_stack_rows_segfault_070312
():
def
test_stack_rows_segfault_070312
():
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论