Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
5fc89c03
提交
5fc89c03
authored
12月 14, 2013
作者:
Frederic
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
new GpuConv compile, but give wrong version in some cases!
上级
baf12f54
隐藏空白字符变更
内嵌
并排
正在显示
5 个修改的文件
包含
355 行增加
和
330 行删除
+355
-330
conv.cu
theano/sandbox/gpuarray/conv.cu
+296
-295
conv.py
theano/sandbox/gpuarray/conv.py
+28
-6
conv_full_kernel.cu
theano/sandbox/gpuarray/conv_full_kernel.cu
+7
-6
conv_kernel.cu
theano/sandbox/gpuarray/conv_kernel.cu
+17
-17
test_conv_cuda_ndarray.py
theano/sandbox/gpuarray/tests/test_conv_cuda_ndarray.py
+7
-6
没有找到文件。
theano/sandbox/gpuarray/conv.cu
浏览文件 @
5fc89c03
// REMEMBER TO RAISE c_code_cache_version when changing this file
//
//TODO detect SHARED_SIZE dynamically
#define SHARED_SIZE (16*1024)
// Anonymous enumeration of convolution mode constants.
// With no explicit initializers, ConvMode_FULL is 0 and ConvMode_VALID is 1.
// NOTE(review): presumably these are the values accepted by the `mode`
// argument of the *_Conv entry points declared below -- confirm with the caller.
enum
{
ConvMode_FULL
,
ConvMode_VALID
};
PyObject
*
CudaNdarray_Conv
(
CudaNdarray
*
img
,
CudaNdarray
*
kern
,
CudaNdarray
*
out
,
const
int
mode
,
const
int
subsample_rows
,
const
int
subsample_cols
,
const
int
version
,
const
int
verbose
);
PyObject
*
PyGpuArray_Conv
(
PyGpuArrayObject
*
img
,
PyGpuArrayObject
*
kern
,
PyGpuArrayObject
*
out
,
const
int
mode
,
const
size_t
subsample_rows
,
const
size_t
subsample_cols
,
const
int
version
,
const
int
verbose
);
/*
 * Integer division of a by b with the quotient bumped by one whenever the
 * division leaves a non-zero remainder.
 *
 * a: dividend.
 * b: divisor; must be non-zero, since both a / b and a % b are evaluated.
 *
 * Returns (a / b) when b divides a exactly, and (a / b) + 1 otherwise.
 * For non-negative operands this is the ceiling of a / b.
 */
template <typename T>
static T ceil_intdiv(T a, T b)
{
    T quotient = a / b;
    // A non-zero remainder means the truncated quotient needs one more unit.
    if (a % b)
        quotient = quotient + 1;
    return quotient;
}
/*
* version: -1, autodetect, >=0 a specific version to use.
* If it can't be executed, we revert to the reference implementation
*/
int
CudaNdarray_conv_valid
(
const
CudaNdarray
*
img
,
const
CudaNdarray
*
kern
,
CudaNdarray
*
out
,
int
subsample_rows
,
in
t
subsample_cols
,
PyGpuArray_conv_valid
(
const
PyGpuArrayObject
*
img
,
const
PyGpuArrayObject
*
kern
,
PyGpuArrayObject
*
out
,
size_t
subsample_rows
,
size_
t
subsample_cols
,
int
version
=
-
1
,
int
verbose
=
0
,
int
max_threads_dim0
=
512
)
{
int
work_complete
=
0
;
const
int
shared_avail
=
SHARED_SIZE
-
150
;
//144 is the biggest static shared size used when compiling this file.
if
(
img
->
nd
!=
4
)
if
(
PyGpuArray_NDIM
(
img
)
!=
4
)
{
PyErr_SetString
(
PyExc_ValueError
,
"required img of 4D"
);
return
-
1
;
}
if
(
kern
->
nd
!=
4
)
if
(
PyGpuArray_NDIM
(
kern
)
!=
4
)
{
PyErr_SetString
(
PyExc_ValueError
,
"required kern of 4D"
);
return
-
1
;
}
if
(
out
->
nd
!=
4
)
if
(
PyGpuArray_NDIM
(
out
)
!=
4
)
{
PyErr_SetString
(
PyExc_ValueError
,
"required out of 4D"
);
return
-
1
;
...
...
@@ -40,40 +50,40 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
version
,
THEANO_KERN_WID
);
fprintf
(
stderr
,
"INFO: img dim: %i %i %i %i img stride: %i %i %i %i
\n
"
,
CudaNdarray_HOST_DIMS
(
img
)[
0
],
CudaNdarray_HOST
_DIMS
(
img
)[
1
],
CudaNdarray_HOST_DIMS
(
img
)[
2
],
CudaNdarray_HOST
_DIMS
(
img
)[
3
],
CudaNdarray_HOST_STRIDES
(
img
)[
0
]
,
CudaNdarray_HOST_STRIDES
(
img
)[
1
]
,
CudaNdarray_HOST_STRIDES
(
img
)[
2
]
,
CudaNdarray_HOST_STRIDES
(
img
)[
3
]
);
PyGpuArray_DIMS
(
img
)[
0
],
PyGpuArray
_DIMS
(
img
)[
1
],
PyGpuArray_DIMS
(
img
)[
2
],
PyGpuArray
_DIMS
(
img
)[
3
],
PyGpuArray_STRIDES
(
img
)[
0
]
/
4
,
PyGpuArray_STRIDES
(
img
)[
1
]
/
4
,
PyGpuArray_STRIDES
(
img
)[
2
]
/
4
,
PyGpuArray_STRIDES
(
img
)[
3
]
/
4
);
fprintf
(
stderr
,
"INFO: kern dim: %i %i %i %i kern stride: %i %i %i %i
\n
"
,
CudaNdarray_HOST_DIMS
(
kern
)[
0
],
CudaNdarray_HOST
_DIMS
(
kern
)[
1
],
CudaNdarray_HOST_DIMS
(
kern
)[
2
],
CudaNdarray_HOST
_DIMS
(
kern
)[
3
],
CudaNdarray_HOST_STRIDES
(
kern
)[
0
]
,
CudaNdarray_HOST_STRIDES
(
kern
)[
1
]
,
CudaNdarray_HOST_STRIDES
(
kern
)[
2
]
,
CudaNdarray_HOST_STRIDES
(
kern
)[
3
]
);
PyGpuArray_DIMS
(
kern
)[
0
],
PyGpuArray
_DIMS
(
kern
)[
1
],
PyGpuArray_DIMS
(
kern
)[
2
],
PyGpuArray
_DIMS
(
kern
)[
3
],
PyGpuArray_STRIDES
(
kern
)[
0
]
/
4
,
PyGpuArray_STRIDES
(
kern
)[
1
]
/
4
,
PyGpuArray_STRIDES
(
kern
)[
2
]
/
4
,
PyGpuArray_STRIDES
(
kern
)[
3
]
/
4
);
fprintf
(
stderr
,
"INFO: out dim: %i %i %i %i out stride: %i %i %i %i
\n
"
,
CudaNdarray_HOST_DIMS
(
out
)[
0
],
CudaNdarray_HOST
_DIMS
(
out
)[
1
],
CudaNdarray_HOST_DIMS
(
out
)[
2
],
CudaNdarray_HOST
_DIMS
(
out
)[
3
],
CudaNdarray_HOST_STRIDES
(
out
)[
0
]
,
CudaNdarray_HOST_STRIDES
(
out
)[
1
]
,
CudaNdarray_HOST_STRIDES
(
out
)[
2
]
,
CudaNdarray_HOST_STRIDES
(
out
)[
3
]
);
PyGpuArray_DIMS
(
out
)[
0
],
PyGpuArray
_DIMS
(
out
)[
1
],
PyGpuArray_DIMS
(
out
)[
2
],
PyGpuArray
_DIMS
(
out
)[
3
],
PyGpuArray_STRIDES
(
out
)[
0
]
/
4
,
PyGpuArray_STRIDES
(
out
)[
1
]
/
4
,
PyGpuArray_STRIDES
(
out
)[
2
]
/
4
,
PyGpuArray_STRIDES
(
out
)[
3
]
/
4
);
fprintf
(
stderr
,
"INFO: subsample_rows=%d, subsample_cols=%d
\n
"
,
subsample_rows
,
subsample_cols
);
}
//Check the output size is valid
assert
(
CudaNdarray_HOST_DIMS
(
out
)[
2
]
==
ceil_intdiv
(
CudaNdarray_HOST_DIMS
(
img
)[
2
]
-
CudaNdarray_HOST
_DIMS
(
kern
)[
2
]
+
1
,
subsample_rows
));
assert
(
CudaNdarray_HOST_DIMS
(
out
)[
3
]
==
ceil_intdiv
(
CudaNdarray_HOST_DIMS
(
img
)[
3
]
-
CudaNdarray_HOST
_DIMS
(
kern
)[
3
]
+
1
,
subsample_cols
));
assert
(
PyGpuArray_DIMS
(
out
)[
2
]
==
ceil_intdiv
(
PyGpuArray_DIMS
(
img
)[
2
]
-
PyGpuArray
_DIMS
(
kern
)[
2
]
+
1
,
subsample_rows
));
assert
(
PyGpuArray_DIMS
(
out
)[
3
]
==
ceil_intdiv
(
PyGpuArray_DIMS
(
img
)[
3
]
-
PyGpuArray
_DIMS
(
kern
)[
3
]
+
1
,
subsample_cols
));
assert
(
CudaNdarray_HOST_DIMS
(
out
)[
0
]
==
CudaNdarray_HOST
_DIMS
(
img
)[
0
]);
assert
(
CudaNdarray_HOST_DIMS
(
out
)[
1
]
==
CudaNdarray_HOST
_DIMS
(
kern
)[
0
]);
assert
(
CudaNdarray_HOST_DIMS
(
img
)[
1
]
==
CudaNdarray_HOST
_DIMS
(
kern
)[
1
]);
assert
(
PyGpuArray_DIMS
(
out
)[
0
]
==
PyGpuArray
_DIMS
(
img
)[
0
]);
assert
(
PyGpuArray_DIMS
(
out
)[
1
]
==
PyGpuArray
_DIMS
(
kern
)[
0
]);
assert
(
PyGpuArray_DIMS
(
img
)[
1
]
==
PyGpuArray
_DIMS
(
kern
)[
1
]);
// we now search through a few implementations until one applies to our arguments.
...
...
@@ -82,24 +92,24 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
//TODO: make a parameter the number of division
//TODO: Should we make them in separate grid block instead?
const
int
nstack
=
CudaNdarray_HOST
_DIMS
(
kern
)[
1
];
const
int
nbatch
=
CudaNdarray_HOST
_DIMS
(
img
)[
0
];
const
int
nkern
=
CudaNdarray_HOST
_DIMS
(
kern
)[
0
];
const
int
img_wid
=
CudaNdarray_HOST
_DIMS
(
img
)[
3
];
const
int
img_len
=
CudaNdarray_HOST
_DIMS
(
img
)[
2
];
const
int
kern_wid
=
CudaNdarray_HOST
_DIMS
(
kern
)[
3
];
const
int
kern_len
=
CudaNdarray_HOST
_DIMS
(
kern
)[
2
];
const
int
out_wid
=
CudaNdarray_HOST
_DIMS
(
out
)[
3
];
const
int
out_len
=
CudaNdarray_HOST
_DIMS
(
out
)[
2
];
const
int
img_stride_col
=
CudaNdarray_HOST_STRIDES
(
img
)[
3
]
;
const
int
img_stride_row
=
CudaNdarray_HOST_STRIDES
(
img
)[
2
]
;
const
int
img_stride_stack
=
CudaNdarray_HOST_STRIDES
(
img
)[
1
]
;
const
int
img_stride_batch
=
CudaNdarray_HOST_STRIDES
(
img
)[
0
]
;
const
int
kern_stride_col
=
CudaNdarray_HOST_STRIDES
(
kern
)[
3
]
;
const
int
kern_stride_row
=
CudaNdarray_HOST_STRIDES
(
kern
)[
2
]
;
const
int
kern_stride_stack
=
CudaNdarray_HOST_STRIDES
(
kern
)[
1
]
;
const
int
kern_stride_nkern
=
CudaNdarray_HOST_STRIDES
(
kern
)[
0
]
;
const
int
nstack
=
PyGpuArray
_DIMS
(
kern
)[
1
];
const
int
nbatch
=
PyGpuArray
_DIMS
(
img
)[
0
];
const
int
nkern
=
PyGpuArray
_DIMS
(
kern
)[
0
];
const
int
img_wid
=
PyGpuArray
_DIMS
(
img
)[
3
];
const
int
img_len
=
PyGpuArray
_DIMS
(
img
)[
2
];
const
int
kern_wid
=
PyGpuArray
_DIMS
(
kern
)[
3
];
const
int
kern_len
=
PyGpuArray
_DIMS
(
kern
)[
2
];
const
int
out_wid
=
PyGpuArray
_DIMS
(
out
)[
3
];
const
int
out_len
=
PyGpuArray
_DIMS
(
out
)[
2
];
const
int
img_stride_col
=
PyGpuArray_STRIDES
(
img
)[
3
]
/
4
;
const
int
img_stride_row
=
PyGpuArray_STRIDES
(
img
)[
2
]
/
4
;
const
int
img_stride_stack
=
PyGpuArray_STRIDES
(
img
)[
1
]
/
4
;
const
int
img_stride_batch
=
PyGpuArray_STRIDES
(
img
)[
0
]
/
4
;
const
int
kern_stride_col
=
PyGpuArray_STRIDES
(
kern
)[
3
]
/
4
;
const
int
kern_stride_row
=
PyGpuArray_STRIDES
(
kern
)[
2
]
/
4
;
const
int
kern_stride_stack
=
PyGpuArray_STRIDES
(
kern
)[
1
]
/
4
;
const
int
kern_stride_nkern
=
PyGpuArray_STRIDES
(
kern
)[
0
]
/
4
;
const
int
img_size
=
img_len
*
img_wid
;
const
int
kern_size
=
kern_len
*
kern_wid
;
...
...
@@ -107,17 +117,17 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
const
int
img_size_byte
=
img_size
*
sizeof
(
float
);
const
int
kern_size_byte
=
kern_size
*
sizeof
(
float
);
const
int
out_size_byte
=
out_size
*
sizeof
(
float
);
if
(
!
((
THEANO_KERN_WID
==
CudaNdarray_HOST
_DIMS
(
kern
)[
3
])
||
(
THEANO_KERN_WID
==
0
))){
if
(
!
((
THEANO_KERN_WID
==
PyGpuArray
_DIMS
(
kern
)[
3
])
||
(
THEANO_KERN_WID
==
0
))){
PyErr_Format
(
PyExc_ValueError
,
"ERROR: This GpuConv code was compiled for"
" %d kernel columns, but the kernel we received had %d columns!"
,
THEANO_KERN_WID
,
CudaNdarray_HOST
_DIMS
(
kern
)[
3
]);
" %d kernel columns, but the kernel we received had %
u
d columns!"
,
THEANO_KERN_WID
,
PyGpuArray
_DIMS
(
kern
)[
3
]);
return
-
1
;
}
bool
subsample
=
subsample_rows
!=
1
||
subsample_cols
!=
1
;
bool
img_contiguous
=
CudaNdarray_is_c_contiguous
(
img
)
;
bool
kern_contiguous
=
CudaNdarray_is_c_contiguous
(
kern
)
;
bool
out_contiguous
=
CudaNdarray_is_c_contiguous
(
out
)
;
bool
img_contiguous
=
img
->
ga
.
flags
&
GA_C_CONTIGUOUS
;
bool
kern_contiguous
=
kern
->
ga
.
flags
&
GA_C_CONTIGUOUS
;
bool
out_contiguous
=
out
->
ga
.
flags
&
GA_C_CONTIGUOUS
;
bool
c_contiguous
=
img_contiguous
&&
kern_contiguous
&&
out_contiguous
;
bool
img_contiguous_2d
=
(
img_stride_col
==
1
)
&&
(
img_stride_row
==
img_wid
);
...
...
@@ -130,7 +140,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
//we don't need to unflip it, but have the new value when we unflip it.
bool
kern_flipped
=
true
;
bool
kern_contiguous_2d_unflipped
=
kern_contiguous_2d
;
float
*
kern_data_unflipped
=
kern
->
devdata
;
const
float
*
kern_data_unflipped
=
cuda_get_ptr
(
kern
)
;
int
kern_stride_col_unflipped
=
kern_stride_col
;
int
kern_stride_row_unflipped
=
kern_stride_row
;
if
(
kern_stride_col_unflipped
==-
1
&&
kern_stride_row_unflipped
==-
kern_wid
){
...
...
@@ -139,7 +149,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
kern_stride_row_unflipped
=
kern_wid
;
kern_flipped
=
false
;
kern_contiguous_2d_unflipped
=
true
;
kern_data_unflipped
=&
(
kern
->
devdata
[(
kern_wid
-
1
)
*
kern_stride_col
+
(
kern_len
-
1
)
*
kern_stride_row
]);
kern_data_unflipped
=&
(
cuda_get_ptr
(
kern
)
[(
kern_wid
-
1
)
*
kern_stride_col
+
(
kern_len
-
1
)
*
kern_stride_row
]);
}
//if we remove the restriction
...
...
@@ -173,7 +183,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
dim3
grid
(
nbatch
,
nkern
);
int
shared_size
=
(
img_size
+
kern_size
)
*
sizeof
(
float
);
void
(
*
f
)(
float
*
,
float
*
,
float
*
,
void
(
*
f
)(
const
float
*
,
const
float
*
,
float
*
,
int
,
int
,
int
,
int
,
int
,
int
);
...
...
@@ -184,9 +194,9 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
CONV_PATCH_SPECIAL
(
THEANO_KERN_WID
);
f
<<<
grid
,
threads
,
shared_size
>>>
(
img
->
devdata
,
kern
->
devdata
,
out
->
devdata
,
(
cuda_get_ptr
(
img
),
cuda_get_ptr
(
kern
),
cuda_get_ptr
(
out
)
,
img_len
,
img_wid
,
kern_len
,
kern_wid
,
nkern
,
nstack
);
CNDA_THREAD_SYNC
;
cudaError_t
sts
=
cudaGetLastError
();
if
(
cudaSuccess
==
sts
)
{
...
...
@@ -234,7 +244,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
dim3
grid
(
nbatch
,
nkern
);
int
shared_size
=
(
img_size
+
(
preload_full_kernel
?
kern_size
:
kern_wid
))
*
sizeof
(
float
);
void
(
*
f
)(
float
*
,
float
*
,
float
*
,
void
(
*
f
)(
const
float
*
,
const
float
*
,
float
*
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
...
...
@@ -277,14 +287,13 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
CONV_PATCH_STACK_SPECIAL
(
THEANO_KERN_WID
);
f
<<<
grid
,
threads
,
shared_size
>>>
(
img
->
devdata
,
kern
->
devdata
,
out
->
devdata
,
(
cuda_get_ptr
(
img
),
cuda_get_ptr
(
kern
),
cuda_get_ptr
(
out
)
,
img_len
,
img_wid
,
kern_len
,
kern_wid
,
out_len
,
out_wid
,
nkern
,
nstack
,
img_stride_col
,
img_stride_row
,
img_stride_stack
,
img_stride_batch
,
kern_stride_col
,
kern_stride_row
,
kern_stride_stack
,
kern_stride_nkern
,
subsample_rows
,
subsample_cols
);
CNDA_THREAD_SYNC
;
cudaError_t
sts
=
cudaGetLastError
();
if
(
cudaSuccess
==
sts
)
{
...
...
@@ -346,7 +355,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
dim3
threads
(
out_wid
);
dim3
grid
(
out_len
,
nbatch
*
nkern
);
int
shared_size
=
(
kern_len
*
img_wid
+
kern_size
)
*
sizeof
(
float
);
void
(
*
f
)(
float
*
,
float
*
,
float
*
,
void
(
*
f
)(
const
float
*
,
const
float
*
,
float
*
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
...
...
@@ -358,14 +367,13 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
CONV_ROWS_SPECIAL
(
THEANO_KERN_WID
);
f
<<<
grid
,
threads
,
shared_size
>>>
(
img
->
devdata
,
kern
->
devdata
,
out
->
devdata
,
(
cuda_get_ptr
(
img
),
cuda_get_ptr
(
kern
),
cuda_get_ptr
(
out
)
,
img_len
,
img_wid
,
kern_len
,
kern_wid
,
nkern
,
nstack
,
img_stride_col
,
img_stride_row
,
img_stride_stack
,
img_stride_batch
,
kern_stride_col
,
kern_stride_row
,
kern_stride_stack
,
kern_stride_nkern
);
CNDA_THREAD_SYNC
;
cudaError_t
sts
=
cudaGetLastError
();
if
(
cudaSuccess
==
sts
)
{
...
...
@@ -408,7 +416,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
int
shared_size
=
((
kern_len
+
nb_row
-
1
)
*
img_wid
+
kern_size
)
*
sizeof
(
float
);
void
(
*
f
)(
float
*
,
float
*
,
float
*
,
void
(
*
f
)(
const
float
*
,
const
float
*
,
float
*
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
...
...
@@ -430,16 +438,15 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
}
f
<<<
grid
,
threads
,
shared_size
>>>
(
img
->
devdata
,
kern
->
devdata
,
out
->
devdata
,
(
cuda_get_ptr
(
img
)
,
cuda_get_ptr
(
kern
)
,
cuda_get_ptr
(
out
)
,
img_len
,
img_wid
,
kern_len
,
kern_wid
,
nkern
,
nstack
,
img_stride_col
,
img_stride_row
,
img_stride_stack
,
img_stride_batch
,
kern_stride_col
,
kern_stride_row
,
kern_stride_stack
,
kern_stride_nkern
);
CNDA_THREAD_SYNC
;
cudaError_t
sts
=
cudaGetLastError
();
if
(
cudaSuccess
==
sts
)
{
...
...
@@ -503,7 +510,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
int
shared_size
=
(
threads
.
y
*
img_wid
+
k_size
)
*
sizeof
(
float
);
void
(
*
f
)(
float
*
,
float
*
,
float
*
,
void
(
*
f
)(
const
float
*
,
const
float
*
,
float
*
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
...
...
@@ -518,16 +525,15 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
CONV_ROWS_STACK2_SPECIAL
(
THEANO_KERN_WID
);
f
<<<
grid
,
threads
,
shared_size
>>>
(
img
->
devdata
,
kern
->
devdata
,
out
->
devdata
,
(
cuda_get_ptr
(
img
)
,
cuda_get_ptr
(
kern
)
,
cuda_get_ptr
(
out
)
,
img_len
,
img_wid
,
kern_len
,
kern_wid
,
nkern
,
nstack
,
img_stride_col
,
img_stride_row
,
img_stride_stack
,
img_stride_batch
,
kern_stride_col
,
kern_stride_row
,
kern_stride_stack
,
kern_stride_nkern
);
CNDA_THREAD_SYNC
;
cudaError_t
sts
=
cudaGetLastError
();
if
(
cudaSuccess
==
sts
)
{
...
...
@@ -626,7 +632,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
dim3
threads
(
out_wid
,
out_len
,
thread_z
);
dim3
grid
(
nbatch
,
nkern
);
void
(
*
f
)(
float
*
,
float
*
,
float
*
,
void
(
*
f
)(
const
float
*
,
const
float
*
,
float
*
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
...
...
@@ -657,13 +663,13 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
else if(!kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, true, false>;
CONV_PATCH_STACK_REDUCE_SPECIAL
(
THEANO_KERN_WID
);
f
<<<
grid
,
threads
,
shared_size
>>>
(
img
->
devdata
,
kern_data_unflipped
,
out
->
devdata
,
f
<<<
grid
,
threads
,
shared_size
>>>
(
cuda_get_ptr
(
img
),
kern_data_unflipped
,
cuda_get_ptr
(
out
)
,
img_len
,
img_wid
,
kern_len
,
kern_wid
,
nkern
,
nstack
,
img_stride_col
,
img_stride_row
,
img_stride_stack
,
img_stride_batch
,
kern_stride_col_unflipped
,
kern_stride_row_unflipped
,
kern_stride_stack
,
kern_stride_nkern
);
CNDA_THREAD_SYNC
;
cudaError_t
sts
=
cudaGetLastError
();
if
(
cudaSuccess
==
sts
)
{
...
...
@@ -705,8 +711,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
kern_len
<=
320
&&
!
work_complete
)
//conv_valid_row_reduce
{
int
outsize
=
CudaNda
rray_SIZE
(
out
);
int
n_blocks
=
std
::
min
(
outsize
,
NUM_VECTOR_OP_BLOCKS
);
int
outsize
=
PyGpuA
rray_SIZE
(
out
);
int
n_blocks
=
std
::
min
(
outsize
,
4096
);
int
block_nstack
=
nstack
;
//Max of 512 threads per blocks.
...
...
@@ -736,8 +742,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
void
(
*
f
)(
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
float
*
,
int
,
int
,
int
,
int
,
float
*
,
int
,
int
,
int
,
int
,
const
float
*
,
int
,
int
,
int
,
int
,
const
float
*
,
int
,
int
,
int
,
int
,
float
*
,
int
,
int
,
int
,
int
,
int
,
int
,
int
);
...
...
@@ -749,23 +755,21 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
else
f
=
conv_valid_row_reduce
<
true
>
;
f
<<<
n_blocks
,
n_threads
,
n_reduce_buf
>>>
(
nbatch
,
nkern
,
CudaNdarray_HOST
_DIMS
(
img
)[
1
],
nbatch
,
nkern
,
PyGpuArray
_DIMS
(
img
)[
1
],
img_len
,
img_wid
,
kern_len
,
kern_wid
,
out_len
,
out_wid
,
img
->
devdata
,
CudaNdarray_HOST_STRIDES
(
img
)[
0
],
CudaNdarray_HOST_STRIDES
(
img
)[
1
]
,
cuda_get_ptr
(
img
)
,
PyGpuArray_STRIDES
(
img
)[
0
]
/
4
,
PyGpuArray_STRIDES
(
img
)[
1
]
/
4
,
img_stride_row
,
img_stride_col
,
kern
->
devdata
,
CudaNdarray_HOST_STRIDES
(
kern
)[
0
],
CudaNdarray_HOST_STRIDES
(
kern
)[
1
]
,
CudaNdarray_HOST_STRIDES
(
kern
)[
2
],
CudaNdarray_HOST_STRIDES
(
kern
)[
3
]
,
out
->
devdata
,
CudaNdarray_HOST_STRIDES
(
out
)[
0
],
CudaNdarray_HOST_STRIDES
(
out
)[
1
]
,
CudaNdarray_HOST_STRIDES
(
out
)[
2
],
CudaNdarray_HOST_STRIDES
(
out
)[
3
]
,
cuda_get_ptr
(
kern
)
,
PyGpuArray_STRIDES
(
kern
)[
0
]
/
4
,
PyGpuArray_STRIDES
(
kern
)[
1
]
/
4
,
PyGpuArray_STRIDES
(
kern
)[
2
]
/
4
,
PyGpuArray_STRIDES
(
kern
)[
3
]
/
4
,
cuda_get_ptr
(
out
)
,
PyGpuArray_STRIDES
(
out
)[
0
]
/
4
,
PyGpuArray_STRIDES
(
out
)[
1
]
/
4
,
PyGpuArray_STRIDES
(
out
)[
2
]
/
4
,
PyGpuArray_STRIDES
(
out
)[
3
]
/
4
,
subsample_rows
,
subsample_cols
,
initial_reduce_boundary
);
CNDA_THREAD_SYNC
;
cudaError_t
sts
=
cudaGetLastError
();
if
(
cudaSuccess
==
sts
)
{
...
...
@@ -791,65 +795,64 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if
(
1
&&
!
work_complete
)
//conv_reference_valid
{
int
outsize
=
CudaNda
rray_SIZE
(
out
);
int
n_blocks
=
std
::
min
(
outsize
,
NUM_VECTOR_OP_BLOCKS
);
int
outsize
=
PyGpuA
rray_SIZE
(
out
);
int
n_blocks
=
std
::
min
(
outsize
,
4096
);
int
n_threads
=
std
::
min
(
ceil_intdiv
(
outsize
,
n_blocks
),
NUM_VECTOR_OP_THREADS_PER_BLOCK
);
256
);
if
(
1
)
{
if
(
verbose
)
fprintf
(
stderr
,
"INFO: launching conv_reference_valid
\n
"
);
if
(
verbose
>
1
)
fprintf
(
stderr
,
" img : %i %i %i %i %p %i %i %i %i
\n
"
,
nbatch
,
CudaNdarray_HOST
_DIMS
(
img
)[
1
],
img_len
,
img_wid
,
img
->
devdata
,
CudaNdarray_HOST_STRIDES
(
img
)[
0
]
,
CudaNdarray_HOST_STRIDES
(
img
)[
1
]
,
CudaNdarray_HOST_STRIDES
(
img
)[
2
]
,
CudaNdarray_HOST_STRIDES
(
img
)[
3
]
);
nbatch
,
PyGpuArray
_DIMS
(
img
)[
1
],
img_len
,
img_wid
,
cuda_get_ptr
(
img
)
,
PyGpuArray_STRIDES
(
img
)[
0
]
/
4
,
PyGpuArray_STRIDES
(
img
)[
1
]
/
4
,
PyGpuArray_STRIDES
(
img
)[
2
]
/
4
,
PyGpuArray_STRIDES
(
img
)[
3
]
/
4
);
if
(
verbose
>
1
)
fprintf
(
stderr
,
" kern: %i %i %i %i %p %i %i %i %i
\n
"
,
nkern
,
nstack
,
kern_len
,
kern_wid
,
kern
->
devdata
,
CudaNdarray_HOST_STRIDES
(
kern
)[
0
]
,
CudaNdarray_HOST_STRIDES
(
kern
)[
1
]
,
CudaNdarray_HOST_STRIDES
(
kern
)[
2
]
,
CudaNdarray_HOST_STRIDES
(
kern
)[
3
]
);
cuda_get_ptr
(
kern
)
,
PyGpuArray_STRIDES
(
kern
)[
0
]
/
4
,
PyGpuArray_STRIDES
(
kern
)[
1
]
/
4
,
PyGpuArray_STRIDES
(
kern
)[
2
]
/
4
,
PyGpuArray_STRIDES
(
kern
)[
3
]
/
4
);
if
(
verbose
>
1
)
fprintf
(
stderr
,
" out : %i %i %i %i %p %i %i %i %i
\n
"
,
CudaNdarray_HOST
_DIMS
(
out
)[
0
],
CudaNdarray_HOST
_DIMS
(
out
)[
1
],
out_len
,
out_wid
,
out
->
devdata
,
CudaNdarray_HOST_STRIDES
(
out
)[
0
]
,
CudaNdarray_HOST_STRIDES
(
out
)[
1
]
,
CudaNdarray_HOST_STRIDES
(
out
)[
2
]
,
CudaNdarray_HOST_STRIDES
(
out
)[
3
]
);
PyGpuArray
_DIMS
(
out
)[
0
],
PyGpuArray
_DIMS
(
out
)[
1
],
out_len
,
out_wid
,
cuda_get_ptr
(
out
)
,
PyGpuArray_STRIDES
(
out
)[
0
]
/
4
,
PyGpuArray_STRIDES
(
out
)[
1
]
/
4
,
PyGpuArray_STRIDES
(
out
)[
2
]
/
4
,
PyGpuArray_STRIDES
(
out
)[
3
]
/
4
);
if
(
verbose
>
1
)
fprintf
(
stderr
,
" launch params: %i %i %i
\n
"
,
outsize
,
n_blocks
,
n_threads
);
}
conv_reference_valid
<<<
n_blocks
,
n_threads
>>>
(
nbatch
,
nkern
,
CudaNdarray_HOST
_DIMS
(
img
)[
1
],
PyGpuArray
_DIMS
(
img
)[
1
],
img_len
,
img_wid
,
kern_len
,
kern_wid
,
out_len
,
out_wid
,
img
->
devdata
,
CudaNdarray_HOST_STRIDES
(
img
)[
0
]
,
CudaNdarray_HOST_STRIDES
(
img
)[
1
]
,
CudaNdarray_HOST_STRIDES
(
img
)[
2
]
,
CudaNdarray_HOST_STRIDES
(
img
)[
3
]
,
kern
->
devdata
,
CudaNdarray_HOST_STRIDES
(
kern
)[
0
]
,
CudaNdarray_HOST_STRIDES
(
kern
)[
1
]
,
CudaNdarray_HOST_STRIDES
(
kern
)[
2
]
,
CudaNdarray_HOST_STRIDES
(
kern
)[
3
]
,
out
->
devdata
,
CudaNdarray_HOST_STRIDES
(
out
)[
0
]
,
CudaNdarray_HOST_STRIDES
(
out
)[
1
]
,
CudaNdarray_HOST_STRIDES
(
out
)[
2
]
,
CudaNdarray_HOST_STRIDES
(
out
)[
3
]
,
cuda_get_ptr
(
img
)
,
PyGpuArray_STRIDES
(
img
)[
0
]
/
4
,
PyGpuArray_STRIDES
(
img
)[
1
]
/
4
,
PyGpuArray_STRIDES
(
img
)[
2
]
/
4
,
PyGpuArray_STRIDES
(
img
)[
3
]
/
4
,
cuda_get_ptr
(
kern
)
,
PyGpuArray_STRIDES
(
kern
)[
0
]
/
4
,
PyGpuArray_STRIDES
(
kern
)[
1
]
/
4
,
PyGpuArray_STRIDES
(
kern
)[
2
]
/
4
,
PyGpuArray_STRIDES
(
kern
)[
3
]
/
4
,
cuda_get_ptr
(
out
)
,
PyGpuArray_STRIDES
(
out
)[
0
]
/
4
,
PyGpuArray_STRIDES
(
out
)[
1
]
/
4
,
PyGpuArray_STRIDES
(
out
)[
2
]
/
4
,
PyGpuArray_STRIDES
(
out
)[
3
]
/
4
,
subsample_rows
,
subsample_cols
);
CNDA_THREAD_SYNC
;
cudaError_t
sts
=
cudaGetLastError
();
if
(
cudaSuccess
==
sts
)
...
...
@@ -864,7 +867,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
fprintf
(
stderr
,
"INFO: 'conv_reference_valid' failed
\n
"
);
PyErr_Format
(
PyExc_RuntimeError
,
"ERROR: all implementations failed for"
"
CudaNda
rray_conv_valid! (%s)"
,
"
PyGpuA
rray_conv_valid! (%s)"
,
cudaGetErrorString
(
sts
));
return
-
1
;
}
...
...
@@ -873,7 +876,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
{
PyErr_Format
(
PyExc_RuntimeError
,
"ERROR: no implementation(s) worked for"
"
CudaNda
rray_conv_valid!"
"
PyGpuA
rray_conv_valid!"
" Version asked(%d) (-1 mean use an heuristic)"
,
version
);
return
-
1
;
...
...
@@ -882,56 +885,56 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
}
int
CudaNdarray_conv_full
(
const
CudaNdarray
*
img
,
const
CudaNdarray
*
kern
,
CudaNdarray
*
out
,
in
t
subsample_rows
,
in
t
subsample_cols
,
int
version
=
-
1
,
int
verbose
=
0
,
PyGpuArray_conv_full
(
const
PyGpuArrayObject
*
img
,
const
PyGpuArrayObject
*
kern
,
PyGpuArrayObject
*
out
,
size_
t
subsample_rows
,
size_
t
subsample_cols
,
int
version
=
-
1
,
int
verbose
=
0
,
int
max_threads_dim0
=
512
)
{
//144 is the biggest static shared size used with compiling this file.
const
int
shared_avail
=
SHARED_SIZE
-
150
;
int
work_complete
=
0
;
if
(
img
->
nd
!=
4
)
if
(
PyGpuArray_NDIM
(
img
)
!=
4
)
{
PyErr_SetString
(
PyExc_ValueError
,
"required img of 4D"
);
return
-
1
;
}
if
(
kern
->
nd
!=
4
)
if
(
PyGpuArray_NDIM
(
kern
)
!=
4
)
{
PyErr_SetString
(
PyExc_ValueError
,
"required kern of 4D"
);
return
-
1
;
}
if
(
out
->
nd
!=
4
)
if
(
PyGpuArray_NDIM
(
out
)
!=
4
)
{
PyErr_SetString
(
PyExc_ValueError
,
"required out of 4D"
);
return
-
1
;
}
// check the size of the output matrix
assert
(
CudaNdarray_HOST_DIMS
(
out
)[
2
]
==
ceil_intdiv
(
CudaNdarray_HOST_DIMS
(
img
)[
2
]
+
CudaNdarray_HOST
_DIMS
(
kern
)[
2
]
-
1
,
subsample_rows
));
assert
(
CudaNdarray_HOST_DIMS
(
out
)[
3
]
==
ceil_intdiv
(
CudaNdarray_HOST_DIMS
(
img
)[
3
]
+
CudaNdarray_HOST
_DIMS
(
kern
)[
3
]
-
1
,
subsample_cols
));
assert
(
CudaNdarray_HOST_DIMS
(
out
)[
0
]
==
CudaNdarray_HOST
_DIMS
(
img
)[
0
]);
assert
(
CudaNdarray_HOST_DIMS
(
out
)[
1
]
==
CudaNdarray_HOST
_DIMS
(
kern
)[
0
]);
assert
(
CudaNdarray_HOST_DIMS
(
img
)[
1
]
==
CudaNdarray_HOST
_DIMS
(
kern
)[
1
]);
const
int
nstack
=
CudaNdarray_HOST
_DIMS
(
kern
)[
1
];
const
int
nbatch
=
CudaNdarray_HOST
_DIMS
(
img
)[
0
];
const
int
nkern
=
CudaNdarray_HOST
_DIMS
(
kern
)[
0
];
const
int
img_wid
=
CudaNdarray_HOST
_DIMS
(
img
)[
3
];
const
int
img_len
=
CudaNdarray_HOST
_DIMS
(
img
)[
2
];
const
int
kern_wid
=
CudaNdarray_HOST
_DIMS
(
kern
)[
3
];
const
int
kern_len
=
CudaNdarray_HOST
_DIMS
(
kern
)[
2
];
const
int
out_wid
=
CudaNdarray_HOST
_DIMS
(
out
)[
3
];
const
int
out_len
=
CudaNdarray_HOST
_DIMS
(
out
)[
2
];
const
int
img_stride_col
=
CudaNdarray_HOST_STRIDES
(
img
)[
3
]
;
const
int
img_stride_row
=
CudaNdarray_HOST_STRIDES
(
img
)[
2
]
;
const
int
img_stride_stack
=
CudaNdarray_HOST_STRIDES
(
img
)[
1
]
;
const
int
img_stride_batch
=
CudaNdarray_HOST_STRIDES
(
img
)[
0
]
;
const
int
kern_stride_col
=
CudaNdarray_HOST_STRIDES
(
kern
)[
3
]
;
const
int
kern_stride_row
=
CudaNdarray_HOST_STRIDES
(
kern
)[
2
]
;
const
int
kern_stride_stack
=
CudaNdarray_HOST_STRIDES
(
kern
)[
1
]
;
const
int
kern_stride_nkern
=
CudaNdarray_HOST_STRIDES
(
kern
)[
0
]
;
assert
(
PyGpuArray_DIMS
(
out
)[
2
]
==
ceil_intdiv
(
PyGpuArray_DIMS
(
img
)[
2
]
+
PyGpuArray
_DIMS
(
kern
)[
2
]
-
1
,
subsample_rows
));
assert
(
PyGpuArray_DIMS
(
out
)[
3
]
==
ceil_intdiv
(
PyGpuArray_DIMS
(
img
)[
3
]
+
PyGpuArray
_DIMS
(
kern
)[
3
]
-
1
,
subsample_cols
));
assert
(
PyGpuArray_DIMS
(
out
)[
0
]
==
PyGpuArray
_DIMS
(
img
)[
0
]);
assert
(
PyGpuArray_DIMS
(
out
)[
1
]
==
PyGpuArray
_DIMS
(
kern
)[
0
]);
assert
(
PyGpuArray_DIMS
(
img
)[
1
]
==
PyGpuArray
_DIMS
(
kern
)[
1
]);
const
int
nstack
=
PyGpuArray
_DIMS
(
kern
)[
1
];
const
int
nbatch
=
PyGpuArray
_DIMS
(
img
)[
0
];
const
int
nkern
=
PyGpuArray
_DIMS
(
kern
)[
0
];
const
int
img_wid
=
PyGpuArray
_DIMS
(
img
)[
3
];
const
int
img_len
=
PyGpuArray
_DIMS
(
img
)[
2
];
const
int
kern_wid
=
PyGpuArray
_DIMS
(
kern
)[
3
];
const
int
kern_len
=
PyGpuArray
_DIMS
(
kern
)[
2
];
const
int
out_wid
=
PyGpuArray
_DIMS
(
out
)[
3
];
const
int
out_len
=
PyGpuArray
_DIMS
(
out
)[
2
];
const
int
img_stride_col
=
PyGpuArray_STRIDES
(
img
)[
3
]
/
4
;
const
int
img_stride_row
=
PyGpuArray_STRIDES
(
img
)[
2
]
/
4
;
const
int
img_stride_stack
=
PyGpuArray_STRIDES
(
img
)[
1
]
/
4
;
const
int
img_stride_batch
=
PyGpuArray_STRIDES
(
img
)[
0
]
/
4
;
const
int
kern_stride_col
=
PyGpuArray_STRIDES
(
kern
)[
3
]
/
4
;
const
int
kern_stride_row
=
PyGpuArray_STRIDES
(
kern
)[
2
]
/
4
;
const
int
kern_stride_stack
=
PyGpuArray_STRIDES
(
kern
)[
1
]
/
4
;
const
int
kern_stride_nkern
=
PyGpuArray_STRIDES
(
kern
)[
0
]
/
4
;
const
int
img_size
=
img_len
*
img_wid
;
const
int
kern_size
=
kern_len
*
kern_wid
;
...
...
@@ -946,20 +949,20 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
//const int out_size_byte = out_size*sizeof(float); // unused
if
(
!
((
THEANO_KERN_WID
==
CudaNdarray_HOST
_DIMS
(
kern
)[
3
])
||
if
(
!
((
THEANO_KERN_WID
==
PyGpuArray
_DIMS
(
kern
)[
3
])
||
(
THEANO_KERN_WID
==
0
))){
PyErr_Format
(
PyExc_ValueError
,
"ERROR: This GpuConv code was compiled for"
" %d kernel columns, but the kernel we received"
" had %d columns!"
,
THEANO_KERN_WID
,
CudaNdarray_HOST
_DIMS
(
kern
)[
3
]);
" had %
u
d columns!"
,
THEANO_KERN_WID
,
PyGpuArray
_DIMS
(
kern
)[
3
]);
return
-
1
;
}
bool
subsample
=
subsample_rows
!=
1
||
subsample_cols
!=
1
;
bool
img_contiguous
=
CudaNdarray_is_c_contiguous
(
img
)
;
bool
kern_contiguous
=
CudaNdarray_is_c_contiguous
(
kern
)
;
bool
out_contiguous
=
CudaNdarray_is_c_contiguous
(
out
)
;
bool
img_contiguous
=
img
->
ga
.
flags
&
GA_C_CONTIGUOUS
;
bool
kern_contiguous
=
kern
->
ga
.
flags
&
GA_C_CONTIGUOUS
;
bool
out_contiguous
=
out
->
ga
.
flags
&
GA_C_CONTIGUOUS
;
bool
c_contiguous
=
img_contiguous
&&
kern_contiguous
&&
out_contiguous
;
bool
img_contiguous_2d
=
(
img_stride_col
==
1
)
&&
(
img_stride_row
==
img_wid
);
...
...
@@ -974,7 +977,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
//we don't need to unflip it, but have the new value when we unflip it.
bool
kern_flipped
=
true
;
bool
kern_contiguous_2d_unflipped
=
kern_contiguous_2d
;
float
*
kern_data_unflipped
=
kern
->
devdata
;
const
float
*
kern_data_unflipped
=
cuda_get_ptr
(
kern
)
;
int
kern_stride_col_unflipped
=
kern_stride_col
;
int
kern_stride_row_unflipped
=
kern_stride_row
;
if
(
kern_stride_col_unflipped
==-
1
&&
kern_stride_row_unflipped
==-
kern_wid
){
...
...
@@ -983,7 +986,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
kern_stride_row_unflipped
=
kern_wid
;
kern_flipped
=
false
;
kern_contiguous_2d_unflipped
=
true
;
kern_data_unflipped
=&
(
kern
->
devdata
[(
kern_wid
-
1
)
*
kern_stride_col
+
(
kern_len
-
1
)
*
kern_stride_row
]);
kern_data_unflipped
=&
(
cuda_get_ptr
(
kern
)
[(
kern_wid
-
1
)
*
kern_stride_col
+
(
kern_len
-
1
)
*
kern_stride_row
]);
}
if
(
verbose
>
1
)
...
...
@@ -991,26 +994,26 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
printf
(
"INFO: Running conv_full version=%d,"
" MACRO kern_width=%d with inputs:
\n
"
,
version
,
THEANO_KERN_WID
);
printf
(
"INFO: img dim: %i %i %i %i img stride: %i %i %i %i
\n
"
,
CudaNdarray_HOST_DIMS
(
img
)[
0
],
CudaNdarray_HOST
_DIMS
(
img
)[
1
],
CudaNdarray_HOST_DIMS
(
img
)[
2
],
CudaNdarray_HOST
_DIMS
(
img
)[
3
],
CudaNdarray_HOST_STRIDES
(
img
)[
0
]
,
CudaNdarray_HOST_STRIDES
(
img
)[
1
]
,
CudaNdarray_HOST_STRIDES
(
img
)[
2
]
,
CudaNdarray_HOST_STRIDES
(
img
)[
3
]
);
PyGpuArray_DIMS
(
img
)[
0
],
PyGpuArray
_DIMS
(
img
)[
1
],
PyGpuArray_DIMS
(
img
)[
2
],
PyGpuArray
_DIMS
(
img
)[
3
],
PyGpuArray_STRIDES
(
img
)[
0
]
/
4
,
PyGpuArray_STRIDES
(
img
)[
1
]
/
4
,
PyGpuArray_STRIDES
(
img
)[
2
]
/
4
,
PyGpuArray_STRIDES
(
img
)[
3
]
/
4
);
printf
(
"INFO: kern dim: %i %i %i %i kern stride: %i %i %i %i
\n
"
,
CudaNdarray_HOST_DIMS
(
kern
)[
0
],
CudaNdarray_HOST
_DIMS
(
kern
)[
1
],
CudaNdarray_HOST_DIMS
(
kern
)[
2
],
CudaNdarray_HOST
_DIMS
(
kern
)[
3
],
CudaNdarray_HOST_STRIDES
(
kern
)[
0
]
,
CudaNdarray_HOST_STRIDES
(
kern
)[
1
]
,
CudaNdarray_HOST_STRIDES
(
kern
)[
2
]
,
CudaNdarray_HOST_STRIDES
(
kern
)[
3
]
);
PyGpuArray_DIMS
(
kern
)[
0
],
PyGpuArray
_DIMS
(
kern
)[
1
],
PyGpuArray_DIMS
(
kern
)[
2
],
PyGpuArray
_DIMS
(
kern
)[
3
],
PyGpuArray_STRIDES
(
kern
)[
0
]
/
4
,
PyGpuArray_STRIDES
(
kern
)[
1
]
/
4
,
PyGpuArray_STRIDES
(
kern
)[
2
]
/
4
,
PyGpuArray_STRIDES
(
kern
)[
3
]
/
4
);
printf
(
"INFO: out dim: %i %i %i %i out stride: %i %i %i %i
\n
"
,
CudaNdarray_HOST_DIMS
(
out
)[
0
],
CudaNdarray_HOST
_DIMS
(
out
)[
1
],
CudaNdarray_HOST_DIMS
(
out
)[
2
],
CudaNdarray_HOST
_DIMS
(
out
)[
3
],
CudaNdarray_HOST_STRIDES
(
out
)[
0
]
,
CudaNdarray_HOST_STRIDES
(
out
)[
1
]
,
CudaNdarray_HOST_STRIDES
(
out
)[
2
]
,
CudaNdarray_HOST_STRIDES
(
out
)[
3
]
);
PyGpuArray_DIMS
(
out
)[
0
],
PyGpuArray
_DIMS
(
out
)[
1
],
PyGpuArray_DIMS
(
out
)[
2
],
PyGpuArray
_DIMS
(
out
)[
3
],
PyGpuArray_STRIDES
(
out
)[
0
]
/
4
,
PyGpuArray_STRIDES
(
out
)[
1
]
/
4
,
PyGpuArray_STRIDES
(
out
)[
2
]
/
4
,
PyGpuArray_STRIDES
(
out
)[
3
]
/
4
);
}
if
(
!
subsample
&&
...
...
@@ -1063,7 +1066,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
int
shared_size
=
img_size_padded_byte
+
kern_size_byte
;
if
(
version
==
5
)
shared_size
=
((
kern_len
+
threads
.
y
-
1
)
+
2
*
kern_len
-
2
)
*
img_wid_padded
*
sizeof
(
float
)
+
kern_size_byte
;
void
(
*
f
)(
float
*
,
float
*
,
float
*
,
void
(
*
f
)(
const
float
*
,
const
float
*
,
float
*
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
...
...
@@ -1087,13 +1090,12 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
CONV_FULL_PATCH_STACK_PADDED_SPECIAL
(
THEANO_KERN_WID
);
f
<<<
grid
,
threads
,
shared_size
>>>
(
img
->
devdata
,
kern_data_unflipped
,
out
->
devdata
,
(
cuda_get_ptr
(
img
),
kern_data_unflipped
,
cuda_get_ptr
(
out
)
,
img_len
,
img_wid
,
kern_len
,
kern_wid
,
nkern
,
nstack
,
img_stride_col
,
img_stride_row
,
img_stride_stack
,
img_stride_batch
,
kern_stride_col_unflipped
,
kern_stride_row_unflipped
,
kern_stride_stack
,
kern_stride_nkern
);
CNDA_THREAD_SYNC
;
cudaError_t
sts
=
cudaGetLastError
();
if
(
cudaSuccess
==
sts
)
{
...
...
@@ -1147,14 +1149,13 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
//TODO assert c_continious for img, kern and out in the 2 inner dimensions.
conv_full_patch
<<<
grid
,
threads
,
shared_size
>>>
(
img
->
devdata
,
kern
->
devdata
,
out
->
devdata
,
(
cuda_get_ptr
(
img
)
,
cuda_get_ptr
(
kern
)
,
cuda_get_ptr
(
out
)
,
img_len
,
img_wid
,
kern_len
,
kern_wid
,
nkern
,
nstack
);
CNDA_THREAD_SYNC
;
cudaError_t
sts
=
cudaGetLastError
();
if
(
cudaSuccess
==
sts
)
{
...
...
@@ -1189,30 +1190,29 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
//TODO assert c_continious for img, kern and out in the 2 inner dimensions.
//typeof(conv_full_load_everything<0>) f = ;
void
(
*
f
)(
float
*
,
float
*
,
float
*
,
void
(
*
f
)(
const
float
*
,
const
float
*
,
float
*
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
)
=
conv_full_load_everything
<
0
>
;
f
=
conv_full_load_everything
<
THEANO_KERN_WID
>
;
f
<<<
grid
,
threads
,
shared_size
>>>
(
img
->
devdata
,
kern
->
devdata
,
out
->
devdata
,
(
cuda_get_ptr
(
img
)
,
cuda_get_ptr
(
kern
)
,
cuda_get_ptr
(
out
)
,
img_len
,
img_wid
,
kern_len
,
kern_wid
,
nkern
,
nstack
,
CudaNdarray_HOST_STRIDES
(
img
)[
3
]
,
CudaNdarray_HOST_STRIDES
(
img
)[
2
]
,
CudaNdarray_HOST_STRIDES
(
img
)[
1
]
,
CudaNdarray_HOST_STRIDES
(
img
)[
0
]
,
CudaNdarray_HOST_STRIDES
(
kern
)[
3
]
,
CudaNdarray_HOST_STRIDES
(
kern
)[
2
]
,
CudaNdarray_HOST_STRIDES
(
kern
)[
1
]
,
CudaNdarray_HOST_STRIDES
(
kern
)[
0
]
PyGpuArray_STRIDES
(
img
)[
3
]
/
4
,
PyGpuArray_STRIDES
(
img
)[
2
]
/
4
,
PyGpuArray_STRIDES
(
img
)[
1
]
/
4
,
PyGpuArray_STRIDES
(
img
)[
0
]
/
4
,
PyGpuArray_STRIDES
(
kern
)[
3
]
/
4
,
PyGpuArray_STRIDES
(
kern
)[
2
]
/
4
,
PyGpuArray_STRIDES
(
kern
)[
1
]
/
4
,
PyGpuArray_STRIDES
(
kern
)[
0
]
/
4
);
CNDA_THREAD_SYNC
;
cudaError_t
sts
=
cudaGetLastError
();
if
(
cudaSuccess
==
sts
)
{
...
...
@@ -1246,7 +1246,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
dim3
grid
(
nbatch
,
nkern
);
int
shared_size
=
(
img_size
+
kern_size
)
*
sizeof
(
float
);
void
(
*
f
)(
float
*
,
float
*
,
float
*
,
void
(
*
f
)(
const
float
*
,
const
float
*
,
float
*
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
);
...
...
@@ -1257,15 +1257,15 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
else
if
(
!
img_contiguous_2d
&&
!
kern_contiguous_2d
)
f
=
conv_full_patch_stack
<
false
,
false
>
;
f
<<<
grid
,
threads
,
shared_size
>>>
(
img
->
devdata
,
kern
->
devdata
,
out
->
devdata
,
cuda_get_ptr
(
img
)
,
cuda_get_ptr
(
kern
)
,
cuda_get_ptr
(
out
)
,
img_len
,
img_wid
,
kern_len
,
kern_wid
,
nkern
,
nstack
,
img_stride_col
,
img_stride_row
,
kern_stride_col
,
kern_stride_row
,
kern_stride_stack
,
kern_stride_nkern
);
CNDA_THREAD_SYNC
;
cudaError_t
sts
=
cudaGetLastError
();
if
(
cudaSuccess
==
sts
)
{
...
...
@@ -1290,48 +1290,48 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
{
if
(
verbose
>
1
)
fprintf
(
stderr
,
"INFO: will start conv_reference_full
\n
"
);
int
outsize
=
CudaNda
rray_SIZE
(
out
);
int
n_blocks
=
std
::
min
(
outsize
,
NUM_VECTOR_OP_BLOCKS
);
int
outsize
=
PyGpuA
rray_SIZE
(
out
);
int
n_blocks
=
std
::
min
(
outsize
,
4096
);
int
n_threads
=
std
::
min
(
ceil_intdiv
(
outsize
,
n_blocks
),
NUM_VECTOR_OP_THREADS_PER_BLOCK
);
256
);
if
(
0
)
{
if
(
verbose
)
fprintf
(
stderr
,
"INFO: launching conv_reference_valid
\n
"
);
if
(
verbose
)
fprintf
(
stderr
,
" img : %i %i %i %i %p %i %i %i %i
\n
"
,
CudaNdarray_HOST
_DIMS
(
img
)[
0
],
CudaNdarray_HOST
_DIMS
(
img
)[
1
],
CudaNdarray_HOST
_DIMS
(
img
)[
2
],
CudaNdarray_HOST
_DIMS
(
img
)[
3
],
img
->
devdata
,
CudaNdarray_HOST_STRIDES
(
img
)[
0
]
,
CudaNdarray_HOST_STRIDES
(
img
)[
1
]
,
CudaNdarray_HOST_STRIDES
(
img
)[
2
]
,
CudaNdarray_HOST_STRIDES
(
img
)[
3
]
);
PyGpuArray
_DIMS
(
img
)[
0
],
PyGpuArray
_DIMS
(
img
)[
1
],
PyGpuArray
_DIMS
(
img
)[
2
],
PyGpuArray
_DIMS
(
img
)[
3
],
cuda_get_ptr
(
img
)
,
PyGpuArray_STRIDES
(
img
)[
0
]
/
4
,
PyGpuArray_STRIDES
(
img
)[
1
]
/
4
,
PyGpuArray_STRIDES
(
img
)[
2
]
/
4
,
PyGpuArray_STRIDES
(
img
)[
3
]
/
4
);
if
(
verbose
)
fprintf
(
stderr
,
" kern: %i %i %i %i %p %i %i %i %i
\n
"
,
CudaNdarray_HOST
_DIMS
(
kern
)[
0
],
CudaNdarray_HOST
_DIMS
(
kern
)[
1
],
CudaNdarray_HOST
_DIMS
(
kern
)[
2
],
CudaNdarray_HOST
_DIMS
(
kern
)[
3
],
kern
->
devdata
,
CudaNdarray_HOST_STRIDES
(
kern
)[
0
]
,
CudaNdarray_HOST_STRIDES
(
kern
)[
1
]
,
CudaNdarray_HOST_STRIDES
(
kern
)[
2
]
,
CudaNdarray_HOST_STRIDES
(
kern
)[
3
]
PyGpuArray
_DIMS
(
kern
)[
0
],
PyGpuArray
_DIMS
(
kern
)[
1
],
PyGpuArray
_DIMS
(
kern
)[
2
],
PyGpuArray
_DIMS
(
kern
)[
3
],
cuda_get_ptr
(
kern
)
,
PyGpuArray_STRIDES
(
kern
)[
0
]
/
4
,
PyGpuArray_STRIDES
(
kern
)[
1
]
/
4
,
PyGpuArray_STRIDES
(
kern
)[
2
]
/
4
,
PyGpuArray_STRIDES
(
kern
)[
3
]
/
4
);
if
(
verbose
)
fprintf
(
stderr
,
" out : %i %i %i %i %p %i %i %i %i
\n
"
,
CudaNdarray_HOST
_DIMS
(
out
)[
0
],
CudaNdarray_HOST
_DIMS
(
out
)[
1
],
CudaNdarray_HOST
_DIMS
(
out
)[
2
],
CudaNdarray_HOST
_DIMS
(
out
)[
3
],
out
->
devdata
,
CudaNdarray_HOST_STRIDES
(
out
)[
0
]
,
CudaNdarray_HOST_STRIDES
(
out
)[
1
]
,
CudaNdarray_HOST_STRIDES
(
out
)[
2
]
,
CudaNdarray_HOST_STRIDES
(
out
)[
3
]
);
PyGpuArray
_DIMS
(
out
)[
0
],
PyGpuArray
_DIMS
(
out
)[
1
],
PyGpuArray
_DIMS
(
out
)[
2
],
PyGpuArray
_DIMS
(
out
)[
3
],
cuda_get_ptr
(
out
)
,
PyGpuArray_STRIDES
(
out
)[
0
]
/
4
,
PyGpuArray_STRIDES
(
out
)[
1
]
/
4
,
PyGpuArray_STRIDES
(
out
)[
2
]
/
4
,
PyGpuArray_STRIDES
(
out
)[
3
]
/
4
);
if
(
verbose
)
fprintf
(
stderr
,
" launch params: %i %i %i
\n
"
,
outsize
,
n_blocks
,
n_threads
);
...
...
@@ -1340,25 +1340,24 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
subsample_rows
,
subsample_cols
);
}
conv_reference_full
<<<
n_blocks
,
n_threads
>>>
(
CudaNdarray_HOST_DIMS
(
img
)[
0
],
CudaNdarray_HOST
_DIMS
(
kern
)[
0
],
CudaNdarray_HOST
_DIMS
(
img
)[
1
],
CudaNdarray_HOST_DIMS
(
img
)[
2
],
CudaNdarray_HOST
_DIMS
(
img
)[
3
],
CudaNdarray_HOST_DIMS
(
kern
)[
2
],
CudaNdarray_HOST
_DIMS
(
kern
)[
3
],
CudaNdarray_HOST_DIMS
(
out
)[
2
],
CudaNdarray_HOST
_DIMS
(
out
)[
3
],
img
->
devdata
,
CudaNdarray_HOST_STRIDES
(
img
)[
0
]
,
CudaNdarray_HOST_STRIDES
(
img
)[
1
]
,
CudaNdarray_HOST_STRIDES
(
img
)[
2
]
,
CudaNdarray_HOST_STRIDES
(
img
)[
3
]
,
kern
->
devdata
,
CudaNdarray_HOST_STRIDES
(
kern
)[
0
]
,
CudaNdarray_HOST_STRIDES
(
kern
)[
1
]
,
CudaNdarray_HOST_STRIDES
(
kern
)[
2
]
,
CudaNdarray_HOST_STRIDES
(
kern
)[
3
]
,
out
->
devdata
,
CudaNdarray_HOST_STRIDES
(
out
)[
0
]
,
CudaNdarray_HOST_STRIDES
(
out
)[
1
]
,
CudaNdarray_HOST_STRIDES
(
out
)[
2
]
,
CudaNdarray_HOST_STRIDES
(
out
)[
3
]
,
PyGpuArray_DIMS
(
img
)[
0
],
PyGpuArray
_DIMS
(
kern
)[
0
],
PyGpuArray
_DIMS
(
img
)[
1
],
PyGpuArray_DIMS
(
img
)[
2
],
PyGpuArray
_DIMS
(
img
)[
3
],
PyGpuArray_DIMS
(
kern
)[
2
],
PyGpuArray
_DIMS
(
kern
)[
3
],
PyGpuArray_DIMS
(
out
)[
2
],
PyGpuArray
_DIMS
(
out
)[
3
],
cuda_get_ptr
(
img
),
PyGpuArray_STRIDES
(
img
)[
0
]
/
4
,
PyGpuArray_STRIDES
(
img
)[
1
]
/
4
,
PyGpuArray_STRIDES
(
img
)[
2
]
/
4
,
PyGpuArray_STRIDES
(
img
)[
3
]
/
4
,
cuda_get_ptr
(
kern
),
PyGpuArray_STRIDES
(
kern
)[
0
]
/
4
,
PyGpuArray_STRIDES
(
kern
)[
1
]
/
4
,
PyGpuArray_STRIDES
(
kern
)[
2
]
/
4
,
PyGpuArray_STRIDES
(
kern
)[
3
]
/
4
,
cuda_get_ptr
(
out
),
PyGpuArray_STRIDES
(
out
)[
0
]
/
4
,
PyGpuArray_STRIDES
(
out
)[
1
]
/
4
,
PyGpuArray_STRIDES
(
out
)[
2
]
/
4
,
PyGpuArray_STRIDES
(
out
)[
3
]
/
4
,
subsample_rows
,
subsample_cols
);
CNDA_THREAD_SYNC
;
cudaError_t
sts
=
cudaGetLastError
();
if
(
cudaSuccess
==
sts
)
...
...
@@ -1392,9 +1391,9 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
}
PyObject
*
CudaNdarray_Conv
(
CudaNdarray
*
img
,
CudaNdarray
*
kern
,
CudaNdarray
*
out
,
const
int
mode
,
const
int
subsample_rows
,
const
in
t
subsample_cols
,
PyGpuArray_Conv
(
PyGpuArrayObject
*
img
,
PyGpuArrayObject
*
kern
,
PyGpuArrayObject
*
out
,
const
int
mode
,
const
size_t
subsample_rows
,
const
size_
t
subsample_cols
,
const
int
version
,
const
int
verbose
,
const
int
max_threads_dim0
=
512
)
...
...
@@ -1402,43 +1401,43 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern,
// Re-use the out object if possible. If the out object it not used, then its refcount is not modified.
// If the out object is re-used then it is returned, and its refcount is incremented by 1.
//
if
(
img
->
nd
!=
4
)
if
(
PyGpuArray_NDIM
(
img
)
!=
4
)
{
PyErr_SetString
(
PyExc_ValueError
,
"
CudaNda
rray 4-D tensor required"
);
PyErr_SetString
(
PyExc_ValueError
,
"
PyGpuA
rray 4-D tensor required"
);
return
NULL
;
}
if
(
kern
->
nd
!=
4
)
if
(
PyGpuArray_NDIM
(
kern
)
!=
4
)
{
PyErr_SetString
(
PyExc_ValueError
,
"
CudaNda
rray 4-D tensor required"
);
PyErr_SetString
(
PyExc_ValueError
,
"
PyGpuA
rray 4-D tensor required"
);
return
NULL
;
}
in
t
out_dim
[
4
];
out_dim
[
0
]
=
CudaNdarray_HOST
_DIMS
(
img
)[
0
];
out_dim
[
1
]
=
CudaNdarray_HOST
_DIMS
(
kern
)[
0
];
in
t
logical_rows
,
logical_cols
;
size_
t
out_dim
[
4
];
out_dim
[
0
]
=
PyGpuArray
_DIMS
(
img
)[
0
];
out_dim
[
1
]
=
PyGpuArray
_DIMS
(
kern
)[
0
];
size_
t
logical_rows
,
logical_cols
;
if
(
mode
==
ConvMode_VALID
)
{
logical_rows
=
CudaNdarray_HOST_DIMS
(
img
)[
2
]
-
CudaNdarray_HOST
_DIMS
(
kern
)[
2
]
+
1
;
logical_cols
=
CudaNdarray_HOST_DIMS
(
img
)[
3
]
-
CudaNdarray_HOST
_DIMS
(
kern
)[
3
]
+
1
;
logical_rows
=
PyGpuArray_DIMS
(
img
)[
2
]
-
PyGpuArray
_DIMS
(
kern
)[
2
]
+
1
;
logical_cols
=
PyGpuArray_DIMS
(
img
)[
3
]
-
PyGpuArray
_DIMS
(
kern
)[
3
]
+
1
;
}
else
{
logical_rows
=
CudaNdarray_HOST_DIMS
(
img
)[
2
]
+
CudaNdarray_HOST
_DIMS
(
kern
)[
2
]
-
1
;
logical_cols
=
CudaNdarray_HOST_DIMS
(
img
)[
3
]
+
CudaNdarray_HOST
_DIMS
(
kern
)[
3
]
-
1
;
logical_rows
=
PyGpuArray_DIMS
(
img
)[
2
]
+
PyGpuArray
_DIMS
(
kern
)[
2
]
-
1
;
logical_cols
=
PyGpuArray_DIMS
(
img
)[
3
]
+
PyGpuArray
_DIMS
(
kern
)[
3
]
-
1
;
}
out_dim
[
2
]
=
ceil_intdiv
(
logical_rows
,
subsample_rows
);
out_dim
[
3
]
=
ceil_intdiv
(
logical_cols
,
subsample_cols
);
CudaNdarray
*
rval
=
NULL
;
PyGpuArrayObject
*
rval
=
NULL
;
if
(
out
&&
out
->
nd
==
4
&&
CudaNdarray_is_c_contiguous
(
out
)
&&
CudaNdarray_HOST
_DIMS
(
out
)[
0
]
==
out_dim
[
0
]
&&
CudaNdarray_HOST
_DIMS
(
out
)[
1
]
==
out_dim
[
1
]
&&
CudaNdarray_HOST
_DIMS
(
out
)[
2
]
==
out_dim
[
2
]
&&
CudaNdarray_HOST
_DIMS
(
out
)[
3
]
==
out_dim
[
3
])
&&
PyGpuArray_NDIM
(
out
)
==
4
&&
out
->
ga
.
flags
&
GA_C_CONTIGUOUS
&&
PyGpuArray
_DIMS
(
out
)[
0
]
==
out_dim
[
0
]
&&
PyGpuArray
_DIMS
(
out
)[
1
]
==
out_dim
[
1
]
&&
PyGpuArray
_DIMS
(
out
)[
2
]
==
out_dim
[
2
]
&&
PyGpuArray
_DIMS
(
out
)[
3
]
==
out_dim
[
3
])
{
rval
=
out
;
Py_INCREF
(
rval
);
...
...
@@ -1458,20 +1457,22 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern,
"INFO: Conv don't have an 'out' argument"
" structure.
\n
"
);
rval
=
(
CudaNdarray
*
)
CudaNdarray_NewDims
(
4
,
out_dim
);
rval
=
pygpu_zeros
(
4
,
out_dim
,
img
->
ga
.
typecode
,
GA_C_ORDER
,
pygpu_default_context
(),
Py_None
);
//rval might be null
}
if
((
rval
==
NULL
)
||
((
mode
==
ConvMode_VALID
)
&&
CudaNda
rray_conv_valid
(
img
,
kern
,
rval
,
subsample_rows
,
subsample_cols
,
version
,
verbose
,
max_threads_dim0
))
||
((
mode
==
ConvMode_FULL
)
&&
CudaNda
rray_conv_full
(
img
,
kern
,
rval
,
subsample_rows
,
subsample_cols
,
version
,
verbose
,
max_threads_dim0
))
||
((
mode
==
ConvMode_VALID
)
&&
PyGpuA
rray_conv_valid
(
img
,
kern
,
rval
,
subsample_rows
,
subsample_cols
,
version
,
verbose
,
max_threads_dim0
))
||
((
mode
==
ConvMode_FULL
)
&&
PyGpuA
rray_conv_full
(
img
,
kern
,
rval
,
subsample_rows
,
subsample_cols
,
version
,
verbose
,
max_threads_dim0
))
)
{
// if rval is something we just allocated,
...
...
theano/sandbox/gpuarray/conv.py
浏览文件 @
5fc89c03
import
copy
import
os
import
theano
from
theano
import
gof
from
theano
import
config
,
gof
from
theano.sandbox.cuda.nvcc_compiler
import
NVCC_compiler
from
theano.sandbox.gpuarray.type
import
GpuArrayType
class
GpuConv
(
gof
.
Op
):
...
...
@@ -114,6 +119,9 @@ class GpuConv(gof.Op):
str
(
self
.
kshp
))
def
make_node
(
self
,
img
,
kern
):
if
img
.
dtype
!=
"float32"
or
kern
.
dtype
!=
"float32"
:
raise
NotImplementedError
(
"GpuConv currently only work"
" with float32 dtype"
)
if
img
.
type
.
ndim
!=
4
:
raise
TypeError
(
'img must be 4D tensor'
)
if
kern
.
type
.
ndim
!=
4
:
...
...
@@ -121,7 +129,8 @@ class GpuConv(gof.Op):
broadcastable
=
[
img
.
type
.
broadcastable
[
0
],
kern
.
type
.
broadcastable
[
0
],
False
,
False
]
return
Apply
(
self
,
[
img
,
kern
],
[
CudaNdarrayType
(
broadcastable
)()])
out
=
GpuArrayType
(
img
.
dtype
,
broadcastable
)()
return
gof
.
Apply
(
self
,
[
img
,
kern
],
[
out
])
def
flops
(
self
,
inputs
,
outputs
):
""" Useful with the hack in profilemode to print the MFlops"""
...
...
@@ -145,6 +154,8 @@ class GpuConv(gof.Op):
def
make_thunk
(
self
,
node
,
storage_map
,
compute_map
,
no_recycling
):
node_
=
copy
.
copy
(
node
)
assert
node
.
op
is
node_
.
op
if
config
.
gpuarray
.
sync
:
raise
NotImplementedError
(
"GpuConv do not implement gpuarray.sync Theano flag"
)
if
node_
.
op
.
max_threads_dim0
is
None
:
cuda
=
theano
.
sandbox
.
cuda
device_id
=
cuda
.
use
.
device_number
...
...
@@ -169,20 +180,30 @@ class GpuConv(gof.Op):
return
[
'-DTHEANO_KERN_WID='
+
str
(
nb
)]
# ,'-g','-G']
def
c_headers
(
self
):
return
[
'cuda_ndarray.cuh'
,
'<stdio.h>'
]
return
[
'<stdio.h>'
,
'cuda.h'
,
'<compyte/extension.h>'
,
'<compyte/numpy_compat.h>'
]
def
c_code_cache_version
(
self
):
# raise this whenever modifying any of the support_code_files
return
(
0
,
20
)
def
c_init_code
(
self
):
return
[
'cuda_get_ptr_raw = (CUdeviceptr (*)(gpudata *g))compyte_get_extension("cuda_get_ptr");'
]
def
c_support_code_apply
(
self
,
node
,
nodename
):
# REMEMBER TO RAISE c_code_cache_version when changing any of
# these files
files
=
[
'conv_kernel.cu'
,
'conv_full_kernel.cu'
,
'conv.cu'
]
codes
=
[
open
(
os
.
path
.
join
(
os
.
path
.
split
(
__file__
)[
0
],
f
))
.
read
()
for
f
in
files
]
codes
=
[
"CUdeviceptr (*cuda_get_ptr_raw)(gpudata *g);"
,
"float* cuda_get_ptr(PyGpuArrayObject * o){return (float*) cuda_get_ptr_raw(o->ga.data);}"
,
"const float* cuda_get_ptr(const PyGpuArrayObject * o){return (float*) cuda_get_ptr_raw(o->ga.data);}"
]
codes
+=
[
open
(
os
.
path
.
join
(
os
.
path
.
split
(
__file__
)[
0
],
f
))
.
read
()
for
f
in
files
]
return
reduce
(
str
.
__add__
,
codes
)
def
c_compiler
(
self
):
return
NVCC_compiler
def
c_code
(
self
,
node
,
nodename
,
inp
,
out_
,
sub
):
img
,
kern
=
inp
out
,
=
out_
...
...
@@ -226,7 +247,8 @@ class GpuConv(gof.Op):
}
// TODO, make out be decref before we alloc out2!
CudaNdarray * out2 = (CudaNdarray *)CudaNdarray_Conv(
%(img)
s,
%(kern)
s,
PyGpuArrayObject * out2 = (PyGpuArrayObject *)PyGpuArray_Conv(
%(img)
s,
%(kern)
s,
%(out)
s, mode,
dx, dy,
version, verbose,
...
...
theano/sandbox/gpuarray/conv_full_kernel.cu
浏览文件 @
5fc89c03
...
...
@@ -4,7 +4,8 @@
//grid block size=batch_id
//dynamic shared memory: img_len*img_wid+kern_len*kern_wid
__global__
void
conv_full_patch_split
(
float
*
img
,
float
*
kern
,
float
*
out
,
int
img_len
,
int
img_wid
,
int
kern_len
,
int
kern_wid
,
int
nb_split
)
conv_full_patch_split
(
const
float
*
img
,
const
float
*
kern
,
float
*
out
,
int
img_len
,
int
img_wid
,
int
kern_len
,
int
kern_wid
,
int
nb_split
)
{
int
__shared__
out_len
,
out_wid
,
nb_thread_id
;
out_len
=
img_len
+
kern_len
-
1
;
...
...
@@ -60,7 +61,7 @@ conv_full_patch_split( float* img, float* kern, float* out, int img_len, int img
//grid block size=batch_id, nkern
//dynamic shared memory: img_len*img_wid+kern_len*kern_wid
__global__
void
conv_full_patch
(
float
*
img
,
float
*
kern
,
float
*
out
,
conv_full_patch
(
const
float
*
img
,
const
float
*
kern
,
float
*
out
,
int
img_len
,
int
img_wid
,
int
kern_len
,
int
kern_wid
,
int
nkern
,
int
nstack
)
{
...
...
@@ -122,7 +123,7 @@ conv_full_patch( float* img, float* kern, float* out,
template
<
bool
img_c_contiguous_2d
,
bool
kern_c_contiguous_2d
>
__global__
void
conv_full_patch_stack
(
float
*
img
,
float
*
kern
,
float
*
out
,
conv_full_patch_stack
(
const
float
*
img
,
const
float
*
kern
,
float
*
out
,
int
img_len
,
int
img_wid
,
int
kern_len
,
int
kern_wid
,
int
nkern
,
int
nstack
,
int
img_stride_col
,
int
img_stride_row
,
...
...
@@ -133,7 +134,7 @@ conv_full_patch_stack( float* img, float* kern, float* out,
out_len
=
img_len
+
kern_len
-
1
;
out_wid
=
img_wid
+
kern_wid
-
1
;
nb_thread_id
=
blockDim
.
y
*
blockDim
.
x
;
//blockDim.z*
float
__shared__
*
kern_
,
*
img_
;
const
float
__shared__
*
kern_
,
*
img_
;
extern
__shared__
float
s_data
[];
const
int
batch_id
=
blockIdx
.
x
;
...
...
@@ -201,7 +202,7 @@ conv_full_patch_stack( float* img, float* kern, float* out,
*/
template
<
bool
flipped_kern
,
int
KERN_WIDTH
,
bool
c_contiguous
,
bool
split
,
bool
low_mem
>
__global__
void
conv_full_patch_stack_padded
(
float
*
img
,
float
*
kern
,
float
*
out
,
conv_full_patch_stack_padded
(
const
float
*
img
,
const
float
*
kern
,
float
*
out
,
const
int
img_len
,
const
int
img_wid
,
const
int
kern_len
,
const
int
kern_wid
,
const
int
nkern
,
const
int
nstack
,
...
...
@@ -365,7 +366,7 @@ template <> __device__ float everything_dot<1>(const float * x, const int sx, co
}
template
<
int
NSTACK
>
__global__
void
conv_full_load_everything
(
float
*
img
,
float
*
kern
,
float
*
out
,
conv_full_load_everything
(
const
float
*
img
,
const
float
*
kern
,
float
*
out
,
int
img_len
,
int
img_wid
,
int
kern_len
,
int
kern_wid
,
int
nkern
,
int
nstack
,
int
img_stride_col
,
int
img_stride_row
,
...
...
theano/sandbox/gpuarray/conv_kernel.cu
浏览文件 @
5fc89c03
...
...
@@ -221,7 +221,7 @@ __device__ void store_or_accumulate(float& dst,const float value ){
*/
template
<
bool
flipped_kern
,
int
KERN_WIDTH
,
bool
split
>
__global__
void
conv_patch
(
float
*
img
,
float
*
kern
,
float
*
out
,
conv_patch
(
const
float
*
img
,
const
float
*
kern
,
float
*
out
,
int
img_len
,
int
img_wid
,
int
kern_len
,
int
kern_wid
,
int
nkern
,
int
nstack
)
{
...
...
@@ -304,7 +304,7 @@ conv_patch( float* img, float* kern, float* out,
*/
template
<
bool
flipped_kern
,
bool
accumulate
,
int
KERN_WIDTH
,
bool
img_c_contiguous_2d
,
bool
kern_c_contiguous_2d
,
bool
split
,
bool
preload_full_kern
,
bool
subsample
>
__global__
void
conv_patch_stack
(
float
*
img
,
float
*
kern
,
float
*
out
,
conv_patch_stack
(
const
float
*
img
,
const
float
*
kern
,
float
*
out
,
int
img_len
,
int
img_wid
,
int
kern_len
,
int
kern_wid
,
int
out_len
,
int
out_wid
,
int
nkern
,
int
nstack
,
int
img_stride_col
,
int
img_stride_row
,
...
...
@@ -375,7 +375,7 @@ conv_patch_stack( float* img, float* kern, float* out,
out_row
*
out_wid
+
out_col
],
sum
);
}
else
{
float
__shared__
*
kern_
,
*
img_
;
const
float
__shared__
*
kern_
,
*
img_
;
int
__shared__
out_len_max
;
kern_
=
kern
+
kern_stride_nkern
*
kern_id
;
//the good nkern
...
...
@@ -456,7 +456,7 @@ conv_patch_stack( float* img, float* kern, float* out,
*/
template
<
bool
flipped_kern
,
int
KERN_WIDTH
,
bool
c_contiguous
,
bool
split
,
bool
preload_full_kern
>
__global__
void
conv_patch_stack_reduce
(
float
*
img
,
float
*
kern
,
float
*
out
,
conv_patch_stack_reduce
(
const
float
*
img
,
const
float
*
kern
,
float
*
out
,
int
img_len
,
int
img_wid
,
int
kern_len
,
int
kern_wid
,
int
nkern
,
int
nstack
,
int
img_stride_col
,
int
img_stride_row
,
int
img_stride_stack
,
int
img_stride_batch
,
...
...
@@ -572,7 +572,7 @@ conv_patch_stack_reduce( float* img, float* kern, float* out,
*/
template
<
int
KERN_WIDTH
,
bool
c_contiguous
>
__global__
void
conv_rows
(
float
*
img
,
float
*
kern
,
float
*
out
,
conv_rows
(
const
float
*
img
,
const
float
*
kern
,
float
*
out
,
int
img_len
,
int
img_wid
,
int
kern_len
,
int
kern_wid
,
int
nkern
,
int
nstack
,
int
img_stride_col
,
int
img_stride_row
,
...
...
@@ -633,7 +633,7 @@ conv_rows( float* img, float* kern, float* out,
*/
template
<
int
KERN_WIDTH
,
bool
c_contiguous
>
__global__
void
conv_rows_stack
(
float
*
img
,
float
*
kern
,
float
*
out
,
conv_rows_stack
(
const
float
*
img
,
const
float
*
kern
,
float
*
out
,
const
int
img_len
,
const
int
img_wid
,
const
int
kern_len
,
const
int
kern_wid
,
const
int
nkern
,
const
int
nstack
,
const
int
img_stride_col
,
const
int
img_stride_row
,
...
...
@@ -731,7 +731,7 @@ conv_rows_stack( float* img, float* kern, float* out,
*/
template
<
int
KERN_WIDTH
,
bool
c_contiguous
,
bool
preload_full_kern
>
__global__
void
conv_rows_stack2
(
float
*
img
,
float
*
kern
,
float
*
out
,
conv_rows_stack2
(
const
float
*
img
,
const
float
*
kern
,
float
*
out
,
const
int
img_len
,
const
int
img_wid
,
const
int
kern_len
,
const
int
kern_wid
,
const
int
nkern
,
const
int
nstack
,
const
int
img_stride_col
,
const
int
img_stride_row
,
...
...
@@ -831,8 +831,8 @@ conv_valid_row_reduce(int nB, int nK, int stacklen,
int
img_len
,
int
img_wid
,
int
kern_len
,
int
kern_wid
,
int
out_len
,
int
out_wid
,
//physical
float
*
img
,
int
img_str_B
,
int
img_str_S
,
int
img_str_R
,
int
img_str_C
,
float
*
kern
,
int
kern_str_K
,
int
kern_str_S
,
int
kern_str_R
,
int
kern_str_C
,
const
float
*
img
,
int
img_str_B
,
int
img_str_S
,
int
img_str_R
,
int
img_str_C
,
const
float
*
kern
,
int
kern_str_K
,
int
kern_str_S
,
int
kern_str_R
,
int
kern_str_C
,
float
*
out
,
int
out_str_B
,
int
out_str_K
,
int
out_str_R
,
int
out_str_C
,
int
subsample_rows
,
int
subsample_cols
,
const
int
initial_reduce_boundary
)
...
...
@@ -859,8 +859,8 @@ conv_valid_row_reduce(int nB, int nK, int stacklen,
float
sum
=
0.0
f
;
if
(
stack_loop
){
for
(;
ss
<
stacklen
;
ss
+=
blockDim
.
x
){
float
*
kk_0
=
kern
+
iK
*
kern_str_K
+
ss
*
kern_str_S
+
rr
*
kern_str_R
;
float
*
ii_0
=
img
+
iB
*
img_str_B
+
ss
*
img_str_S
+
img_rr
*
img_str_R
+
(
iC_logical
+
kern_wid
-
1
)
*
img_str_C
;
const
float
*
kk_0
=
kern
+
iK
*
kern_str_K
+
ss
*
kern_str_S
+
rr
*
kern_str_R
;
const
float
*
ii_0
=
img
+
iB
*
img_str_B
+
ss
*
img_str_S
+
img_rr
*
img_str_R
+
(
iC_logical
+
kern_wid
-
1
)
*
img_str_C
;
for
(
int
cc
=
0
;
cc
<
kern_wid
;
++
cc
)
{
sum
+=
kk_0
[
0
]
*
ii_0
[
0
];
...
...
@@ -869,8 +869,8 @@ conv_valid_row_reduce(int nB, int nK, int stacklen,
}
}
}
else
{
float
*
kk_0
=
kern
+
iK
*
kern_str_K
+
ss
*
kern_str_S
+
rr
*
kern_str_R
;
float
*
ii_0
=
img
+
iB
*
img_str_B
+
ss
*
img_str_S
+
img_rr
*
img_str_R
+
(
iC_logical
+
kern_wid
-
1
)
*
img_str_C
;
const
float
*
kk_0
=
kern
+
iK
*
kern_str_K
+
ss
*
kern_str_S
+
rr
*
kern_str_R
;
const
float
*
ii_0
=
img
+
iB
*
img_str_B
+
ss
*
img_str_S
+
img_rr
*
img_str_R
+
(
iC_logical
+
kern_wid
-
1
)
*
img_str_C
;
for
(
int
cc
=
0
;
cc
<
kern_wid
;
++
cc
)
{
sum
+=
kk_0
[
0
]
*
ii_0
[
0
];
...
...
@@ -925,8 +925,8 @@ conv_reference_valid(int nB, int nK, int stacklen,
int
img_len
,
int
img_wid
,
int
kern_len
,
int
kern_wid
,
int
out_len
,
int
out_wid
,
//physical
float
*
img
,
int
img_str_B
,
int
img_str_S
,
int
img_str_R
,
int
img_str_C
,
float
*
kern
,
int
kern_str_K
,
int
kern_str_S
,
int
kern_str_R
,
int
kern_str_C
,
const
float
*
img
,
int
img_str_B
,
int
img_str_S
,
int
img_str_R
,
int
img_str_C
,
const
float
*
kern
,
int
kern_str_K
,
int
kern_str_S
,
int
kern_str_R
,
int
kern_str_C
,
float
*
out
,
int
out_str_B
,
int
out_str_K
,
int
out_str_R
,
int
out_str_C
,
int
subsample_rows
,
int
subsample_cols
)
{
...
...
@@ -984,8 +984,8 @@ conv_reference_full(int nB, int nK, int stacklen,
int
img_len
,
int
img_wid
,
int
kern_len
,
int
kern_wid
,
int
out_len
,
int
out_wid
,
//physical dimensions
float
*
img
,
int
img_str_B
,
int
img_str_S
,
int
img_str_R
,
int
img_str_C
,
float
*
kern
,
int
kern_str_K
,
int
kern_str_S
,
int
kern_str_R
,
int
kern_str_C
,
const
float
*
img
,
int
img_str_B
,
int
img_str_S
,
int
img_str_R
,
int
img_str_C
,
const
float
*
kern
,
int
kern_str_K
,
int
kern_str_S
,
int
kern_str_R
,
int
kern_str_C
,
float
*
out
,
int
out_str_B
,
int
out_str_K
,
int
out_str_R
,
int
out_str_C
,
int
subsample_rows
,
int
subsample_cols
)
{
...
...
theano/sandbox/gpuarray/tests/test_conv_cuda_ndarray.py
浏览文件 @
5fc89c03
...
...
@@ -25,6 +25,7 @@ from theano.tests.unittest_tools import seed_rng
from
theano.sandbox.gpuarray.tests.test_basic_ops
import
(
mode_with_gpu
,
mode_without_gpu
)
from
theano.sandbox.gpuarray.type
import
GpuArrayType
from
theano.sandbox.gpuarray.conv
import
GpuConv
import
pygpu
gftensor4
=
GpuArrayType
(
'float32'
,
[
False
]
*
4
)
...
...
@@ -159,11 +160,11 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
t1
=
time
.
time
()
i
=
gftensor4
()
k
=
gftensor4
()
op
=
theano
.
sandbox
.
cuda
.
blas
.
GpuConv
(
border_mode
=
mode
,
subsample
=
subsample
,
version
=
version
,
verbose
=
verbose
,
kshp
=
compile_kshp
)(
i
,
k
)
op
=
GpuConv
(
border_mode
=
mode
,
subsample
=
subsample
,
version
=
version
,
verbose
=
verbose
,
kshp
=
compile_kshp
)(
i
,
k
)
f
=
theano
.
function
([
i
,
k
],
op
,
mode
=
mode_with_gpu
)
gpuval
=
f
(
img
,
kern
)
t2
=
time
.
time
()
...
...
@@ -731,7 +732,7 @@ class TestConv2DGPU(unittest.TestCase):
func
=
theano
.
function
([
a
,
A
],
image_estimate
,
mode
=
mode_with_gpu
)
#theano.printing.debugprint(func,)
assert
any
([
isinstance
(
node
.
op
,
theano
.
sandbox
.
cuda
.
blas
.
GpuConv
)
assert
any
([
isinstance
(
node
.
op
,
GpuConv
)
for
node
in
func
.
maker
.
fgraph
.
toposort
()])
a_in
=
numpy
.
random
.
randn
(
*
featshp
)
.
astype
(
"float32"
)
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论