Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
42f4cb3e
提交
42f4cb3e
authored
7月 22, 2014
作者:
Arnaud Bergeron
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Add batch support to blocksparse.
上级
47d59687
显示空白字符变更
内嵌
并排
正在显示
2 个修改的文件
包含
192 行增加
和
141 行删除
+192
-141
blocksparse.py
theano/sandbox/cuda/blocksparse.py
+162
-117
test_blocksparse.py
theano/sandbox/cuda/tests/test_blocksparse.py
+30
-24
没有找到文件。
theano/sandbox/cuda/blocksparse.py
浏览文件 @
42f4cb3e
...
@@ -74,24 +74,29 @@ def ger(alpha, x, y, A):
...
@@ -74,24 +74,29 @@ def ger(alpha, x, y, A):
class
SparseBlockGemvSS
(
GpuOp
):
class
SparseBlockGemvSS
(
GpuOp
):
def
__init__
(
self
,
inplace
=
False
):
self
.
inplace
=
inplace
if
self
.
inplace
:
self
.
destroy_map
=
{
0
:
[
0
]}
def
__eq__
(
self
,
other
):
def
__eq__
(
self
,
other
):
return
type
(
self
)
==
type
(
other
)
return
type
(
self
)
==
type
(
other
)
and
self
.
inplace
==
other
.
inplace
def
__hash__
(
self
):
def
__hash__
(
self
):
return
hash
(
type
(
self
))
return
hash
(
type
(
self
))
^
hash
(
self
.
inplace
)
def
__str__
(
self
):
def
__str__
(
self
):
return
"SparseBlockGemvSS
"
return
"SparseBlockGemvSS
%
s"
%
(
"{inplace}"
if
self
.
inplace
else
""
)
def
make_node
(
self
,
o
,
W
,
h
,
inputIdx
,
outputIdx
):
def
make_node
(
self
,
o
,
W
,
h
,
inputIdx
,
outputIdx
):
o
=
basic_ops
.
as_cuda_ndarray_variable
(
o
)
o
=
basic_ops
.
as_cuda_ndarray_variable
(
o
)
W
=
basic_ops
.
as_cuda_ndarray_variable
(
W
)
W
=
basic_ops
.
as_cuda_ndarray_variable
(
W
)
h
=
basic_ops
.
as_cuda_ndarray_variable
(
h
)
h
=
basic_ops
.
as_cuda_ndarray_variable
(
h
)
assert
o
.
ndim
==
2
assert
o
.
ndim
==
3
assert
W
.
ndim
==
4
assert
W
.
ndim
==
4
assert
h
.
ndim
==
2
assert
h
.
ndim
==
3
assert
inputIdx
.
ndim
==
1
assert
inputIdx
.
ndim
==
2
assert
outputIdx
.
ndim
==
1
assert
outputIdx
.
ndim
==
2
assert
'int'
in
inputIdx
.
type
.
dtype
assert
'int'
in
inputIdx
.
type
.
dtype
assert
'int'
in
outputIdx
.
type
.
dtype
assert
'int'
in
outputIdx
.
type
.
dtype
...
@@ -101,7 +106,6 @@ class SparseBlockGemvSS(GpuOp):
...
@@ -101,7 +106,6 @@ class SparseBlockGemvSS(GpuOp):
def
c_support_code
(
self
):
def
c_support_code
(
self
):
return
"""
return
"""
// This is NOT batch-ready
__global__ void
__global__ void
SparseBlockGemv_fill_lists(
SparseBlockGemv_fill_lists(
int n,
int n,
...
@@ -109,18 +113,38 @@ const float **inp_list,
...
@@ -109,18 +113,38 @@ const float **inp_list,
float **out_list,
float **out_list,
const float **W_list,
const float **W_list,
const float *W, int W_str_0, int W_str_1,
const float *W, int W_str_0, int W_str_1,
const float *h, int h_str_0,
const float *h, int h_str_0,
int h_str_1,
float *outB, int o_str_0, int o_str_1,
float *outB, int o_str_0, int o_str_1,
int o_str_2,
const npy_intp *iIdx,
const npy_intp *iIdx,
int iI_str_0,
const npy_intp *oIdx
const npy_intp *oIdx
, int oI_str_0
) {
) {
int i = threadIdx.x + blockDim.x * blockIdx.x;
int i = threadIdx.x + blockDim.x * blockIdx.x;
int j = threadIdx.y + blockDim.y * blockIdx.y;
int j = threadIdx.y + blockDim.y * blockIdx.y;
int p = i + j * blockDim.x * gridDim.x;
int b = threadIdx.z + blockDim.z * blockIdx.z;
int p = i + j * blockDim.x * gridDim.x +
b * blockDim.y * gridDim.y * blockDim.x * gridDim.x;
if (p >= n) return;
if (p >= n) return;
inp_list[p] = &h[i * h_str_0];
inp_list[p] = &h[b * h_str_0 + i * h_str_1];
out_list[p] = &outB[i * o_str_0 + j * o_str_1];
out_list[p] = &outB[b * o_str_0 + i * o_str_1 + j * o_str_2];
W_list[p] = &W[iIdx[i] * W_str_0 + oIdx[j] * W_str_1];
W_list[p] = &W[iIdx[b*iI_str_0+i] * W_str_0 +
oIdx[b*oI_str_0+j] * W_str_1];
}
__global__ void
SparseBlockGemv_reduce(
int red_dim,
float *outB, int i_str_0, int i_str_1, int i_str_2, int i_str_3,
float *out, int o_str_0, int o_str_1, int o_str_2
) {
int i = threadIdx.x + blockDim.x * blockIdx.x;
int j = threadIdx.y + blockDim.y * blockIdx.y;
int b = threadIdx.z + blockDim.z * blockIdx.z;
float s = 0.0;
float *oB = &outB[b * i_str_0 + i * i_str_2 + j * i_str_3];
for (int k = 0; k < red_dim; k++) {
s += oB[k * i_str_1];
}
out[b * o_str_0 + i * o_str_1 + j * o_str_2] += s;
}
}
static int SparseBlockGemv_copy(PyArrayObject *a, npy_intp *b) {
static int SparseBlockGemv_copy(PyArrayObject *a, npy_intp *b) {
...
@@ -152,7 +176,6 @@ const npy_intp *oIdx
...
@@ -152,7 +176,6 @@ const npy_intp *oIdx
static npy_intp *
%(n)
s_oIdx;
static npy_intp *
%(n)
s_oIdx;
static size_t
%(n)
s_oIdx_len;
static size_t
%(n)
s_oIdx_len;
// This is batch-ready
static int
%(n)
s_prep(int b, int i, int j, int outsize) {
static int
%(n)
s_prep(int b, int i, int j, int outsize) {
int s = b*i*j;
int s = b*i*j;
if (
%(n)
s_list_len < s) {
if (
%(n)
s_list_len < s) {
...
@@ -187,21 +210,24 @@ const npy_intp *oIdx
...
@@ -187,21 +210,24 @@ const npy_intp *oIdx
o
,
W
,
h
,
inputIdx
,
outputIdx
=
inputs
o
,
W
,
h
,
inputIdx
,
outputIdx
=
inputs
out
=
outputs
[
0
]
out
=
outputs
[
0
]
dd
=
(
o
.
shape
[
0
]
*
h
.
shape
[
0
],)
dd
=
(
o
.
shape
[
0
]
*
o
.
shape
[
1
]
*
h
.
shape
[
1
],)
weightHostB
=
numpy
.
empty
(
dd
,
dtype
=
'intp'
)
weightHostB
=
numpy
.
empty
(
dd
,
dtype
=
'intp'
)
outputHostB
=
numpy
.
empty
(
dd
,
dtype
=
'intp'
)
outputHostB
=
numpy
.
empty
(
dd
,
dtype
=
'intp'
)
inputHostB
=
numpy
.
empty
(
dd
,
dtype
=
'intp'
)
inputHostB
=
numpy
.
empty
(
dd
,
dtype
=
'intp'
)
outputBatched
=
pycuda
.
gpuarray
.
GPUArray
((
h
.
shape
[
0
],
o
.
shape
[
0
],
o
.
shape
[
1
]),
dtype
=
'float32'
)
outputBatched
=
pycuda
.
gpuarray
.
GPUArray
((
h
.
shape
[
0
],
h
.
shape
[
1
],
o
.
shape
[
1
],
o
.
shape
[
2
]),
dtype
=
'float32'
)
k
=
0
k
=
0
for
j
in
range
(
o
.
shape
[
0
]):
for
b
in
range
(
o
.
shape
[
0
]):
out_id
=
outputIdx
[
j
]
for
j
in
range
(
o
.
shape
[
1
]):
for
i
in
range
(
h
.
shape
[
0
]):
out_id
=
outputIdx
[
b
,
j
]
inp_id
=
inputIdx
[
i
]
for
i
in
range
(
h
.
shape
[
1
]):
inp_id
=
inputIdx
[
b
,
i
]
weightHostB
[
k
]
=
W
[
inp_id
,
out_id
]
.
gpudata
weightHostB
[
k
]
=
W
[
inp_id
,
out_id
]
.
gpudata
outputHostB
[
k
]
=
outputBatched
[
i
,
j
]
.
ptr
outputHostB
[
k
]
=
outputBatched
[
b
,
i
,
j
]
.
ptr
inputHostB
[
k
]
=
h
[
i
]
.
gpudata
inputHostB
[
k
]
=
h
[
b
,
i
]
.
gpudata
k
+=
1
k
+=
1
weightB
=
pycuda
.
gpuarray
.
to_gpu
(
weightHostB
)
weightB
=
pycuda
.
gpuarray
.
to_gpu
(
weightHostB
)
...
@@ -215,13 +241,13 @@ const npy_intp *oIdx
...
@@ -215,13 +241,13 @@ const npy_intp *oIdx
lda
=
W
.
strides
[
3
]
lda
=
W
.
strides
[
3
]
gemm_batched
(
tA
,
'n'
,
o
.
shape
[
1
],
1
,
h
.
shape
[
1
],
gemm_batched
(
tA
,
'n'
,
o
.
shape
[
2
],
1
,
h
.
shape
[
2
],
weightB
,
lda
,
inputB
,
h
.
strides
[
0
],
weightB
,
lda
,
inputB
,
h
.
strides
[
1
],
outputB
,
o
.
strides
[
0
],
outputB
,
o
.
strides
[
1
],
beta
=
numpy
.
asarray
(
0.0
,
dtype
=
'float32'
))
beta
=
numpy
.
asarray
(
0.0
,
dtype
=
'float32'
))
outputBatchedG
=
to_cudandarray
(
outputBatched
)
outputBatchedG
=
to_cudandarray
(
outputBatched
)
out
[
0
]
=
o
+
outputBatchedG
.
reduce_sum
([
1
,
0
,
0
])
out
[
0
]
=
o
+
outputBatchedG
.
reduce_sum
([
0
,
1
,
0
,
0
])
def
infer_shape
(
self
,
node
,
input_shapes
):
def
infer_shape
(
self
,
node
,
input_shapes
):
return
[
input_shapes
[
0
]]
return
[
input_shapes
[
0
]]
...
@@ -230,27 +256,34 @@ const npy_intp *oIdx
...
@@ -230,27 +256,34 @@ const npy_intp *oIdx
o
,
W
,
h
,
inputIdx
,
outputIdx
=
inputs
o
,
W
,
h
,
inputIdx
,
outputIdx
=
inputs
out
=
outputs
[
0
]
out
=
outputs
[
0
]
return
"""
if
self
.
inplace
:
if (
%(name)
s_prep(1, // NOT batch-ready
res
=
"""
CudaNdarray_HOST_DIMS(
%(h)
s)[0],
Py_XDECREF(
%(out)
s);
CudaNdarray_HOST_DIMS(
%(o)
s)[0],
%(out)
s =
%(o)
s;
CudaNdarray_HOST_DIMS(
%(o)
s)[1]) == -1) {
Py_INCREF(
%(out)
s);
"""
%
dict
(
out
=
out
,
o
=
o
)
else
:
res
=
"""
if (CudaNdarray_prep_output(&
%(out)
s, 3, CudaNdarray_HOST_DIMS(
%(o)
s)))
{
PyErr_SetString(PyExc_RuntimeError, "Cannot allocate output");
%(fail)
s
}
if (CudaNdarray_CopyFromCudaNdarray(
%(out)
s,
%(o)
s)) {
PyErr_SetString(PyExc_RuntimeError, "Cannot copy data to output");
%(fail)
s
}
"""
%
dict
(
out
=
out
,
o
=
o
,
fail
=
sub
[
'fail'
])
return
res
+
"""
if (
%(name)
s_prep(CudaNdarray_HOST_DIMS(
%(o)
s)[0],
CudaNdarray_HOST_DIMS(
%(h)
s)[1],
CudaNdarray_HOST_DIMS(
%(o)
s)[1],
CudaNdarray_HOST_DIMS(
%(o)
s)[2]) == -1) {
PyErr_SetString(PyExc_RuntimeError,
PyErr_SetString(PyExc_RuntimeError,
"Could not allocate working memory.");
"Could not allocate working memory.");
%(fail)
s
%(fail)
s
}
}
{
// NOT batch-ready
int dims[3];
dims[0] = 1; // This is to facilitate the reduction at the end.
dims[1] = CudaNdarray_HOST_DIMS(
%(o)
s)[0];
dims[2] = CudaNdarray_HOST_DIMS(
%(o)
s)[1];
if (CudaNdarray_prep_output(&
%(out)
s, 3, dims)) {
PyErr_SetString(PyExc_RuntimeError, "Cannot allocate output");
%(fail)
s
}
}
// This is batch-ready
if (SparseBlockGemv_copy(
%(inputIdx)
s,
%(name)
s_iIdx) == -1)
if (SparseBlockGemv_copy(
%(inputIdx)
s,
%(name)
s_iIdx) == -1)
{
%(fail)
s }
{
%(fail)
s }
if (SparseBlockGemv_copy(
%(outputIdx)
s,
%(name)
s_oIdx) == -1)
if (SparseBlockGemv_copy(
%(outputIdx)
s,
%(name)
s_oIdx) == -1)
...
@@ -258,21 +291,23 @@ const npy_intp *oIdx
...
@@ -258,21 +291,23 @@ const npy_intp *oIdx
{ /* Prepare lists for the batch */
{ /* Prepare lists for the batch */
// NOT batch-ready
// NOT batch-ready
dim3 block;
dim3 block;
block.x = CudaNdarray_HOST_DIMS(
%(h)
s)[0];
block.x = CudaNdarray_HOST_DIMS(
%(h)
s)[1];
block.y = CudaNdarray_HOST_DIMS(
%(o)
s)[0];
block.y = CudaNdarray_HOST_DIMS(
%(o)
s)[1];
block.z = CudaNdarray_HOST_DIMS(
%(o)
s)[0]; // batch size
SparseBlockGemv_fill_lists<<<block, 1>>>(
SparseBlockGemv_fill_lists<<<block, 1>>>(
block.x*block.y,
block.x*block.y
*block.z
,
%(name)
s_inp_list,
%(name)
s_inp_list,
%(name)
s_out_list,
%(name)
s_out_list,
%(name)
s_W_list,
%(name)
s_W_list,
CudaNdarray_DEV_DATA(
%(W)
s),
CudaNdarray_DEV_DATA(
%(W)
s),
CudaNdarray_HOST_STRIDES(
%(W)
s)[0], CudaNdarray_HOST_STRIDES(
%(W)
s)[1],
CudaNdarray_HOST_STRIDES(
%(W)
s)[0], CudaNdarray_HOST_STRIDES(
%(W)
s)[1],
CudaNdarray_DEV_DATA(
%(h)
s), CudaNdarray_HOST_STRIDES(
%(h)
s)[0],
CudaNdarray_DEV_DATA(
%(h)
s), CudaNdarray_HOST_STRIDES(
%(h)
s)[0],
CudaNdarray_HOST_STRIDES(
%(h)
s)[1],
%(name)
s_outB,
%(name)
s_outB,
CudaNdarray_HOST_DIMS(
%(o)
s)[0] * CudaNdarray_HOST_DIMS(
%(o)
s)[1],
CudaNdarray_HOST_DIMS(
%(h)
s)[1] * CudaNdarray_HOST_DIMS(
%(o)
s)[1] * CudaNdarray_HOST_DIMS(
%(o)
s)[2],
CudaNdarray_HOST_DIMS(
%(o)
s)[1],
CudaNdarray_HOST_DIMS(
%(o)
s)[1] * CudaNdarray_HOST_DIMS(
%(o)
s)[2],
%(name)
s_iIdx,
CudaNdarray_HOST_DIMS(
%(o)
s)[2],
%(name)
s_oIdx);
%(name)
s_iIdx, PyArray_DIM(
%(inputIdx)
s, 1),
%(name)
s_oIdx, PyArray_DIM(
%(outputIdx)
s, 1));
}
}
{ /* Run SgemmBatched */
{ /* Run SgemmBatched */
float alpha = 1.0;
float alpha = 1.0;
...
@@ -285,50 +320,46 @@ CudaNdarray_HOST_DIMS(%(o)s)[1],
...
@@ -285,50 +320,46 @@ CudaNdarray_HOST_DIMS(%(o)s)[1],
lda = CudaNdarray_HOST_STRIDES(
%(W)
s)[3];
lda = CudaNdarray_HOST_STRIDES(
%(W)
s)[3];
}
}
err = cublasSgemmBatched(handle, transA, CUBLAS_OP_N,
err = cublasSgemmBatched(handle, transA, CUBLAS_OP_N,
CudaNdarray_HOST_DIMS(
%(o)
s)[
1
], 1,
CudaNdarray_HOST_DIMS(
%(o)
s)[
2
], 1,
CudaNdarray_HOST_DIMS(
%(h)
s)[
1
], &alpha,
CudaNdarray_HOST_DIMS(
%(h)
s)[
2
], &alpha,
%(name)
s_W_list, lda,
%(name)
s_inp_list,
%(name)
s_W_list, lda,
%(name)
s_inp_list,
CudaNdarray_HOST_STRIDES(
%(h)
s)[
0
],
CudaNdarray_HOST_STRIDES(
%(h)
s)[
1
],
&beta,
%(name)
s_out_list,
&beta,
%(name)
s_out_list,
CudaNdarray_HOST_STRIDES(
%(o)
s)[0],
CudaNdarray_HOST_STRIDES(
%(o)
s)[1],
CudaNdarray_HOST_DIMS(
%(o)
s)[0] *
CudaNdarray_HOST_DIMS(
%(o)
s)[1] *
CudaNdarray_HOST_DIMS(
%(h)
s)[0]);
CudaNdarray_HOST_DIMS(
%(h)
s)[1] *
CudaNdarray_HOST_DIMS(
%(o)
s)[0]);
if (err != CUBLAS_STATUS_SUCCESS) {
if (err != CUBLAS_STATUS_SUCCESS) {
PyErr_SetString(PyExc_RuntimeError, "SgemmBatched failed");
PyErr_SetString(PyExc_RuntimeError, "SgemmBatched failed");
%(fail)
s
%(fail)
s
}
}
}
}
{ /* Perform final reduction and add biases */
{ /* Perform final reduction and add biases */
CudaNdarray *tmp;
dim3 block;
int p[2];
block.x = CudaNdarray_HOST_DIMS(
%(o)
s)[1];
p[0] = 1;
block.y = CudaNdarray_HOST_DIMS(
%(o)
s)[2];
p[1] = 2;
block.z = CudaNdarray_HOST_DIMS(
%(o)
s)[0];
tmp = (CudaNdarray *)CudaNdarray_new_nd(3);
SparseBlockGemv_reduce<<<block, 1>>>(
if (tmp == NULL) {
%(fail)
s }
CudaNdarray_HOST_DIMS(
%(h)
s)[1],
CudaNdarray_set_dim(tmp, 0, CudaNdarray_HOST_DIMS(
%(h)
s)[0]);
%(name)
s_outB,
CudaNdarray_set_stride(tmp, 0, CudaNdarray_HOST_DIMS(
%(o)
s)[0] *
CudaNdarray_HOST_DIMS(
%(h)
s)[1] *
CudaNdarray_HOST_DIMS(
%(o)
s)[1]);
CudaNdarray_HOST_DIMS(
%(o)
s)[1] *
CudaNdarray_set_dim(tmp, 1, CudaNdarray_HOST_DIMS(
%(o)
s)[0]);
CudaNdarray_HOST_DIMS(
%(o)
s)[2],
CudaNdarray_set_stride(tmp, 1, CudaNdarray_HOST_DIMS(
%(o)
s)[1]);
CudaNdarray_HOST_DIMS(
%(o)
s)[1] *
CudaNdarray_set_dim(tmp, 2, CudaNdarray_HOST_DIMS(
%(o)
s)[1]);
CudaNdarray_HOST_DIMS(
%(o)
s)[2],
CudaNdarray_set_stride(tmp, 2, 1);
CudaNdarray_HOST_DIMS(
%(o)
s)[2],
CudaNdarray_set_device_data(tmp,
%(name)
s_outB, (PyObject *)NULL);
1,
if (CudaNdarray_reduce_sum(
%(out)
s, tmp) ||
CudaNdarray_DEV_DATA(
%(out)
s),
CudaNdarray_dimshuffle(
%(out)
s, 2, p)) {
CudaNdarray_HOST_STRIDES(
%(out)
s)[0],
Py_DECREF(tmp);
CudaNdarray_HOST_STRIDES(
%(out)
s)[1],
%(fail)
s;
CudaNdarray_HOST_STRIDES(
%(out)
s)[2]);
}
Py_DECREF(tmp);
if (CudaNdarray_inplace_add((PyObject *)
%(out)
s, (PyObject *)
%(o)
s) == NULL) {
%(fail)
s;
}
}
}
// And we're done!
// And we're done!
"""
%
dict
(
out
=
out
,
h
=
h
,
o
=
o
,
inputIdx
=
inputIdx
,
outputIdx
=
outputIdx
,
"""
%
dict
(
out
=
out
,
h
=
h
,
o
=
o
,
inputIdx
=
inputIdx
,
outputIdx
=
outputIdx
,
W
=
W
,
fail
=
sub
[
'fail'
],
name
=
nodename
)
W
=
W
,
fail
=
sub
[
'fail'
],
name
=
nodename
)
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
return
(
3
,)
return
(
5
,)
def
grad
(
self
,
inputs
,
grads
):
def
grad
(
self
,
inputs
,
grads
):
o
,
W
,
h
,
inputIdx
,
outputIdx
=
inputs
o
,
W
,
h
,
inputIdx
,
outputIdx
=
inputs
...
@@ -348,7 +379,8 @@ CudaNdarray_HOST_DIMS(%(o)s)[1],
...
@@ -348,7 +379,8 @@ CudaNdarray_HOST_DIMS(%(o)s)[1],
"grad of outputIdx makes no sense"
)]
"grad of outputIdx makes no sense"
)]
sparse_block_gemv_ss
=
SparseBlockGemvSS
()
sparse_block_gemv_ss
=
SparseBlockGemvSS
(
False
)
sparse_block_gemv_ss_inplace
=
SparseBlockGemvSS
(
True
)
class
SparseBlockOuterSS
(
GpuOp
):
class
SparseBlockOuterSS
(
GpuOp
):
...
@@ -385,27 +417,28 @@ class SparseBlockOuterSS(GpuOp):
...
@@ -385,27 +417,28 @@ class SparseBlockOuterSS(GpuOp):
if
not
self
.
inplace
:
if
not
self
.
inplace
:
o
=
o
.
copy
()
o
=
o
.
copy
()
dd
=
(
x
.
shape
[
0
]
*
y
.
shape
[
0
],)
dd
=
(
x
.
shape
[
0
]
*
x
.
shape
[
1
]
*
y
.
shape
[
1
],)
xHostB
=
numpy
.
empty
(
dd
,
dtype
=
'intp'
)
xHostB
=
numpy
.
empty
(
dd
,
dtype
=
'intp'
)
yHostB
=
numpy
.
empty
(
dd
,
dtype
=
'intp'
)
yHostB
=
numpy
.
empty
(
dd
,
dtype
=
'intp'
)
outHostB
=
numpy
.
empty
(
dd
,
dtype
=
'intp'
)
outHostB
=
numpy
.
empty
(
dd
,
dtype
=
'intp'
)
k
=
0
k
=
0
for
j
in
range
(
y
.
shape
[
0
]):
for
b
in
range
(
x
.
shape
[
0
]):
out_id
=
yIdx
[
j
]
for
j
in
range
(
y
.
shape
[
1
]):
for
i
in
range
(
x
.
shape
[
0
]):
out_id
=
yIdx
[
b
,
j
]
inp_id
=
xIdx
[
i
]
for
i
in
range
(
x
.
shape
[
1
]):
inp_id
=
xIdx
[
b
,
i
]
outHostB
[
k
]
=
o
[
inp_id
,
out_id
]
.
gpudata
outHostB
[
k
]
=
o
[
inp_id
,
out_id
]
.
gpudata
xHostB
[
k
]
=
x
[
i
]
.
gpudata
xHostB
[
k
]
=
x
[
b
,
i
]
.
gpudata
yHostB
[
k
]
=
y
[
j
]
.
gpudata
yHostB
[
k
]
=
y
[
b
,
j
]
.
gpudata
k
+=
1
k
+=
1
xB
=
pycuda
.
gpuarray
.
to_gpu
(
xHostB
)
xB
=
pycuda
.
gpuarray
.
to_gpu
(
xHostB
)
yB
=
pycuda
.
gpuarray
.
to_gpu
(
yHostB
)
yB
=
pycuda
.
gpuarray
.
to_gpu
(
yHostB
)
outB
=
pycuda
.
gpuarray
.
to_gpu
(
outHostB
)
outB
=
pycuda
.
gpuarray
.
to_gpu
(
outHostB
)
gemm_batched
(
'n'
,
't'
,
y
.
shape
[
1
],
x
.
shape
[
1
],
1
,
gemm_batched
(
'n'
,
't'
,
y
.
shape
[
2
],
x
.
shape
[
2
],
1
,
yB
,
y
.
strides
[
0
],
xB
,
x
.
strides
[
0
],
yB
,
y
.
strides
[
1
],
xB
,
x
.
strides
[
1
],
outB
,
o
.
strides
[
2
],
outB
,
o
.
strides
[
2
],
alpha
=
alpha
,
beta
=
beta
)
alpha
=
alpha
,
beta
=
beta
)
...
@@ -422,19 +455,22 @@ int n,
...
@@ -422,19 +455,22 @@ int n,
const float **x_list,
const float **x_list,
const float **y_list,
const float **y_list,
float **out_list,
float **out_list,
const float *x, int x_str_0,
const float *x, int x_str_0,
int x_str_1,
const float *y, int y_str_0,
const float *y, int y_str_0,
int y_str_1,
float *out, int o_str_0, int o_str_1,
float *out, int o_str_0, int o_str_1,
const npy_intp *xIdx,
const npy_intp *xIdx,
int xI_str_0,
const npy_intp *yIdx
const npy_intp *yIdx
, int yI_str_0
) {
) {
int i = threadIdx.x + blockDim.x * blockIdx.x;
int i = threadIdx.x + blockDim.x * blockIdx.x;
int j = threadIdx.y + blockDim.y * blockIdx.y;
int j = threadIdx.y + blockDim.y * blockIdx.y;
int p = i + j * blockDim.x * gridDim.x;
int b = threadIdx.z + blockDim.z * blockIdx.z;
int p = i + j * blockDim.x * gridDim.x +
b * blockDim.y * gridDim.y * blockDim.x * gridDim.x;
if (p >= n) return;
if (p >= n) return;
x_list[p] = &x[i * x_str_0];
x_list[p] = &x[b * x_str_0 + i * x_str_1];
y_list[p] = &y[j * y_str_0];
y_list[p] = &y[b * x_str_0 + j * y_str_1];
out_list[p] = &out[xIdx[i] * o_str_0 + yIdx[j] * o_str_1];
out_list[p] = &out[xIdx[b * xI_str_0 + i] * o_str_0 +
yIdx[b * yI_str_0 + j] * o_str_1];
}
}
static int SparseBlockOuter_copy(PyArrayObject *a, npy_intp *b) {
static int SparseBlockOuter_copy(PyArrayObject *a, npy_intp *b) {
...
@@ -464,7 +500,6 @@ static size_t %(n)s_xIdx_len;
...
@@ -464,7 +500,6 @@ static size_t %(n)s_xIdx_len;
static npy_intp *
%(n)
s_yIdx;
static npy_intp *
%(n)
s_yIdx;
static size_t
%(n)
s_yIdx_len;
static size_t
%(n)
s_yIdx_len;
// This is batch-ready
static int
%(n)
s_prep(int b, int i, int j) {
static int
%(n)
s_prep(int b, int i, int j) {
int s = b*i*j;
int s = b*i*j;
if (
%(n)
s_list_len < s) {
if (
%(n)
s_list_len < s) {
...
@@ -515,8 +550,9 @@ if (CudaNdarray_CopyFromCudaNdarray(%(out)s, %(o)s)) {
...
@@ -515,8 +550,9 @@ if (CudaNdarray_CopyFromCudaNdarray(%(out)s, %(o)s)) {
"""
%
dict
(
out
=
out
,
o
=
o
,
fail
=
sub
[
'fail'
])
"""
%
dict
(
out
=
out
,
o
=
o
,
fail
=
sub
[
'fail'
])
return
res
+
"""
return
res
+
"""
if (
%(name)
s_prep(1, CudaNdarray_HOST_DIMS(
%(x)
s)[0],
if (
%(name)
s_prep(CudaNdarray_HOST_DIMS(
%(x)
s)[0],
CudaNdarray_HOST_DIMS(
%(y)
s)[0]) == -1) {
CudaNdarray_HOST_DIMS(
%(x)
s)[1],
CudaNdarray_HOST_DIMS(
%(y)
s)[1]) == -1) {
PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory.");
PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory.");
%(fail)
s
%(fail)
s
}
}
...
@@ -526,29 +562,32 @@ if (SparseBlockOuter_copy(%(yIdx)s, %(name)s_yIdx) == -1)
...
@@ -526,29 +562,32 @@ if (SparseBlockOuter_copy(%(yIdx)s, %(name)s_yIdx) == -1)
{
%(fail)
s }
{
%(fail)
s }
{
{
dim3 block;
dim3 block;
block.x = CudaNdarray_HOST_DIMS(
%(x)
s)[0];
block.x = CudaNdarray_HOST_DIMS(
%(x)
s)[1];
block.y = CudaNdarray_HOST_DIMS(
%(y)
s)[0];
block.y = CudaNdarray_HOST_DIMS(
%(y)
s)[1];
block.z = CudaNdarray_HOST_DIMS(
%(x)
s)[0];
SparseBlockOuter_fill_lists<<<block, 1>>>(
SparseBlockOuter_fill_lists<<<block, 1>>>(
block.x * block.y,
block.x * block.y
* block.z
,
%(name)
s_x_list,
%(name)
s_x_list,
%(name)
s_y_list,
%(name)
s_y_list,
%(name)
s_out_list,
%(name)
s_out_list,
CudaNdarray_DEV_DATA(
%(x)
s), CudaNdarray_HOST_STRIDES(
%(x)
s)[0],
CudaNdarray_DEV_DATA(
%(x)
s), CudaNdarray_HOST_STRIDES(
%(x)
s)[0],
CudaNdarray_HOST_STRIDES(
%(x)
s)[1],
CudaNdarray_DEV_DATA(
%(y)
s), CudaNdarray_HOST_STRIDES(
%(y)
s)[0],
CudaNdarray_DEV_DATA(
%(y)
s), CudaNdarray_HOST_STRIDES(
%(y)
s)[0],
CudaNdarray_HOST_STRIDES(
%(y)
s)[1],
CudaNdarray_DEV_DATA(
%(out)
s),
CudaNdarray_DEV_DATA(
%(out)
s),
CudaNdarray_HOST_STRIDES(
%(out)
s)[0], CudaNdarray_HOST_STRIDES(
%(out)
s)[1],
CudaNdarray_HOST_STRIDES(
%(out)
s)[0], CudaNdarray_HOST_STRIDES(
%(out)
s)[1],
%(name)
s_xIdx,
%(name)
s_xIdx,
PyArray_DIM(
%(xIdx)
s, 1),
%(name)
s_yIdx);
%(name)
s_yIdx
, PyArray_DIM(
%(yIdx)
s, 1)
);
}
}
{
{
cublasStatus_t err;
cublasStatus_t err;
err = cublasSgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_T,
err = cublasSgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_T,
CudaNdarray_HOST_DIMS(
%(y)
s)[
1], CudaNdarray_HOST_DIMS(
%(x)
s)[1
], 1,
CudaNdarray_HOST_DIMS(
%(y)
s)[
2], CudaNdarray_HOST_DIMS(
%(x)
s)[2
], 1,
(float *)PyArray_GETPTR1(
%(alpha)
s, 0),
%(name)
s_y_list,
(float *)PyArray_GETPTR1(
%(alpha)
s, 0),
%(name)
s_y_list,
CudaNdarray_HOST_STRIDES(
%(y)
s)[
0
],
%(name)
s_x_list,
CudaNdarray_HOST_STRIDES(
%(y)
s)[
1
],
%(name)
s_x_list,
CudaNdarray_HOST_STRIDES(
%(x)
s)[
0
], (float *)PyArray_GETPTR1(
%(beta)
s, 0),
CudaNdarray_HOST_STRIDES(
%(x)
s)[
1
], (float *)PyArray_GETPTR1(
%(beta)
s, 0),
%(name)
s_out_list, CudaNdarray_HOST_STRIDES(
%(out)
s)[2],
%(name)
s_out_list, CudaNdarray_HOST_STRIDES(
%(out)
s)[2],
CudaNdarray_HOST_DIMS(
%(x)
s)[0] * CudaNdarray_HOST_DIMS(
%(y)
s)[0]);
CudaNdarray_HOST_DIMS(
%(x)
s)[0] *
CudaNdarray_HOST_DIMS(
%(x)
s)[1] *
CudaNdarray_HOST_DIMS(
%(y)
s)[1]);
if (err != CUBLAS_STATUS_SUCCESS) {
if (err != CUBLAS_STATUS_SUCCESS) {
PyErr_SetString(PyExc_RuntimeError, "SgemmBatched failed");
PyErr_SetString(PyExc_RuntimeError, "SgemmBatched failed");
%(fail)
s
%(fail)
s
...
@@ -557,7 +596,7 @@ CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
...
@@ -557,7 +596,7 @@ CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
alpha
=
alpha
,
beta
=
beta
,
fail
=
sub
[
'fail'
])
alpha
=
alpha
,
beta
=
beta
,
fail
=
sub
[
'fail'
])
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
return
(
1
,)
return
(
2
,)
sparse_block_outer_ss
=
SparseBlockOuterSS
(
False
)
sparse_block_outer_ss
=
SparseBlockOuterSS
(
False
)
...
@@ -565,6 +604,12 @@ sparse_block_outer_ss_inplace = SparseBlockOuterSS(True)
...
@@ -565,6 +604,12 @@ sparse_block_outer_ss_inplace = SparseBlockOuterSS(True)
if
cuda_available
:
if
cuda_available
:
@opt.register_opt
()
@opt.local_optimizer
([
sparse_block_gemv_ss
],
inplace
=
True
)
def
local_inplace_blocksparse_gemv
(
node
):
if
node
.
op
==
sparse_block_gemv_ss
:
return
[
sparse_block_gemv_ss_inplace
(
*
node
.
inputs
)]
@opt.register_opt
()
@opt.register_opt
()
@opt.local_optimizer
([
sparse_block_outer_ss
],
inplace
=
True
)
@opt.local_optimizer
([
sparse_block_outer_ss
],
inplace
=
True
)
def
local_inplace_blocksparse_outer
(
node
):
def
local_inplace_blocksparse_outer
(
node
):
...
...
theano/sandbox/cuda/tests/test_blocksparse.py
浏览文件 @
42f4cb3e
...
@@ -28,6 +28,9 @@ else:
...
@@ -28,6 +28,9 @@ else:
mode_with_gpu
=
theano
.
compile
.
mode
.
get_default_mode
()
.
including
(
'gpu'
)
mode_with_gpu
=
theano
.
compile
.
mode
.
get_default_mode
()
.
including
(
'gpu'
)
def
setup
():
utt
.
seed_rng
()
def
blocksparse_data
():
def
blocksparse_data
():
nInputBlock
=
128
nInputBlock
=
128
nOutputBlock
=
64
nOutputBlock
=
64
...
@@ -35,10 +38,11 @@ def blocksparse_data():
...
@@ -35,10 +38,11 @@ def blocksparse_data():
outputSize
=
30
outputSize
=
30
inputWindowSize
=
7
inputWindowSize
=
7
outputWindowSize
=
9
outputWindowSize
=
9
batchSize
=
4
input
=
randn
(
inputWindowSize
,
inputSize
)
.
astype
(
'float32'
)
input
=
randn
(
batchSize
,
inputWindowSize
,
inputSize
)
.
astype
(
'float32'
)
inputIndice
=
numpy
.
random
.
permutation
(
nInputBlock
)[:
inputWindowSize
]
inputIndice
=
numpy
.
vstack
(
numpy
.
random
.
permutation
(
nInputBlock
)[:
inputWindowSize
]
for
_
in
range
(
batchSize
))
outputIndice
=
numpy
.
random
.
permutation
(
nOutputBlock
)[:
outputWindowSize
]
outputIndice
=
numpy
.
vstack
(
numpy
.
random
.
permutation
(
nOutputBlock
)[:
outputWindowSize
]
for
_
in
range
(
batchSize
))
weight
=
randn
(
nInputBlock
,
nOutputBlock
,
inputSize
,
outputSize
)
.
astype
(
'float32'
)
weight
=
randn
(
nInputBlock
,
nOutputBlock
,
inputSize
,
outputSize
)
.
astype
(
'float32'
)
bias
=
randn
(
nOutputBlock
,
outputSize
)
.
astype
(
'float32'
)
bias
=
randn
(
nOutputBlock
,
outputSize
)
.
astype
(
'float32'
)
...
@@ -47,24 +51,24 @@ def blocksparse_data():
...
@@ -47,24 +51,24 @@ def blocksparse_data():
def
blocksparse
(
W
,
h
,
iIdx
,
b
,
oIdx
):
def
blocksparse
(
W
,
h
,
iIdx
,
b
,
oIdx
):
o
=
b
.
take
(
oIdx
,
axis
=
0
)
o
=
b
.
take
(
oIdx
,
axis
=
0
)
for
j
in
range
(
o
.
shape
[
0
]):
for
b
in
range
(
o
.
shape
[
0
]):
outputIdx
=
oIdx
[
j
]
for
j
in
range
(
o
.
shape
[
1
]):
outputIdx
=
oIdx
[
b
,
j
]
for
i
in
range
(
h
.
shape
[
0
]):
for
i
in
range
(
h
.
shape
[
1
]):
inputIdx
=
iIdx
[
i
]
inputIdx
=
iIdx
[
b
,
i
]
w
=
W
[
inputIdx
,
outputIdx
]
w
=
W
[
inputIdx
,
outputIdx
]
# this below is a gemv I think
# this below is a gemv I think
o
[
j
,
:]
+=
numpy
.
dot
(
h
[
i
],
w
)
o
[
b
,
j
,
:]
+=
numpy
.
dot
(
h
[
b
,
i
],
w
)
return
o
return
o
def
test_blocksparse
():
def
test_blocksparse
():
b
=
tensor
.
fmatrix
()
b
=
tensor
.
fmatrix
()
W
=
tensor
.
ftensor4
()
W
=
tensor
.
ftensor4
()
h
=
tensor
.
f
matrix
()
h
=
tensor
.
f
tensor3
()
iIdx
=
tensor
.
l
vector
()
iIdx
=
tensor
.
l
matrix
()
oIdx
=
tensor
.
l
vector
()
oIdx
=
tensor
.
l
matrix
()
o
=
sparse_block_dot_SS
(
W
,
h
,
iIdx
,
b
,
oIdx
)
o
=
sparse_block_dot_SS
(
W
,
h
,
iIdx
,
b
,
oIdx
)
...
@@ -77,14 +81,16 @@ def test_blocksparse():
...
@@ -77,14 +81,16 @@ def test_blocksparse():
utt
.
assert_allclose
(
ref_out
,
th_out
)
utt
.
assert_allclose
(
ref_out
,
th_out
)
test_blocksparse
.
setup
=
setup
# test the fortan order for W (which can happen in the grad for some graphs).
# test the fortan order for W (which can happen in the grad for some graphs).
def
test_blocksparseF
():
def
test_blocksparseF
():
b
=
tensor
.
fmatrix
()
b
=
tensor
.
fmatrix
()
W
=
tensor
.
ftensor4
()
W
=
tensor
.
ftensor4
()
h
=
tensor
.
f
matrix
()
h
=
tensor
.
f
tensor3
()
iIdx
=
tensor
.
l
vector
()
iIdx
=
tensor
.
l
matrix
()
oIdx
=
tensor
.
l
vector
()
oIdx
=
tensor
.
l
matrix
()
o
=
sparse_block_dot_SS
(
GpuDimShuffle
((
False
,
False
,
False
,
False
),
o
=
sparse_block_dot_SS
(
GpuDimShuffle
((
False
,
False
,
False
,
False
),
(
0
,
1
,
3
,
2
))(
(
0
,
1
,
3
,
2
))(
...
@@ -102,9 +108,9 @@ def test_blocksparseF():
...
@@ -102,9 +108,9 @@ def test_blocksparseF():
def
test_blocksparse_grad
():
def
test_blocksparse_grad
():
h_val
=
randn
(
2
,
3
)
.
astype
(
'float32'
)
h_val
=
randn
(
1
,
2
,
3
)
.
astype
(
'float32'
)
iIdx_val
=
numpy
.
random
.
permutation
(
3
)[:
2
]
iIdx_val
=
numpy
.
random
.
permutation
(
3
)[:
2
]
[
None
,
:]
oIdx_val
=
numpy
.
random
.
permutation
(
3
)[:
2
]
oIdx_val
=
numpy
.
random
.
permutation
(
3
)[:
2
]
[
None
,
:]
W_val
=
randn
(
3
,
3
,
3
,
4
)
.
astype
(
'float32'
)
W_val
=
randn
(
3
,
3
,
3
,
4
)
.
astype
(
'float32'
)
b_val
=
randn
(
3
,
4
)
.
astype
(
'float32'
)
b_val
=
randn
(
3
,
4
)
.
astype
(
'float32'
)
...
@@ -120,9 +126,9 @@ def test_blocksparse_grad():
...
@@ -120,9 +126,9 @@ def test_blocksparse_grad():
def
test_blocksparse_grad_shape
():
def
test_blocksparse_grad_shape
():
b
=
tensor
.
fmatrix
()
b
=
tensor
.
fmatrix
()
W
=
tensor
.
ftensor4
()
W
=
tensor
.
ftensor4
()
h
=
tensor
.
f
matrix
()
h
=
tensor
.
f
tensor3
()
iIdx
=
tensor
.
l
vector
()
iIdx
=
tensor
.
l
matrix
()
oIdx
=
tensor
.
l
vector
()
oIdx
=
tensor
.
l
matrix
()
o
=
sparse_block_gemv_ss
(
b
.
take
(
oIdx
,
axis
=
0
),
W
,
h
,
iIdx
,
oIdx
)
o
=
sparse_block_gemv_ss
(
b
.
take
(
oIdx
,
axis
=
0
),
W
,
h
,
iIdx
,
oIdx
)
go
=
theano
.
grad
(
o
.
sum
(),
[
b
,
W
,
h
])
go
=
theano
.
grad
(
o
.
sum
(),
[
b
,
W
,
h
])
...
@@ -141,9 +147,9 @@ def test_blocksparse_grad_shape():
...
@@ -141,9 +147,9 @@ def test_blocksparse_grad_shape():
def
test_blocksparse_grad_merge
():
def
test_blocksparse_grad_merge
():
b
=
tensor
.
fmatrix
()
b
=
tensor
.
fmatrix
()
h
=
tensor
.
f
matrix
()
h
=
tensor
.
f
tensor3
()
iIdx
=
tensor
.
l
vector
()
iIdx
=
tensor
.
l
matrix
()
oIdx
=
tensor
.
l
vector
()
oIdx
=
tensor
.
l
matrix
()
W_val
,
h_val
,
iIdx_val
,
b_val
,
oIdx_val
=
blocksparse_data
()
W_val
,
h_val
,
iIdx_val
,
b_val
,
oIdx_val
=
blocksparse_data
()
W
=
float32_shared_constructor
(
W_val
)
W
=
float32_shared_constructor
(
W_val
)
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论