Commit 42f4cb3e authored by Arnaud Bergeron

Add batch support to blocksparse.

Parent 47d59687
@@ -74,24 +74,29 @@ def ger(alpha, x, y, A):

 class SparseBlockGemvSS(GpuOp):
     def __init__(self, inplace=False):
         self.inplace = inplace
+        if self.inplace:
+            self.destroy_map = {0: [0]}

     def __eq__(self, other):
-        return type(self) == type(other)
+        return type(self) == type(other) and self.inplace == other.inplace

     def __hash__(self):
-        return hash(type(self))
+        return hash(type(self)) ^ hash(self.inplace)

     def __str__(self):
-        return "SparseBlockGemvSS"
+        return "SparseBlockGemvSS%s" % ("{inplace}" if self.inplace else "")

     def make_node(self, o, W, h, inputIdx, outputIdx):
         o = basic_ops.as_cuda_ndarray_variable(o)
         W = basic_ops.as_cuda_ndarray_variable(W)
         h = basic_ops.as_cuda_ndarray_variable(h)
-        assert o.ndim == 2
+        assert o.ndim == 3
         assert W.ndim == 4
-        assert h.ndim == 2
-        assert inputIdx.ndim == 1
-        assert outputIdx.ndim == 1
+        assert h.ndim == 3
+        assert inputIdx.ndim == 2
+        assert outputIdx.ndim == 2
         assert 'int' in inputIdx.type.dtype
         assert 'int' in outputIdx.type.dtype
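With the batch dimension added, the shape contract checked by make_node becomes: o is (batchSize, outputWindowSize, outputSize), h is (batchSize, inputWindowSize, inputSize), W stays (nInputBlock, nOutputBlock, inputSize, outputSize), and the index inputs become one row of block indices per sample. A minimal numpy sketch of these shapes (the concrete sizes are illustrative, not taken from the diff):

    import numpy

    batchSize, iWin, oWin = 4, 7, 9
    nInBlock, nOutBlock, iSize, oSize = 128, 64, 40, 30

    o = numpy.zeros((batchSize, oWin, oSize), dtype='float32')        # ndim == 3
    W = numpy.zeros((nInBlock, nOutBlock, iSize, oSize), dtype='float32')
    h = numpy.zeros((batchSize, iWin, iSize), dtype='float32')        # ndim == 3
    inputIdx = numpy.zeros((batchSize, iWin), dtype='int64')          # ndim == 2
    outputIdx = numpy.zeros((batchSize, oWin), dtype='int64')         # ndim == 2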
@@ -101,7 +106,6 @@ class SparseBlockGemvSS(GpuOp):

     def c_support_code(self):
         return """
-// This is NOT batch-ready
 __global__ void
 SparseBlockGemv_fill_lists(
 int n,
@@ -109,18 +113,38 @@ const float **inp_list,
 float **out_list,
 const float **W_list,
 const float *W, int W_str_0, int W_str_1,
-const float *h, int h_str_0,
-float *outB, int o_str_0, int o_str_1,
-const npy_intp *iIdx,
-const npy_intp *oIdx
+const float *h, int h_str_0, int h_str_1,
+float *outB, int o_str_0, int o_str_1, int o_str_2,
+const npy_intp *iIdx, int iI_str_0,
+const npy_intp *oIdx, int oI_str_0
 ) {
   int i = threadIdx.x + blockDim.x * blockIdx.x;
   int j = threadIdx.y + blockDim.y * blockIdx.y;
-  int p = i + j * blockDim.x * gridDim.x;
+  int b = threadIdx.z + blockDim.z * blockIdx.z;
+  int p = i + j * blockDim.x * gridDim.x +
+          b * blockDim.y * gridDim.y * blockDim.x * gridDim.x;
   if (p >= n) return;
-  inp_list[p] = &h[i * h_str_0];
-  out_list[p] = &outB[i * o_str_0 + j * o_str_1];
-  W_list[p] = &W[iIdx[i] * W_str_0 + oIdx[j] * W_str_1];
+  inp_list[p] = &h[b * h_str_0 + i * h_str_1];
+  out_list[p] = &outB[b * o_str_0 + i * o_str_1 + j * o_str_2];
+  W_list[p] = &W[iIdx[b*iI_str_0+i] * W_str_0 +
+                 oIdx[b*oI_str_0+j] * W_str_1];
 }
+
+__global__ void
+SparseBlockGemv_reduce(
+int red_dim,
+float *outB, int i_str_0, int i_str_1, int i_str_2, int i_str_3,
+float *out, int o_str_0, int o_str_1, int o_str_2
+) {
+  int i = threadIdx.x + blockDim.x * blockIdx.x;
+  int j = threadIdx.y + blockDim.y * blockIdx.y;
+  int b = threadIdx.z + blockDim.z * blockIdx.z;
+  float s = 0.0;
+  float *oB = &outB[b * i_str_0 + i * i_str_2 + j * i_str_3];
+  for (int k = 0; k < red_dim; k++) {
+    s += oB[k * i_str_1];
+  }
+  out[b * o_str_0 + i * o_str_1 + j * o_str_2] += s;
+}

 static int SparseBlockGemv_copy(PyArrayObject *a, npy_intp *b) {
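Each thread of the batched SparseBlockGemv_fill_lists handles one (input block, output block, sample) triple and writes the three operand pointers that the matching cublasSgemmBatched entry will consume. A host-side sketch of the same enumeration, with symbolic tuples standing in for device addresses (illustrative only, not the shipped code):

    def fill_lists(batchSize, iWin, oWin, iIdx, oIdx):
        # p runs in the same order as the kernel's flattening:
        # p = i + j * iWin + b * iWin * oWin.
        inp_list, out_list, W_list = [], [], []
        for b in range(batchSize):
            for j in range(oWin):
                for i in range(iWin):
                    inp_list.append(('h', b, i))                  # &h[b*h_str_0 + i*h_str_1]
                    out_list.append(('outB', b, i, j))            # &outB[b, i, j, 0]
                    W_list.append(('W', iIdx[b, i], oIdx[b, j]))  # selected weight block
        return inp_list, out_list, W_list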
@@ -152,7 +176,6 @@ const npy_intp *oIdx
 static npy_intp *%(n)s_oIdx;
 static size_t %(n)s_oIdx_len;

-// This is batch-ready
 static int %(n)s_prep(int b, int i, int j, int outsize) {
   int s = b*i*j;
   if (%(n)s_list_len < s) {
@@ -187,22 +210,25 @@ const npy_intp *oIdx
         o, W, h, inputIdx, outputIdx = inputs
         out = outputs[0]

-        dd = (o.shape[0] * h.shape[0],)
+        dd = (o.shape[0] * o.shape[1] * h.shape[1],)
         weightHostB = numpy.empty(dd, dtype='intp')
         outputHostB = numpy.empty(dd, dtype='intp')
         inputHostB = numpy.empty(dd, dtype='intp')
-        outputBatched = pycuda.gpuarray.GPUArray((h.shape[0], o.shape[0], o.shape[1]), dtype='float32')
+        outputBatched = pycuda.gpuarray.GPUArray((h.shape[0], h.shape[1],
+                                                  o.shape[1], o.shape[2]),
+                                                 dtype='float32')
         k = 0
-        for j in range(o.shape[0]):
-            out_id = outputIdx[j]
-            for i in range(h.shape[0]):
-                inp_id = inputIdx[i]
-                weightHostB[k] = W[inp_id, out_id].gpudata
-                outputHostB[k] = outputBatched[i, j].ptr
-                inputHostB[k] = h[i].gpudata
-                k += 1
+        for b in range(o.shape[0]):
+            for j in range(o.shape[1]):
+                out_id = outputIdx[b, j]
+                for i in range(h.shape[1]):
+                    inp_id = inputIdx[b, i]
+                    weightHostB[k] = W[inp_id, out_id].gpudata
+                    outputHostB[k] = outputBatched[b, i, j].ptr
+                    inputHostB[k] = h[b, i].gpudata
+                    k += 1
         weightB = pycuda.gpuarray.to_gpu(weightHostB)
         inputB = pycuda.gpuarray.to_gpu(inputHostB)
@@ -215,13 +241,13 @@ const npy_intp *oIdx
             lda = W.strides[3]

-        gemm_batched(tA, 'n', o.shape[1], 1, h.shape[1],
-                     weightB, lda, inputB, h.strides[0],
-                     outputB, o.strides[0],
+        gemm_batched(tA, 'n', o.shape[2], 1, h.shape[2],
+                     weightB, lda, inputB, h.strides[1],
+                     outputB, o.strides[1],
                      beta=numpy.asarray(0.0, dtype='float32'))

         outputBatchedG = to_cudandarray(outputBatched)
-        out[0] = o + outputBatchedG.reduce_sum([1, 0, 0])
+        out[0] = o + outputBatchedG.reduce_sum([0, 1, 0, 0])

     def infer_shape(self, node, input_shapes):
         return [input_shapes[0]]
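In numpy terms, the pycuda perform path computes, for every sample b and output block j, the sum over input blocks i of a small gemv against the weight block selected by the index matrices, then adds o. A reference sketch (it mirrors the Op's result, not its batched-gemm implementation):

    def sparse_block_gemv_ref(o, W, h, inputIdx, outputIdx):
        out = o.copy()
        for b in range(o.shape[0]):
            for j in range(o.shape[1]):
                for i in range(h.shape[1]):
                    w = W[inputIdx[b, i], outputIdx[b, j]]
                    out[b, j] += numpy.dot(h[b, i], w)  # one 1 x inputSize gemv
        return out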
@@ -230,27 +256,34 @@ const npy_intp *oIdx
         o, W, h, inputIdx, outputIdx = inputs
         out = outputs[0]

-        return """
-if (%(name)s_prep(1, // NOT batch-ready
-                  CudaNdarray_HOST_DIMS(%(h)s)[0],
-                  CudaNdarray_HOST_DIMS(%(o)s)[0],
-                  CudaNdarray_HOST_DIMS(%(o)s)[1]) == -1) {
+        if self.inplace:
+            res = """
+Py_XDECREF(%(out)s);
+%(out)s = %(o)s;
+Py_INCREF(%(out)s);
+""" % dict(out=out, o=o)
+        else:
+            res = """
+if (CudaNdarray_prep_output(&%(out)s, 3, CudaNdarray_HOST_DIMS(%(o)s)))
+{
+  PyErr_SetString(PyExc_RuntimeError, "Cannot allocate output");
+  %(fail)s
+}
+if (CudaNdarray_CopyFromCudaNdarray(%(out)s, %(o)s)) {
+  PyErr_SetString(PyExc_RuntimeError, "Cannot copy data to output");
+  %(fail)s
+}
+""" % dict(out=out, o=o, fail=sub['fail'])
+        return res + """
+if (%(name)s_prep(CudaNdarray_HOST_DIMS(%(o)s)[0],
+                  CudaNdarray_HOST_DIMS(%(h)s)[1],
+                  CudaNdarray_HOST_DIMS(%(o)s)[1],
+                  CudaNdarray_HOST_DIMS(%(o)s)[2]) == -1) {
   PyErr_SetString(PyExc_RuntimeError,
                   "Could not allocate working memory.");
   %(fail)s
 }
-{
-  // NOT batch-ready
-  int dims[3];
-  dims[0] = 1; // This is to facilitate the reduction at the end.
-  dims[1] = CudaNdarray_HOST_DIMS(%(o)s)[0];
-  dims[2] = CudaNdarray_HOST_DIMS(%(o)s)[1];
-  if (CudaNdarray_prep_output(&%(out)s, 3, dims)) {
-    PyErr_SetString(PyExc_RuntimeError, "Cannot allocate output");
-    %(fail)s
-  }
-}
-// This is batch-ready
 if (SparseBlockGemv_copy(%(inputIdx)s, %(name)s_iIdx) == -1)
 { %(fail)s }
 if (SparseBlockGemv_copy(%(outputIdx)s, %(name)s_oIdx) == -1)
@@ -258,21 +291,23 @@ const npy_intp *oIdx
 { %(fail)s }

 { /* Prepare lists for the batch */
-  // NOT batch-ready
   dim3 block;
-  block.x = CudaNdarray_HOST_DIMS(%(h)s)[0];
-  block.y = CudaNdarray_HOST_DIMS(%(o)s)[0];
+  block.x = CudaNdarray_HOST_DIMS(%(h)s)[1];
+  block.y = CudaNdarray_HOST_DIMS(%(o)s)[1];
+  block.z = CudaNdarray_HOST_DIMS(%(o)s)[0]; // batch size
   SparseBlockGemv_fill_lists<<<block, 1>>>(
-    block.x*block.y,
+    block.x*block.y*block.z,
     %(name)s_inp_list,
     %(name)s_out_list,
     %(name)s_W_list,
     CudaNdarray_DEV_DATA(%(W)s),
     CudaNdarray_HOST_STRIDES(%(W)s)[0], CudaNdarray_HOST_STRIDES(%(W)s)[1],
-    CudaNdarray_DEV_DATA(%(h)s), CudaNdarray_HOST_STRIDES(%(h)s)[0],
+    CudaNdarray_DEV_DATA(%(h)s), CudaNdarray_HOST_STRIDES(%(h)s)[0], CudaNdarray_HOST_STRIDES(%(h)s)[1],
     %(name)s_outB,
-    CudaNdarray_HOST_DIMS(%(o)s)[0] * CudaNdarray_HOST_DIMS(%(o)s)[1],
-    CudaNdarray_HOST_DIMS(%(o)s)[1],
-    %(name)s_iIdx,
-    %(name)s_oIdx);
+    CudaNdarray_HOST_DIMS(%(h)s)[1] * CudaNdarray_HOST_DIMS(%(o)s)[1] * CudaNdarray_HOST_DIMS(%(o)s)[2],
+    CudaNdarray_HOST_DIMS(%(o)s)[1] * CudaNdarray_HOST_DIMS(%(o)s)[2],
+    CudaNdarray_HOST_DIMS(%(o)s)[2],
+    %(name)s_iIdx, PyArray_DIM(%(inputIdx)s, 1),
+    %(name)s_oIdx, PyArray_DIM(%(outputIdx)s, 1));
 }

 { /* Run SgemmBatched */
   float alpha = 1.0;
@@ -285,50 +320,46 @@ CudaNdarray_HOST_DIMS(%(o)s)[1],
     lda = CudaNdarray_HOST_STRIDES(%(W)s)[3];
   }
   err = cublasSgemmBatched(handle, transA, CUBLAS_OP_N,
-                           CudaNdarray_HOST_DIMS(%(o)s)[1], 1,
-                           CudaNdarray_HOST_DIMS(%(h)s)[1], &alpha,
+                           CudaNdarray_HOST_DIMS(%(o)s)[2], 1,
+                           CudaNdarray_HOST_DIMS(%(h)s)[2], &alpha,
                            %(name)s_W_list, lda, %(name)s_inp_list,
-                           CudaNdarray_HOST_STRIDES(%(h)s)[0],
+                           CudaNdarray_HOST_STRIDES(%(h)s)[1],
                            &beta, %(name)s_out_list,
-                           CudaNdarray_HOST_STRIDES(%(o)s)[0],
-                           CudaNdarray_HOST_DIMS(%(o)s)[0] *
-                           CudaNdarray_HOST_DIMS(%(h)s)[0]);
+                           CudaNdarray_HOST_STRIDES(%(o)s)[1],
+                           CudaNdarray_HOST_DIMS(%(o)s)[1] *
+                           CudaNdarray_HOST_DIMS(%(h)s)[1] *
+                           CudaNdarray_HOST_DIMS(%(o)s)[0]);
   if (err != CUBLAS_STATUS_SUCCESS) {
     PyErr_SetString(PyExc_RuntimeError, "SgemmBatched failed");
     %(fail)s
   }
 }
 { /* Perform final reduction and add biases */
-  CudaNdarray *tmp;
-  int p[2];
-  p[0] = 1;
-  p[1] = 2;
-  tmp = (CudaNdarray *)CudaNdarray_new_nd(3);
-  if (tmp == NULL) { %(fail)s }
-  CudaNdarray_set_dim(tmp, 0, CudaNdarray_HOST_DIMS(%(h)s)[0]);
-  CudaNdarray_set_stride(tmp, 0, CudaNdarray_HOST_DIMS(%(o)s)[0] *
-                                 CudaNdarray_HOST_DIMS(%(o)s)[1]);
-  CudaNdarray_set_dim(tmp, 1, CudaNdarray_HOST_DIMS(%(o)s)[0]);
-  CudaNdarray_set_stride(tmp, 1, CudaNdarray_HOST_DIMS(%(o)s)[1]);
-  CudaNdarray_set_dim(tmp, 2, CudaNdarray_HOST_DIMS(%(o)s)[1]);
-  CudaNdarray_set_stride(tmp, 2, 1);
-  CudaNdarray_set_device_data(tmp, %(name)s_outB, (PyObject *)NULL);
-  if (CudaNdarray_reduce_sum(%(out)s, tmp) ||
-      CudaNdarray_dimshuffle(%(out)s, 2, p)) {
-    Py_DECREF(tmp);
-    %(fail)s;
-  }
-  Py_DECREF(tmp);
-  if (CudaNdarray_inplace_add((PyObject *)%(out)s, (PyObject *)%(o)s) == NULL) {
-    %(fail)s;
-  }
+  dim3 block;
+  block.x = CudaNdarray_HOST_DIMS(%(o)s)[1];
+  block.y = CudaNdarray_HOST_DIMS(%(o)s)[2];
+  block.z = CudaNdarray_HOST_DIMS(%(o)s)[0];
+  SparseBlockGemv_reduce<<<block, 1>>>(
+    CudaNdarray_HOST_DIMS(%(h)s)[1],
+    %(name)s_outB,
+    CudaNdarray_HOST_DIMS(%(h)s)[1] *
+    CudaNdarray_HOST_DIMS(%(o)s)[1] *
+    CudaNdarray_HOST_DIMS(%(o)s)[2],
+    CudaNdarray_HOST_DIMS(%(o)s)[1] *
+    CudaNdarray_HOST_DIMS(%(o)s)[2],
+    CudaNdarray_HOST_DIMS(%(o)s)[2],
+    1,
+    CudaNdarray_DEV_DATA(%(out)s),
+    CudaNdarray_HOST_STRIDES(%(out)s)[0],
+    CudaNdarray_HOST_STRIDES(%(out)s)[1],
+    CudaNdarray_HOST_STRIDES(%(out)s)[2]);
 }
 // And we're done!
 """ % dict(out=out, h=h, o=o, inputIdx=inputIdx, outputIdx=outputIdx,
            W=W, fail=sub['fail'], name=nodename)

     def c_code_cache_version(self):
-        return (3,)
+        return (5,)

     def grad(self, inputs, grads):
         o, W, h, inputIdx, outputIdx = inputs
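The new SparseBlockGemv_reduce kernel replaces the old reduce_sum/dimshuffle round trip: outB is laid out C-contiguously as (batchSize, inputWindowSize, outputWindowSize, outputSize), and the kernel accumulates its sum over the input-window axis directly into out, which already holds a copy of o. Equivalent numpy, as a sanity sketch under that layout assumption (sizes illustrative):

    outB = numpy.random.rand(4, 7, 9, 30).astype('float32')  # (batch, iWin, oWin, oSize)
    out = numpy.zeros((4, 9, 30), dtype='float32')           # starts as a copy of o
    out += outB.sum(axis=1)  # the k-loop over red_dim == iWin in the kernel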
@@ -348,7 +379,8 @@ CudaNdarray_HOST_DIMS(%(o)s)[1],
                 "grad of outputIdx makes no sense")]


-sparse_block_gemv_ss = SparseBlockGemvSS()
+sparse_block_gemv_ss = SparseBlockGemvSS(False)
+sparse_block_gemv_ss_inplace = SparseBlockGemvSS(True)


 class SparseBlockOuterSS(GpuOp):
@@ -385,27 +417,28 @@ class SparseBlockOuterSS(GpuOp):
         if not self.inplace:
             o = o.copy()

-        dd = (x.shape[0] * y.shape[0],)
+        dd = (x.shape[0] * x.shape[1] * y.shape[1],)
         xHostB = numpy.empty(dd, dtype='intp')
         yHostB = numpy.empty(dd, dtype='intp')
         outHostB = numpy.empty(dd, dtype='intp')
         k = 0
-        for j in range(y.shape[0]):
-            out_id = yIdx[j]
-            for i in range(x.shape[0]):
-                inp_id = xIdx[i]
-                outHostB[k] = o[inp_id, out_id].gpudata
-                xHostB[k] = x[i].gpudata
-                yHostB[k] = y[j].gpudata
-                k += 1
+        for b in range(x.shape[0]):
+            for j in range(y.shape[1]):
+                out_id = yIdx[b, j]
+                for i in range(x.shape[1]):
+                    inp_id = xIdx[b, i]
+                    outHostB[k] = o[inp_id, out_id].gpudata
+                    xHostB[k] = x[b, i].gpudata
+                    yHostB[k] = y[b, j].gpudata
+                    k += 1
         xB = pycuda.gpuarray.to_gpu(xHostB)
         yB = pycuda.gpuarray.to_gpu(yHostB)
         outB = pycuda.gpuarray.to_gpu(outHostB)

-        gemm_batched('n', 't', y.shape[1], x.shape[1], 1,
-                     yB, y.strides[0], xB, x.strides[0],
+        gemm_batched('n', 't', y.shape[2], x.shape[2], 1,
+                     yB, y.strides[1], xB, x.strides[1],
                      outB, o.strides[2],
                      alpha=alpha, beta=beta)
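The batched outer product updates the weight block picked by (xIdx[b, i], yIdx[b, j]) with a rank-1 term for every sample and index pair. A numpy sketch of the accumulation (beta handling is left out for clarity; the Op folds alpha and beta into gemm_batched):

    def sparse_block_outer_ref(o, x, y, xIdx, yIdx, alpha=1.0):
        out = o.copy()
        for b in range(x.shape[0]):
            for j in range(y.shape[1]):
                for i in range(x.shape[1]):
                    out[xIdx[b, i], yIdx[b, j]] += alpha * numpy.outer(x[b, i], y[b, j])
        return out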
@@ -422,19 +455,22 @@ int n,
 const float **x_list,
 const float **y_list,
 float **out_list,
-const float *x, int x_str_0,
-const float *y, int y_str_0,
+const float *x, int x_str_0, int x_str_1,
+const float *y, int y_str_0, int y_str_1,
 float *out, int o_str_0, int o_str_1,
-const npy_intp *xIdx,
-const npy_intp *yIdx
+const npy_intp *xIdx, int xI_str_0,
+const npy_intp *yIdx, int yI_str_0
 ) {
   int i = threadIdx.x + blockDim.x * blockIdx.x;
   int j = threadIdx.y + blockDim.y * blockIdx.y;
-  int p = i + j * blockDim.x * gridDim.x;
+  int b = threadIdx.z + blockDim.z * blockIdx.z;
+  int p = i + j * blockDim.x * gridDim.x +
+          b * blockDim.y * gridDim.y * blockDim.x * gridDim.x;
   if (p >= n) return;
-  x_list[p] = &x[i * x_str_0];
-  y_list[p] = &y[j * y_str_0];
-  out_list[p] = &out[xIdx[i] * o_str_0 + yIdx[j] * o_str_1];
+  x_list[p] = &x[b * x_str_0 + i * x_str_1];
+  y_list[p] = &y[b * y_str_0 + j * y_str_1];
+  out_list[p] = &out[xIdx[b * xI_str_0 + i] * o_str_0 +
+                     yIdx[b * yI_str_0 + j] * o_str_1];
 }

 static int SparseBlockOuter_copy(PyArrayObject *a, npy_intp *b) {
@@ -464,7 +500,6 @@ static size_t %(n)s_xIdx_len;
 static npy_intp *%(n)s_yIdx;
 static size_t %(n)s_yIdx_len;

-// This is batch-ready
 static int %(n)s_prep(int b, int i, int j) {
   int s = b*i*j;
   if (%(n)s_list_len < s) {
@@ -515,8 +550,9 @@ if (CudaNdarray_CopyFromCudaNdarray(%(out)s, %(o)s)) {
 """ % dict(out=out, o=o, fail=sub['fail'])
         return res + """
-if (%(name)s_prep(1, CudaNdarray_HOST_DIMS(%(x)s)[0],
-                  CudaNdarray_HOST_DIMS(%(y)s)[0]) == -1) {
+if (%(name)s_prep(CudaNdarray_HOST_DIMS(%(x)s)[0],
+                  CudaNdarray_HOST_DIMS(%(x)s)[1],
+                  CudaNdarray_HOST_DIMS(%(y)s)[1]) == -1) {
   PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory.");
   %(fail)s
 }
@@ -526,29 +562,32 @@ if (SparseBlockOuter_copy(%(yIdx)s, %(name)s_yIdx) == -1)
 { %(fail)s }

 {
   dim3 block;
-  block.x = CudaNdarray_HOST_DIMS(%(x)s)[0];
-  block.y = CudaNdarray_HOST_DIMS(%(y)s)[0];
+  block.x = CudaNdarray_HOST_DIMS(%(x)s)[1];
+  block.y = CudaNdarray_HOST_DIMS(%(y)s)[1];
+  block.z = CudaNdarray_HOST_DIMS(%(x)s)[0];
   SparseBlockOuter_fill_lists<<<block, 1>>>(
-    block.x * block.y,
+    block.x * block.y * block.z,
     %(name)s_x_list,
     %(name)s_y_list,
     %(name)s_out_list,
-    CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_HOST_STRIDES(%(x)s)[0],
-    CudaNdarray_DEV_DATA(%(y)s), CudaNdarray_HOST_STRIDES(%(y)s)[0],
+    CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_HOST_STRIDES(%(x)s)[0], CudaNdarray_HOST_STRIDES(%(x)s)[1],
+    CudaNdarray_DEV_DATA(%(y)s), CudaNdarray_HOST_STRIDES(%(y)s)[0], CudaNdarray_HOST_STRIDES(%(y)s)[1],
     CudaNdarray_DEV_DATA(%(out)s),
     CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
-    %(name)s_xIdx,
-    %(name)s_yIdx);
+    %(name)s_xIdx, PyArray_DIM(%(xIdx)s, 1),
+    %(name)s_yIdx, PyArray_DIM(%(yIdx)s, 1));
 }
 {
   cublasStatus_t err;
   err = cublasSgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_T,
-    CudaNdarray_HOST_DIMS(%(y)s)[1], CudaNdarray_HOST_DIMS(%(x)s)[1], 1,
+    CudaNdarray_HOST_DIMS(%(y)s)[2], CudaNdarray_HOST_DIMS(%(x)s)[2], 1,
     (float *)PyArray_GETPTR1(%(alpha)s, 0), %(name)s_y_list,
-    CudaNdarray_HOST_STRIDES(%(y)s)[0], %(name)s_x_list,
-    CudaNdarray_HOST_STRIDES(%(x)s)[0], (float *)PyArray_GETPTR1(%(beta)s, 0),
+    CudaNdarray_HOST_STRIDES(%(y)s)[1], %(name)s_x_list,
+    CudaNdarray_HOST_STRIDES(%(x)s)[1], (float *)PyArray_GETPTR1(%(beta)s, 0),
     %(name)s_out_list, CudaNdarray_HOST_STRIDES(%(out)s)[2],
-    CudaNdarray_HOST_DIMS(%(x)s)[0] * CudaNdarray_HOST_DIMS(%(y)s)[0]);
+    CudaNdarray_HOST_DIMS(%(x)s)[0] *
+    CudaNdarray_HOST_DIMS(%(x)s)[1] *
+    CudaNdarray_HOST_DIMS(%(y)s)[1]);
   if (err != CUBLAS_STATUS_SUCCESS) {
     PyErr_SetString(PyExc_RuntimeError, "SgemmBatched failed");
     %(fail)s
@@ -557,7 +596,7 @@ CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
            alpha=alpha, beta=beta, fail=sub['fail'])

     def c_code_cache_version(self):
-        return (1,)
+        return (2,)


 sparse_block_outer_ss = SparseBlockOuterSS(False)
@@ -565,6 +604,12 @@ sparse_block_outer_ss_inplace = SparseBlockOuterSS(True)

 if cuda_available:
+    @opt.register_opt()
+    @opt.local_optimizer([sparse_block_gemv_ss], inplace=True)
+    def local_inplace_blocksparse_gemv(node):
+        if node.op == sparse_block_gemv_ss:
+            return [sparse_block_gemv_ss_inplace(*node.inputs)]
+
     @opt.register_opt()
     @opt.local_optimizer([sparse_block_outer_ss], inplace=True)
     def local_inplace_blocksparse_outer(node):
......
@@ -28,6 +28,9 @@ else:
     mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')


+def setup():
+    utt.seed_rng()
+

 def blocksparse_data():
     nInputBlock = 128
     nOutputBlock = 64
@@ -35,10 +38,11 @@ def blocksparse_data():
     outputSize = 30
     inputWindowSize = 7
     outputWindowSize = 9
+    batchSize = 4

-    input = randn(inputWindowSize, inputSize).astype('float32')
-    inputIndice = numpy.random.permutation(nInputBlock)[:inputWindowSize]
-    outputIndice = numpy.random.permutation(nOutputBlock)[:outputWindowSize]
+    input = randn(batchSize, inputWindowSize, inputSize).astype('float32')
+    inputIndice = numpy.vstack([numpy.random.permutation(nInputBlock)[:inputWindowSize]
+                                for _ in range(batchSize)])
+    outputIndice = numpy.vstack([numpy.random.permutation(nOutputBlock)[:outputWindowSize]
+                                 for _ in range(batchSize)])
     weight = randn(nInputBlock, nOutputBlock, inputSize, outputSize).astype('float32')
     bias = randn(nOutputBlock, outputSize).astype('float32')
@@ -47,24 +51,24 @@ def blocksparse_data():
 def blocksparse(W, h, iIdx, b, oIdx):
     o = b.take(oIdx, axis=0)

-    for j in range(o.shape[0]):
-        outputIdx = oIdx[j]
-        for i in range(h.shape[0]):
-            inputIdx = iIdx[i]
-            w = W[inputIdx, outputIdx]
-            # this below is a gemv I think
-            o[j, :] += numpy.dot(h[i], w)
+    for b in range(o.shape[0]):
+        for j in range(o.shape[1]):
+            outputIdx = oIdx[b, j]
+            for i in range(h.shape[1]):
+                inputIdx = iIdx[b, i]
+                w = W[inputIdx, outputIdx]
+                # this below is a gemv I think
+                o[b, j, :] += numpy.dot(h[b, i], w)
     return o
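The same reference can be written without the Python loops; a vectorized sketch using advanced indexing and einsum (the function name is illustrative, the result matches the loop version above):

    def blocksparse_vec(W, h, iIdx, b, oIdx):
        o = b.take(oIdx, axis=0)                      # (batch, oWin, oSize)
        # Wsel[b, i, j] = W[iIdx[b, i], oIdx[b, j]]   -> (batch, iWin, oWin, iSize, oSize)
        Wsel = W[iIdx[:, :, None], oIdx[:, None, :]]
        return o + numpy.einsum('bik,bijkl->bjl', h, Wsel)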

 def test_blocksparse():
     b = tensor.fmatrix()
     W = tensor.ftensor4()
-    h = tensor.fmatrix()
-    iIdx = tensor.lvector()
-    oIdx = tensor.lvector()
+    h = tensor.ftensor3()
+    iIdx = tensor.lmatrix()
+    oIdx = tensor.lmatrix()

     o = sparse_block_dot_SS(W, h, iIdx, b, oIdx)
@@ -77,14 +81,16 @@ def test_blocksparse():
     utt.assert_allclose(ref_out, th_out)

+test_blocksparse.setup = setup


 # Test the fortran order for W (which can happen in the grad for some graphs).
 def test_blocksparseF():
     b = tensor.fmatrix()
     W = tensor.ftensor4()
-    h = tensor.fmatrix()
-    iIdx = tensor.lvector()
-    oIdx = tensor.lvector()
+    h = tensor.ftensor3()
+    iIdx = tensor.lmatrix()
+    oIdx = tensor.lmatrix()

     o = sparse_block_dot_SS(GpuDimShuffle((False, False, False, False),
                             (0, 1, 3, 2))(
@@ -102,9 +108,9 @@ def test_blocksparseF():

 def test_blocksparse_grad():
-    h_val = randn(2, 3).astype('float32')
-    iIdx_val = numpy.random.permutation(3)[:2]
-    oIdx_val = numpy.random.permutation(3)[:2]
+    h_val = randn(1, 2, 3).astype('float32')
+    iIdx_val = numpy.random.permutation(3)[:2][None, :]
+    oIdx_val = numpy.random.permutation(3)[:2][None, :]
     W_val = randn(3, 3, 3, 4).astype('float32')
     b_val = randn(3, 4).astype('float32')
@@ -120,9 +126,9 @@ def test_blocksparse_grad():

 def test_blocksparse_grad_shape():
     b = tensor.fmatrix()
     W = tensor.ftensor4()
-    h = tensor.fmatrix()
-    iIdx = tensor.lvector()
-    oIdx = tensor.lvector()
+    h = tensor.ftensor3()
+    iIdx = tensor.lmatrix()
+    oIdx = tensor.lmatrix()

     o = sparse_block_gemv_ss(b.take(oIdx, axis=0), W, h, iIdx, oIdx)
     go = theano.grad(o.sum(), [b, W, h])
@@ -141,9 +147,9 @@ def test_blocksparse_grad_shape():

 def test_blocksparse_grad_merge():
     b = tensor.fmatrix()
-    h = tensor.fmatrix()
-    iIdx = tensor.lvector()
-    oIdx = tensor.lvector()
+    h = tensor.ftensor3()
+    iIdx = tensor.lmatrix()
+    oIdx = tensor.lmatrix()

     W_val, h_val, iIdx_val, b_val, oIdx_val = blocksparse_data()
     W = float32_shared_constructor(W_val)
......