Commit ed244b6b authored by Arnaud Bergeron

C code that uses SgemmBatched and a kernel to initialize the pointer lists.

Parent c774e32e
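The change replaces a per-block loop of `sgemv` calls with a single `cublasSgemmBatched` launch: a small kernel first fills three device arrays of pointers, one entry per (input block, output block) pair, and cuBLAS then runs all the small GEMMs at once. Pointer arrays are what make this work for blocks scattered in memory, which is exactly the sparse-block case. Below is a minimal standalone sketch of that pattern; the names, sizes, and launch layout are illustrative and not taken from this commit, and error handling is reduced to a status code.

```cuda
#include <cublas_v2.h>
#include <cuda_runtime.h>

// One thread per batch entry: point each slot at its operand slice.
__global__ void fill_ptrs(int n, const float **Al, const float **Bl, float **Cl,
                          const float *A, const float *B, float *C,
                          int strA, int strB, int strC)
{
    int p = threadIdx.x + blockDim.x * blockIdx.x;
    if (p >= n) return;
    Al[p] = A + p * strA;
    Bl[p] = B + p * strB;
    Cl[p] = C + p * strC;
}

// Batch of GEMVs expressed as (m x k) * (k x 1) GEMMs, operands contiguous.
int run_batched(cublasHandle_t handle, int batch, int m, int k,
                const float *A, const float *B, float *C)
{
    const float **Al; const float **Bl; float **Cl;
    if (cudaMalloc(&Al, batch * sizeof(float *)) != cudaSuccess) return -1;
    if (cudaMalloc(&Bl, batch * sizeof(float *)) != cudaSuccess) return -1;
    if (cudaMalloc(&Cl, batch * sizeof(float *)) != cudaSuccess) return -1;

    fill_ptrs<<<(batch + 255) / 256, 256>>>(batch, Al, Bl, Cl,
                                            A, B, C, m * k, k, m);

    float alpha = 1.0f, beta = 0.0f;
    // n == 1 turns each GEMM into a GEMV; lda/ldb/ldc are the
    // column-major leading dimensions of the contiguous operands.
    cublasStatus_t err =
        cublasSgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                           m, 1, k, &alpha,
                           Al, m, Bl, k, &beta, Cl, m, batch);
    cudaFree(Al); cudaFree(Bl); cudaFree(Cl);
    return err == CUBLAS_STATUS_SUCCESS ? 0 : -1;
}
```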
@@ -70,19 +70,14 @@ def ger(alpha, x, y, A):

 class SparseBlockGemvSS(GpuOp):
-    def __init__(self, inplace):
-        self.inplace = inplace
-        if self.inplace:
-            self.destroy_map = {0: [0]}
-
     def __eq__(self, other):
-        return type(self) == type(other) and self.inplace == other.inplace
+        return type(self) == type(other)

     def __hash__(self):
-        return hash(type(self)) ^ hash(self.inplace)
+        return hash(type(self))

     def __str__(self):
-        return "SparseBlockGemvSS%s" % ("{inplace}" if self.inplace else "")
+        return "SparseBlockGemvSS"

     def make_node(self, o, W, h, inputIdx, outputIdx):
         o = basic_ops.as_cuda_ndarray_variable(o)
@@ -100,13 +95,92 @@ class SparseBlockGemvSS(GpuOp):
         return Apply(self, [o, W, h, inputIdx, outputIdx],
                      [o.type()])

+    def c_support_code(self):
+        return """
+        // This is NOT batch-ready
+        __global__ void
+        SparseBlockGemv_fill_lists(
+                int n,
+                const float **inp_list,
+                float **out_list,
+                const float **W_list,
+                const float *W, int W_str_0, int W_str_1,
+                const float *h, int h_str_0,
+                float *outB, int o_str_0, int o_str_1,
+                const npy_intp *iIdx,
+                const npy_intp *oIdx
+        ) {
+            int i = threadIdx.x + blockDim.x * blockIdx.x;
+            int j = threadIdx.y + blockDim.y * blockIdx.y;
+            int p = i + j * blockDim.x * gridDim.x;
+            if (p >= n) return;
+            inp_list[p] = &h[i * h_str_0];
+            out_list[p] = &outB[i * o_str_0 + j * o_str_1];
+            W_list[p] = &W[iIdx[i] * W_str_0 + oIdx[j] * W_str_1];
+        }
+
+        static int SparseBlockGemv_copy(PyArrayObject *a, npy_intp *b) {
+            cudaError_t err;
+            PyArrayObject *aa = (PyArrayObject *)PyArray_Cast(a, NPY_INTP);
+            if (aa == NULL) { return -1; }
+            err = cudaMemcpy(b, PyArray_DATA(aa), PyArray_NBYTES(aa),
+                             cudaMemcpyHostToDevice);
+            Py_DECREF(aa);
+            if (err != cudaSuccess) {
+                PyErr_SetString(PyExc_RuntimeError, "Cannot copy index data to GPU");
+                return -1;
+            }
+            return 0;
+        }
+        """
+    def c_support_code_apply(self, node, nodename):
+        return """
+        /* Statics are initialized with 0 */
+        static float *%(n)s_outB;
+        static size_t %(n)s_outB_size;
+        static const float **%(n)s_inp_list;
+        static float **%(n)s_out_list;
+        static const float **%(n)s_W_list;
+        static size_t %(n)s_list_len;
+        static npy_intp *%(n)s_iIdx;
+        static size_t %(n)s_iIdx_len;
+        static npy_intp *%(n)s_oIdx;
+        static size_t %(n)s_oIdx_len;
+
+        // This is batch-ready
+        static int %(n)s_prep(int b, int i, int j, int outsize) {
+            int s = b*i*j;
+            if (%(n)s_list_len < s) {
+                cudaFree(%(n)s_inp_list);
+                cudaFree(%(n)s_out_list);
+                cudaFree(%(n)s_W_list);
+                if (cudaMalloc(&%(n)s_inp_list, s*sizeof(float *)) != cudaSuccess) return -1;
+                if (cudaMalloc(&%(n)s_out_list, s*sizeof(float *)) != cudaSuccess) return -1;
+                if (cudaMalloc(&%(n)s_W_list, s*sizeof(float *)) != cudaSuccess) return -1;
+                %(n)s_list_len = s;
+            }
+            if (%(n)s_outB_size < s*outsize) {
+                cudaFree(%(n)s_outB);
+                if (cudaMalloc(&%(n)s_outB, s*outsize*sizeof(float)) != cudaSuccess) return -1;
+                %(n)s_outB_size = s*outsize;
+            }
+            if (%(n)s_iIdx_len < b*i) {
+                cudaFree(%(n)s_iIdx);
+                if (cudaMalloc(&%(n)s_iIdx, b*i*sizeof(npy_intp)) != cudaSuccess) return -1;
+                %(n)s_iIdx_len = b*i;
+            }
+            if (%(n)s_oIdx_len < b*j) {
+                cudaFree(%(n)s_oIdx);
+                if (cudaMalloc(&%(n)s_oIdx, b*j*sizeof(npy_intp)) != cudaSuccess) return -1;
+                %(n)s_oIdx_len = b*j;
+            }
+            return 0;
+        }
+        """ % dict(n=nodename)
     def perform(self, node, inputs, outputs):
         o, W, h, inputIdx, outputIdx = inputs
         out = outputs[0]
-        if not self.inplace:
-            o = o.copy()
         dd = (o.shape[0] * h.shape[0],)
         weightHostB = numpy.empty(dd, dtype='intp')
         outputHostB = numpy.empty(dd, dtype='intp')
@@ -141,80 +215,111 @@ class SparseBlockGemvSS(GpuOp):
                                beta=numpy.asarray(0.0, dtype='float32'))
         outputBatchedG = to_cudandarray(outputBatched)
-        o += outputBatchedG.reduce_sum([1, 0, 0])
-        out[0] = o
+        out[0] = o + outputBatchedG.reduce_sum([1, 0, 0])
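Both the Python path and the C path compute the same quantity. Reading the shapes and strides off the code, with `W` of shape `(inputBlocks, outputBlocks, inputSize, outputSize)`, the op is (a hedged reconstruction, not spelled out in the commit):

```math
\mathrm{out}_j = o_j + \sum_i W_{\mathrm{iIdx}[i],\,\mathrm{oIdx}[j]}^{\mathsf{T}} \, h_i
```

where `h_i` is the `i`-th input block and the sum over `i` is the `reduce_sum` over the leading axis of the batched output.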
     def c_code(self, node, nodename, inputs, outputs, sub):
         o, W, h, inputIdx, outputIdx = inputs
         out = outputs[0]
-        res = None
-        if self.inplace:
-            res = """
-        Py_XDECREF(%(out)s);
-        %(out)s = %(o)s;
-        Py_INCREF(%(out)s);
-        """ % dict(out=out, o=o)
-        else:
-            res = """
-        if (CudaNdarray_prep_output(&%(out)s, 2, CudaNdarray_HOST_DIMS(%(o)s)))
-        {
-            PyErr_SetString(PyExc_RuntimeError, "Cannot allocate output");
-            %(fail)s
-        }
-        if (CudaNdarray_CopyFromCudaNdarray(%(out)s, %(o)s)) {
-            PyErr_SetString(PyExc_RuntimeError, "Cannot copy data to output");
-            %(fail)s
-        }
-        """ % dict(out=out, o=o, fail=sub['fail'])
-        return res + """
-        {
-        CudaNdarray *W_part = (CudaNdarray *)CudaNdarray_new_nd(2);
-        CudaNdarray *h_part = (CudaNdarray *)CudaNdarray_new_nd(1);
-        CudaNdarray *out_part = (CudaNdarray *)CudaNdarray_new_nd(1);
-        if (W_part == NULL || h_part == NULL || out_part == NULL) {
-            Py_XDECREF(W_part);
-            Py_XDECREF(h_part);
-            Py_XDECREF(out_part);
-            %(fail)s
-        }
-        CudaNdarray_set_dim(W_part, 0, CudaNdarray_HOST_DIMS(%(W)s)[3]);
-        CudaNdarray_set_stride(W_part, 0, CudaNdarray_HOST_STRIDES(%(W)s)[3]);
-        CudaNdarray_set_dim(W_part, 1, CudaNdarray_HOST_DIMS(%(W)s)[2]);
-        CudaNdarray_set_stride(W_part, 1, CudaNdarray_HOST_STRIDES(%(W)s)[2]);
-        CudaNdarray_set_dim(h_part, 0, CudaNdarray_HOST_DIMS(%(h)s)[1]);
-        CudaNdarray_set_stride(h_part, 0, CudaNdarray_HOST_STRIDES(%(h)s)[1]);
-        CudaNdarray_set_dim(out_part, 0, CudaNdarray_HOST_DIMS(%(out)s)[1]);
-        CudaNdarray_set_stride(out_part, 0, CudaNdarray_HOST_STRIDES(%(out)s)[1]);
-        for (int j = 0; j < CudaNdarray_HOST_DIMS(%(o)s)[0]; j++) {
-            npy_intp out_id = *(dtype_%(outputIdx)s *)PyArray_GETPTR1(%(outputIdx)s, j);
-            CudaNdarray_set_device_data(out_part, CudaNdarray_DEV_DATA(%(out)s) +
-                CudaNdarray_HOST_STRIDES(%(out)s)[0] * j, %(out)s);
-            for (int i = 0; i < CudaNdarray_HOST_DIMS(%(h)s)[0]; i++) {
-                npy_intp inp_id = *(dtype_%(inputIdx)s *)PyArray_GETPTR1(%(inputIdx)s, i);
-                CudaNdarray_set_device_data(h_part, CudaNdarray_DEV_DATA(%(h)s) +
-                    CudaNdarray_HOST_STRIDES(%(h)s)[0] * i, %(h)s);
-                CudaNdarray_set_device_data(W_part, CudaNdarray_DEV_DATA(%(W)s) +
-                    (CudaNdarray_HOST_STRIDES(%(W)s)[0] * inp_id) +
-                    (CudaNdarray_HOST_STRIDES(%(W)s)[1] * out_id), %(W)s);
-                if (CudaNdarray_sgemv(1.0f, W_part, h_part, 1.0f, out_part)) {
-                    %(fail)s
-                }
-            }
-        }
-        Py_DECREF(W_part);
-        Py_DECREF(h_part);
-        Py_DECREF(out_part);
+        return """
+        if (%(name)s_prep(1, // NOT batch-ready
+                          CudaNdarray_HOST_DIMS(%(h)s)[0],
+                          CudaNdarray_HOST_DIMS(%(o)s)[0],
+                          CudaNdarray_HOST_DIMS(%(o)s)[1]) == -1) {
+            PyErr_SetString(PyExc_RuntimeError,
+                            "Could not allocate working memory.");
+            %(fail)s
+        }
+        // NOT batch-ready
+        int dims[3];
+        dims[0] = 1; // This is to facilitate the reduction at the end.
+        dims[1] = CudaNdarray_HOST_DIMS(%(o)s)[0];
+        dims[2] = CudaNdarray_HOST_DIMS(%(o)s)[1];
+        if (CudaNdarray_prep_output(&%(out)s, 3, dims)) {
+            PyErr_SetString(PyExc_RuntimeError, "Cannot allocate output");
+            %(fail)s
+        }
+        // This is batch-ready
+        if (SparseBlockGemv_copy(%(inputIdx)s, %(name)s_iIdx) == -1)
+        { %(fail)s }
+        if (SparseBlockGemv_copy(%(outputIdx)s, %(name)s_oIdx) == -1)
+        { %(fail)s }
+        { /* Prepare lists for the batch */
+            // NOT batch-ready
+            dim3 block;
+            block.x = CudaNdarray_HOST_DIMS(%(h)s)[0];
+            block.y = CudaNdarray_HOST_DIMS(%(o)s)[0];
+            SparseBlockGemv_fill_lists<<<block, 1>>>(
+                block.x*block.y,
+                %(name)s_inp_list,
+                %(name)s_out_list,
+                %(name)s_W_list,
+                CudaNdarray_DEV_DATA(%(W)s),
+                CudaNdarray_HOST_STRIDES(%(W)s)[0], CudaNdarray_HOST_STRIDES(%(W)s)[1],
+                CudaNdarray_DEV_DATA(%(h)s), CudaNdarray_HOST_STRIDES(%(h)s)[0],
+                %(name)s_outB,
+                CudaNdarray_HOST_DIMS(%(o)s)[0] * CudaNdarray_HOST_DIMS(%(o)s)[1],
+                CudaNdarray_HOST_DIMS(%(o)s)[1],
+                %(name)s_iIdx,
+                %(name)s_oIdx);
+        }
+        { /* Run SgemmBatched */
+            float alpha = 1.0;
+            float beta = 0.0;
+            cublasStatus_t err;
+            cublasOperation_t transA = CUBLAS_OP_N;
+            int lda = CudaNdarray_HOST_STRIDES(%(W)s)[2];
+            if (lda == 1) {
+                transA = CUBLAS_OP_T;
+                lda = CudaNdarray_HOST_STRIDES(%(W)s)[3];
+            }
+            err = cublasSgemmBatched(handle, transA, CUBLAS_OP_N,
+                                     CudaNdarray_HOST_DIMS(%(o)s)[1], 1,
+                                     CudaNdarray_HOST_DIMS(%(h)s)[1], &alpha,
+                                     %(name)s_W_list, lda, %(name)s_inp_list,
+                                     CudaNdarray_HOST_STRIDES(%(h)s)[0],
+                                     &beta, %(name)s_out_list,
+                                     CudaNdarray_HOST_STRIDES(%(o)s)[0],
+                                     CudaNdarray_HOST_DIMS(%(o)s)[0] *
+                                     CudaNdarray_HOST_DIMS(%(h)s)[0]);
+            if (err != CUBLAS_STATUS_SUCCESS) {
+                PyErr_SetString(PyExc_RuntimeError, "SgemmBatched failed");
+                %(fail)s
+            }
+        }
+        { /* Perform final reduction and add biases */
+            CudaNdarray *tmp;
+            int p[2];
+            p[0] = 1;
+            p[1] = 2;
+            tmp = (CudaNdarray *)CudaNdarray_new_nd(3);
+            if (tmp == NULL) { %(fail)s }
+            CudaNdarray_set_dim(tmp, 0, CudaNdarray_HOST_DIMS(%(h)s)[0]);
+            CudaNdarray_set_stride(tmp, 0, CudaNdarray_HOST_DIMS(%(o)s)[0] *
+                                           CudaNdarray_HOST_DIMS(%(o)s)[1]);
+            CudaNdarray_set_dim(tmp, 1, CudaNdarray_HOST_DIMS(%(o)s)[0]);
+            CudaNdarray_set_stride(tmp, 1, CudaNdarray_HOST_DIMS(%(o)s)[1]);
+            CudaNdarray_set_dim(tmp, 2, CudaNdarray_HOST_DIMS(%(o)s)[1]);
+            CudaNdarray_set_stride(tmp, 2, 1);
+            CudaNdarray_set_device_data(tmp, %(name)s_outB, (PyObject *)NULL);
+            if (CudaNdarray_reduce_sum(%(out)s, tmp) ||
+                CudaNdarray_dimshuffle(%(out)s, 2, p)) {
+                Py_DECREF(tmp);
+                %(fail)s;
+            }
+            Py_DECREF(tmp);
+            if (CudaNdarray_inplace_add((PyObject *)%(out)s, (PyObject *)%(o)s) == NULL) {
+                %(fail)s;
+            }
+        }
+        // And we're done!
         """ % dict(out=out, h=h, o=o, inputIdx=inputIdx, outputIdx=outputIdx,
-                   W=W, fail=sub['fail'])
+                   W=W, fail=sub['fail'], name=nodename)

     def c_code_cache_version(self):
-        return (1,)
+        return (2,)

     def grad(self, inputs, grads):
         o, W, h, inputIdx, outputIdx = inputs
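The `Run SgemmBatched` block maps every (input block, output block) pair onto a GEMM with `m = outputSize`, `n = 1`, `k = inputSize`. cuBLAS is column-major, so whether a weight block stored as `(inputSize, outputSize)` can be passed as-is or must be transposed depends on which axis has stride 1; the no-transpose path implicitly assumes the `outputSize` axis is contiguous. The inline test restated as a helper, where `pick_layout`, `str_i`, and `str_o` are illustrative names, not part of the commit:

```cuda
#include <cublas_v2.h>

/* str_i / str_o: element strides of W's inputSize and outputSize axes. */
static void pick_layout(int str_i, int str_o,
                        cublasOperation_t *transA, int *lda)
{
    if (str_i == 1) {
        /* inputSize axis contiguous: in column-major terms the block is a
         * (k x m) matrix, so ask cuBLAS for its transpose. */
        *transA = CUBLAS_OP_T;
        *lda = str_o;
    } else {
        /* outputSize axis contiguous: the block already reads as (m x k). */
        *transA = CUBLAS_OP_N;
        *lda = str_i;
    }
}
```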
@@ -234,8 +339,7 @@ class SparseBlockGemvSS(GpuOp):
                 "grad of outputIdx makes no sense")]

-sparse_block_gemv_ss = SparseBlockGemvSS(False)
-sparse_block_gemv_ss_inplace = SparseBlockGemvSS(True)
+sparse_block_gemv_ss = SparseBlockGemvSS()

 class SparseBlockOuterSS(GpuOp):
@@ -17,7 +17,6 @@ from theano.sandbox.cuda.basic_ops import (GpuDimShuffle,
                                            as_cuda_ndarray_variable)
 from theano.sandbox.cuda.blocksparse import (sparse_block_dot_SS,
                                              sparse_block_gemv_ss,
-                                             sparse_block_gemv_ss_inplace,
                                              sparse_block_outer_ss,
                                              sparse_block_outer_ss_inplace)
@@ -136,26 +135,3 @@ def test_blocksparse_grad_shape():
     assert b_g.shape == b_val.shape
     assert h_g.shape == h_val.shape
     assert W_g.shape == W_val.shape
-
-class TestBlockSparseDot(TestCase, utt.TestOptimizationMixin):
-    def test_opt_inplace(self):
-        b = tensor.fmatrix()
-        W = tensor.ftensor4()
-        h = tensor.fmatrix()
-        iIdx = tensor.lvector()
-        oIdx = tensor.lvector()
-
-        o = sparse_block_dot_SS(W, h, iIdx, b, oIdx)
-
-        f = theano.function([W, h, iIdx, b, oIdx], o, mode=mode_with_gpu)
-
-        self.assertFunctionContains0(f, sparse_block_gemv_ss)
-        self.assertFunctionContains1(f, sparse_block_gemv_ss_inplace)
-
-        gW = theano.grad(o.sum(), [W])
-        f = theano.function([W, h, iIdx, b, oIdx], gW, mode=mode_with_gpu)
-
-        self.assertFunctionContains0(f, sparse_block_outer_ss)
-        self.assertFunctionContains1(f, sparse_block_outer_ss_inplace)