提交 eb4d52aa authored 作者: Arnaud Bergeron's avatar Arnaud Bergeron

Fix most of the problems in blocksparse.

上级 7479d045
#section support_code_apply
/*
 * Batched block-sparse matrix-vector product.
 *
 * One gemv is issued per (b, i, j) triple, where b runs over dimension 0
 * of the output, i over dimension 1 of `h` and j over dimension 1 of the
 * output.  For each triple:
 *   - the matrix operand is the block of `W` addressed by
 *     inputIdx[b, i] along W's first axis and outputIdx[b, j] along its
 *     second axis,
 *   - the input vector is h[b, i],
 *   - the destination vector is out[b, j].
 * Both scalar factors handed to the batched BLAS routine are 1.
 *
 * With INPLACE defined the destination is `o` itself.  Otherwise it is the
 * array returned by theano_try_copy(*_out, o) and `o` is only read.
 *
 * *_out is updated as soon as the destination array is known, so it always
 * refers to the reference this function currently owns, including when the
 * function fails later on.
 *
 * Only GA_FLOAT and GA_DOUBLE destinations are supported.
 * The index values are used as-is; they are not range-checked here.
 *
 * Returns 0 on success, -1 with a Python exception set on failure.
 */
int APPLY_SPECIFIC(blockgemv)(PyGpuArrayObject *o, PyGpuArrayObject *W,
                              PyGpuArrayObject *h, PyArrayObject *inputIdx,
                              PyArrayObject *outputIdx,
                              PyGpuArrayObject **_out,
                              PyGpuContextObject *ctx) {
  PyGpuArrayObject *out = *_out;

#ifdef INPLACE
  Py_XDECREF(out);
  out = o;
  Py_INCREF(out);
  /* Publish immediately: the previous *_out has just been released. */
  *_out = out;
#else
  out = theano_try_copy(out, o);
  /* NOTE(review): this assumes theano_try_copy has already released the
   * previous buffer when it returns NULL or a different array -- confirm
   * against its definition.  Under that assumption *_out must be replaced
   * right away so no stale pointer survives an early return. */
  *_out = out;
  if (out == NULL) {
    // Error already set
    return -1;
  }
#endif

  gpuarray_blas_ops *blas_ops;
  int err;

  err = ctx->ops->property(ctx->ctx, NULL, NULL,
                           GA_CTX_PROP_BLAS_OPS, &blas_ops);
  if (err != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "Can't get blas ops");
    return -1;
  }
  err = blas_ops->setup(ctx->ctx);
  if (err != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "Can't setup blas");
    return -1;
  }

  /* Prepare lists for the batch */
  const size_t maxi = PyGpuArray_DIMS(h)[1];
  const size_t maxj = PyGpuArray_DIMS(out)[1];
  const size_t maxb = PyGpuArray_DIMS(out)[0];
  const size_t batch = maxi * maxj * maxb;

  /* All addressing of the destination goes through `out`, never `o`:
   * in the non-inplace case they are different buffers and may have
   * different strides. */
  const ssize_t h_str_0 = PyGpuArray_STRIDES(h)[0];
  const ssize_t h_str_1 = PyGpuArray_STRIDES(h)[1];
  const ssize_t o_str_0 = PyGpuArray_STRIDES(out)[0];
  const ssize_t o_str_1 = PyGpuArray_STRIDES(out)[1];
  const ssize_t W_str_0 = PyGpuArray_STRIDES(W)[0];
  const ssize_t W_str_1 = PyGpuArray_STRIDES(W)[1];

  gpudata **W_list = (gpudata **)calloc(batch, sizeof(gpudata *));
  size_t *offW = (size_t *)calloc(batch, sizeof(size_t));
  gpudata **inp_list = (gpudata **)calloc(batch, sizeof(gpudata *));
  size_t *offInp = (size_t *)calloc(batch, sizeof(size_t));
  gpudata **out_list = (gpudata **)calloc(batch, sizeof(gpudata *));
  size_t *offOut = (size_t *)calloc(batch, sizeof(size_t));

  int failed = 0;

  if (W_list == NULL || offW == NULL ||
      inp_list == NULL || offInp == NULL ||
      out_list == NULL || offOut == NULL) {
    PyErr_NoMemory();
    failed = 1;
  } else {
    for (size_t i = 0; i < maxi; i++) {
      for (size_t j = 0; j < maxj; j++) {
        for (size_t b = 0; b < maxb; b++) {
          /* Flat position of the (b, i, j) problem in the batch. */
          size_t p = i + j * maxi + b * maxi * maxj;

          inp_list[p] = h->ga.data;
          offInp[p] = b * h_str_0 + i * h_str_1 + h->ga.offset;

          out_list[p] = out->ga.data;
          offOut[p] = b * o_str_0 + j * o_str_1 + out->ga.offset;

          W_list[p] = W->ga.data;
          offW[p] =
            *(DTYPE_INPUT_3 *)PyArray_GETPTR2(inputIdx, b, i) * W_str_0 +
            *(DTYPE_INPUT_4 *)PyArray_GETPTR2(outputIdx, b, j) * W_str_1 +
            W->ga.offset;
        }
      }
    }

    /* Choose the leading dimension from whichever of W's last two axes
     * does not have unit element stride; a unit stride on axis 2 means
     * the block has to be passed as transposed. */
    const size_t W_elsize = gpuarray_get_elsize(W->ga.typecode);
    cb_transpose transA = cb_no_trans;
    size_t lda = PyGpuArray_STRIDES(W)[2] / W_elsize;
    if (lda == 1) {
      transA = cb_trans;
      lda = PyGpuArray_STRIDES(W)[3] / W_elsize;
    }

    const size_t incH =
      PyGpuArray_STRIDES(h)[2] / gpuarray_get_elsize(h->ga.typecode);
    const size_t incOut =
      PyGpuArray_STRIDES(out)[2] / gpuarray_get_elsize(out->ga.typecode);

    if (out->ga.typecode == GA_FLOAT) {
      err = blas_ops->sgemvBatch(cb_fortran, transA,
                                 PyGpuArray_DIMS(out)[2],
                                 PyGpuArray_DIMS(h)[2], 1,
                                 W_list, offW, lda,
                                 inp_list, offInp, incH,
                                 1, out_list, offOut, incOut,
                                 batch, 0);
    } else if (out->ga.typecode == GA_DOUBLE) {
      err = blas_ops->dgemvBatch(cb_fortran, transA,
                                 PyGpuArray_DIMS(out)[2],
                                 PyGpuArray_DIMS(h)[2], 1,
                                 W_list, offW, lda,
                                 inp_list, offInp, incH,
                                 1, out_list, offOut, incOut,
                                 batch, 0);
    } else {
      err = GA_DEVSUP_ERROR;
    }

    if (err != GA_NO_ERROR) {
      PyErr_SetString(PyExc_RuntimeError, "gemvBatch failed");
      failed = 1;
    }
  }

  /* Single release point for the host-side batch descriptors. */
  free(W_list);
  free(offW);
  free(inp_list);
  free(offInp);
  free(out_list);
  free(offOut);

  return failed ? -1 : 0;
}
#section support_code_apply
/*
 * Batched block-sparse rank-1 update.
 *
 * One ger is issued per (b, i, j) triple, where b runs over dimension 0
 * of `x`, i over dimension 1 of `x` and j over dimension 1 of `y`.
 * For each triple the vectors are x[b, i] and y[b, j], and the block that
 * is updated is the one in the output addressed by xIdx[b, i] along its
 * first axis and yIdx[b, j] along its second axis.  The scalar factor is
 * element 0 of `alpha`.
 *
 * With INPLACE defined the destination is `o` itself.  Otherwise it is the
 * array returned by theano_try_copy(*_out, o).
 *
 * *_out is updated as soon as the destination array is known, so it always
 * refers to the reference this function currently owns, including when the
 * function fails later on.
 *
 * Only GA_FLOAT and GA_DOUBLE destinations are supported.
 * The index values are used as-is; they are not range-checked here.
 *
 * Returns 0 on success, -1 with a Python exception set on failure.
 */
int APPLY_SPECIFIC(blockger)(PyGpuArrayObject *o, PyGpuArrayObject *x,
                             PyGpuArrayObject *y, PyArrayObject *xIdx,
                             PyArrayObject *yIdx, PyArrayObject *alpha,
                             PyGpuArrayObject **_out,
                             PyGpuContextObject *ctx) {
  PyGpuArrayObject *out = *_out;
  gpuarray_blas_ops *blas_ops;
  int err;

  /* The BLAS lookups happen before *_out is touched, so these two early
   * returns leave the caller's output slot exactly as it was. */
  err = ctx->ops->property(ctx->ctx, NULL, NULL,
                           GA_CTX_PROP_BLAS_OPS, &blas_ops);
  if (err != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "Can't get blas ops");
    return -1;
  }
  err = blas_ops->setup(ctx->ctx);
  if (err != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "Can't setup blas");
    return -1;
  }

#ifdef INPLACE
  Py_XDECREF(out);
  out = o;
  Py_INCREF(out);
  /* Publish immediately: the previous *_out has just been released. */
  *_out = out;
#else
  out = theano_try_copy(out, o);
  /* NOTE(review): this assumes theano_try_copy has already released the
   * previous buffer when it returns NULL or a different array -- confirm
   * against its definition.  Under that assumption *_out must be replaced
   * right away so no stale pointer survives an early return. */
  *_out = out;
  if (out == NULL) {
    // Error already set
    return -1;
  }
#endif

  const size_t maxi = PyGpuArray_DIMS(x)[1];
  const size_t maxj = PyGpuArray_DIMS(y)[1];
  const size_t maxb = PyGpuArray_DIMS(x)[0];
  const size_t batch = maxi * maxj * maxb;

  const ssize_t x_str_0 = PyGpuArray_STRIDES(x)[0];
  const ssize_t x_str_1 = PyGpuArray_STRIDES(x)[1];
  const ssize_t y_str_0 = PyGpuArray_STRIDES(y)[0];
  const ssize_t y_str_1 = PyGpuArray_STRIDES(y)[1];
  const ssize_t o_str_0 = PyGpuArray_STRIDES(out)[0];
  const ssize_t o_str_1 = PyGpuArray_STRIDES(out)[1];

  gpudata **o_list = (gpudata **)calloc(batch, sizeof(gpudata *));
  size_t *offOut = (size_t *)calloc(batch, sizeof(size_t));
  gpudata **x_list = (gpudata **)calloc(batch, sizeof(gpudata *));
  size_t *offX = (size_t *)calloc(batch, sizeof(size_t));
  gpudata **y_list = (gpudata **)calloc(batch, sizeof(gpudata *));
  size_t *offY = (size_t *)calloc(batch, sizeof(size_t));

  int failed = 0;

  if (o_list == NULL || offOut == NULL ||
      x_list == NULL || offX == NULL ||
      y_list == NULL || offY == NULL) {
    PyErr_NoMemory();
    failed = 1;
  } else {
    for (size_t i = 0; i < maxi; i++) {
      for (size_t j = 0; j < maxj; j++) {
        for (size_t b = 0; b < maxb; b++) {
          /* Flat position of the (b, i, j) problem in the batch. */
          size_t p = i + j * maxi + b * maxi * maxj;

          x_list[p] = x->ga.data;
          offX[p] = b * x_str_0 + i * x_str_1 + x->ga.offset;

          y_list[p] = y->ga.data;
          offY[p] = b * y_str_0 + j * y_str_1 + y->ga.offset;

          o_list[p] = out->ga.data;
          offOut[p] =
            *(DTYPE_INPUT_3 *)PyArray_GETPTR2(xIdx, b, i) * o_str_0 +
            *(DTYPE_INPUT_4 *)PyArray_GETPTR2(yIdx, b, j) * o_str_1 +
            out->ga.offset;
        }
      }
    }

    const ssize_t str_y =
      PyGpuArray_STRIDES(y)[2] / gpuarray_get_elsize(y->ga.typecode);
    const ssize_t str_x =
      PyGpuArray_STRIDES(x)[2] / gpuarray_get_elsize(x->ga.typecode);
    const ssize_t str_out =
      PyGpuArray_STRIDES(out)[2] / gpuarray_get_elsize(out->ga.typecode);

    /* NOTE(review): alpha is reinterpreted according to the output's
     * typecode; this presumes alpha's dtype matches it -- confirm at the
     * Python Op level. */
    if (out->ga.typecode == GA_FLOAT) {
      err = blas_ops->sgerBatch(cb_fortran,
                                PyGpuArray_DIMS(y)[2], PyGpuArray_DIMS(x)[2],
                                *(float *)PyArray_GETPTR1(alpha, 0),
                                y_list, offY, str_y, x_list, offX, str_x,
                                o_list, offOut, str_out,
                                batch, 0);
    } else if (out->ga.typecode == GA_DOUBLE) {
      err = blas_ops->dgerBatch(cb_fortran,
                                PyGpuArray_DIMS(y)[2], PyGpuArray_DIMS(x)[2],
                                *(double *)PyArray_GETPTR1(alpha, 0),
                                y_list, offY, str_y, x_list, offX, str_x,
                                o_list, offOut, str_out,
                                batch, 0);
    } else {
      err = GA_DEVSUP_ERROR;
    }

    if (err != GA_NO_ERROR) {
      /* Precision-neutral wording: this path is shared by the float,
       * double and unsupported-dtype cases. */
      PyErr_SetString(PyExc_RuntimeError, "gerBatch failed");
      failed = 1;
    }
  }

  /* Single release point for the host-side batch descriptors. */
  free(o_list);
  free(offOut);
  free(x_list);
  free(offX);
  free(y_list);
  free(offY);

  return failed ? -1 : 0;
}
...@@ -8,7 +8,7 @@ import theano ...@@ -8,7 +8,7 @@ import theano
from theano import tensor, scalar, gof from theano import tensor, scalar, gof
from theano.compile import optdb from theano.compile import optdb
from theano.compile.ops import shape_i from theano.compile.ops import shape_i
from theano.gof import (local_optimizer, EquilibriumDB, from theano.gof import (local_optimizer, EquilibriumDB, TopoOptimizer,
SequenceDB, Optimizer, toolbox) SequenceDB, Optimizer, toolbox)
from theano.gof.optdb import LocalGroupDB from theano.gof.optdb import LocalGroupDB
from theano.ifelse import IfElse from theano.ifelse import IfElse
...@@ -17,6 +17,7 @@ from theano.scalar.basic import Scalar, Pow, Cast ...@@ -17,6 +17,7 @@ from theano.scalar.basic import Scalar, Pow, Cast
from theano.scan_module import scan_utils, scan_op, scan_opt from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.tensor.nnet.conv import ConvOp from theano.tensor.nnet.conv import ConvOp
from theano.tensor.nnet.blocksparse import SparseBlockGemv, SparseBlockOuter
from theano.tensor.nnet.abstract_conv import (AbstractConv2d, from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
AbstractConv2d_gradWeights, AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs) AbstractConv2d_gradInputs)
...@@ -33,6 +34,7 @@ from .basic_ops import (as_gpuarray_variable, infer_context_name, ...@@ -33,6 +34,7 @@ from .basic_ops import (as_gpuarray_variable, infer_context_name,
GpuEye, gpu_join, GpuJoin) GpuEye, gpu_join, GpuJoin)
from .blas import (gpu_dot22, GpuGemv, GpuGemm, GpuGer, GpuGemmBatch, from .blas import (gpu_dot22, GpuGemv, GpuGemm, GpuGer, GpuGemmBatch,
gpugemm_no_inplace, gpugemmbatch_no_inplace) gpugemm_no_inplace, gpugemmbatch_no_inplace)
from .blocksparse import GpuSparseBlockGemv, GpuSparseBlockOuter
from .nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias, from .nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuCrossentropySoftmax1HotWithBiasDx, GpuCrossentropySoftmax1HotWithBiasDx,
GpuSoftmaxWithBias, GpuSoftmax) GpuSoftmaxWithBias, GpuSoftmax)
...@@ -73,6 +75,17 @@ def register_opt(*tags, **kwargs): ...@@ -73,6 +75,17 @@ def register_opt(*tags, **kwargs):
return local_opt return local_opt
return f return f
def register_inplace(*tags, **kwargs):
    """Build a decorator that registers a local optimizer as an inplace pass.

    The decorated optimizer is wrapped in a ``TopoOptimizer`` (with
    ``TopoOptimizer.warn_inplace`` as failure callback) and registered in
    ``optdb`` at position 60 under the tags ``'fast_run'``, ``'inplace'``,
    ``'gpuarray'`` plus any extra ``tags``.

    Keyword arguments:
        name: registration name; defaults to the optimizer's ``__name__``.
            Other keyword arguments are accepted but not used.

    Returns the decorator, which itself returns the optimizer unchanged.
    """
    def f(local_opt):
        # pop() with a default: passing keyword arguments without 'name'
        # must fall back to the function name instead of raising KeyError.
        name = kwargs.pop('name', None) or local_opt.__name__
        optdb.register(
            name, TopoOptimizer(
                local_opt, failure_callback=TopoOptimizer.warn_inplace),
            60, 'fast_run', 'inplace', 'gpuarray', *tags)
        return local_opt
    return f
register_opt('fast_compile')(theano.tensor.opt.local_track_shape_i) register_opt('fast_compile')(theano.tensor.opt.local_track_shape_i)
register_opt(final_opt=True, name='gpua_constant_folding')( register_opt(final_opt=True, name='gpua_constant_folding')(
tensor.opt.constant_folding) tensor.opt.constant_folding)
...@@ -619,9 +632,9 @@ def local_gpua_advanced_subtensor(node, context_name): ...@@ -619,9 +632,9 @@ def local_gpua_advanced_subtensor(node, context_name):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.AdvancedIncSubtensor1]) @op_lifter([tensor.AdvancedIncSubtensor1])
def local_gpua_advanced_incsubtensor(node, context_name): def local_gpua_advanced_incsubtensor(node, context_name):
context = get_context(context_name)
# This is disabled on non-cuda contexts # This is disabled on non-cuda contexts
if get_context(context_name).kind != 'cuda': if context.kind != 'cuda':
return None return None
x, y, ilist = node.inputs x, y, ilist = node.inputs
...@@ -635,10 +648,8 @@ def local_gpua_advanced_incsubtensor(node, context_name): ...@@ -635,10 +648,8 @@ def local_gpua_advanced_incsubtensor(node, context_name):
y = tensor.cast(y, dtype) y = tensor.cast(y, dtype)
set_instead_of_inc = node.op.set_instead_of_inc set_instead_of_inc = node.op.set_instead_of_inc
active_device_no = theano.sandbox.cuda.active_device_number()
device_properties = theano.sandbox.cuda.device_properties
compute_capability = device_properties(active_device_no)['major'] compute_capability = int(context.bin_id[-2])
if (compute_capability < 2 or x.ndim != 2 or y.ndim != 2): if (compute_capability < 2 or x.ndim != 2 or y.ndim != 2):
return GpuAdvancedIncSubtensor1( return GpuAdvancedIncSubtensor1(
...@@ -865,6 +876,32 @@ theano.tensor.nnet.conv2d() ...@@ -865,6 +876,32 @@ theano.tensor.nnet.conv2d()
""" """
@register_opt('fast_compile')
@op_lifter([SparseBlockGemv])
def local_lift_sparseblockgemv(node, context_name):
    """Return a GpuSparseBlockGemv carrying the matched op's inplace flag."""
    host_op = node.op
    return GpuSparseBlockGemv(host_op.inplace)
@register_opt('fast_compile')
@op_lifter([SparseBlockOuter])
def local_lift_sparseblockouter(node, context_name):
    """Return a GpuSparseBlockOuter carrying the matched op's inplace flag."""
    host_op = node.op
    return GpuSparseBlockOuter(host_op.inplace)
@register_inplace()
@local_optimizer([GpuSparseBlockGemv], inplace=True)
def local_inplace_sparseblockgemv(node):
    """Swap a non-inplace GpuSparseBlockGemv for its inplace variant.

    Returns a one-element list with the replacement output, or None when
    the node is not a GpuSparseBlockGemv or is already inplace.
    """
    op = node.op
    if not isinstance(op, GpuSparseBlockGemv) or op.inplace:
        return None
    inplace_op = GpuSparseBlockGemv(inplace=True)
    return [inplace_op(*node.inputs)]
@register_inplace()
@local_optimizer([GpuSparseBlockOuter], inplace=True)
def local_inplace_sparseblockouter(node):
    """Swap a non-inplace GpuSparseBlockOuter for its inplace variant.

    Returns a one-element list with the replacement output, or None when
    the node is not a GpuSparseBlockOuter or is already inplace.
    """
    op = node.op
    if not isinstance(op, GpuSparseBlockOuter) or op.inplace:
        return None
    inplace_op = GpuSparseBlockOuter(inplace=True)
    return [inplace_op(*node.inputs)]
# This deals with any abstract convs that have a transfer somewhere # This deals with any abstract convs that have a transfer somewhere
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([AbstractConv2d, @op_lifter([AbstractConv2d,
......
...@@ -216,9 +216,7 @@ class BlockSparse_Gemv_and_Outer(utt.InferShapeTester): ...@@ -216,9 +216,7 @@ class BlockSparse_Gemv_and_Outer(utt.InferShapeTester):
utt.verify_grad(op, [b_val, h_val, W_val], mode=self.mode, eps=eps) utt.verify_grad(op, [b_val, h_val, W_val], mode=self.mode, eps=eps)
def test_sparseblockgemv_grad_1(self): def test_sparseblockgemv_grad_1(self):
""" # Test that we correctly handle cases where dimensions are 1.
Test that we correctly handle cases where dimensions are 1.
"""
h_val = randn(1, 1, 1).astype('float32') h_val = randn(1, 1, 1).astype('float32')
iIdx_val = numpy.random.permutation(1)[:1][None, :] iIdx_val = numpy.random.permutation(1)[:1][None, :]
oIdx_val = numpy.random.permutation(1)[:1][None, :] oIdx_val = numpy.random.permutation(1)[:1][None, :]
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论