提交 eb4d52aa authored 作者: Arnaud Bergeron's avatar Arnaud Bergeron

Fix most of the problems in blocksparse.

上级 7479d045
#section support_code_apply
int APPLY_SPECIFIC(blockgemv)(PyGpuArrayObject *o, PyGpuArrayObject *W,
                              PyGpuArrayObject *h, PyArrayObject *inputIdx,
                              PyArrayObject *outputIdx,
                              PyGpuArrayObject **_out,
                              PyGpuContextObject *ctx) {
  /* Batched sparse-block gemv: for every batch b, input block i and output
   * block j, accumulate W[inputIdx[b,i], outputIdx[b,j]] . h[b,i] into
   * out[b,j] with a single XgemvBatch call.
   *
   * o         - 3d output template (copied unless INPLACE is defined)
   * W         - 4d weight blocks
   * h         - 3d input blocks
   * inputIdx  - 2d index array selecting W's first axis (DTYPE_INPUT_3)
   * outputIdx - 2d index array selecting W's second axis (DTYPE_INPUT_4)
   * _out      - in/out: previous output (reused if possible), receives result
   *
   * Returns 0 on success, -1 with a Python exception set on failure. */
  PyGpuArrayObject *out = *_out;
  gpudata **W_list = NULL;
  gpudata **inp_list = NULL;
  gpudata **out_list = NULL;
  size_t *offW = NULL;
  size_t *offInp = NULL;
  size_t *offOut = NULL;
  gpuarray_blas_ops *blas_ops;
  int err;

#ifdef INPLACE
  /* Accumulate directly into o; take a new reference for the output. */
  Py_XDECREF(out);
  out = o;
  Py_INCREF(out);
#else
  out = theano_try_copy(out, o);
  if (out == NULL) {
    // Error already set
    return -1;
  }
#endif
  /* Publish the output right away: the old *_out reference was consumed
   * above, so leaving *_out untouched on an error path below would hand
   * the caller a dangling pointer. */
  *_out = out;

  err = ctx->ops->property(ctx->ctx, NULL, NULL,
                           GA_CTX_PROP_BLAS_OPS, &blas_ops);
  if (err != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "Can't get blas ops");
    return -1;
  }
  err = blas_ops->setup(ctx->ctx);
  if (err != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "Can't setup blas");
    return -1;
  }

  /* Prepare the pointer/offset lists for the batch. */
  size_t maxi = PyGpuArray_DIMS(h)[1];
  size_t maxj = PyGpuArray_DIMS(out)[1];
  size_t maxb = PyGpuArray_DIMS(out)[0];
  size_t n = maxi * maxj * maxb;  /* total number of gemv calls */
  ssize_t h_str_0 = PyGpuArray_STRIDES(h)[0];
  ssize_t h_str_1 = PyGpuArray_STRIDES(h)[1];
  /* BUGFIX: take the output data/strides from `out`, not from the input
   * `o`.  In the non-inplace path `out` is a fresh copy whose layout may
   * differ from `o`, and addressing through `o` wrote the results into
   * the input while leaving the returned copy untouched. */
  ssize_t o_str_0 = PyGpuArray_STRIDES(out)[0];
  ssize_t o_str_1 = PyGpuArray_STRIDES(out)[1];
  ssize_t W_str_0 = PyGpuArray_STRIDES(W)[0];
  ssize_t W_str_1 = PyGpuArray_STRIDES(W)[1];
  W_list = (gpudata **)calloc(n, sizeof(gpudata *));
  offW = (size_t *)calloc(n, sizeof(size_t));
  inp_list = (gpudata **)calloc(n, sizeof(gpudata *));
  offInp = (size_t *)calloc(n, sizeof(size_t));
  out_list = (gpudata **)calloc(n, sizeof(gpudata *));
  offOut = (size_t *)calloc(n, sizeof(size_t));
  if (W_list == NULL || offW == NULL ||
      inp_list == NULL || offInp == NULL ||
      out_list == NULL || offOut == NULL) {
    /* free(NULL) is a no-op, so partial allocation is safe to unwind. */
    free(W_list);
    free(offW);
    free(inp_list);
    free(offInp);
    free(out_list);
    free(offOut);
    PyErr_NoMemory();
    return -1;
  }
  for (size_t i = 0; i < maxi; i++) {
    for (size_t j = 0; j < maxj; j++) {
      for (size_t b = 0; b < maxb; b++) {
        size_t p = i + j * maxi + b * maxi * maxj;
        inp_list[p] = h->ga.data;
        offInp[p] = b * h_str_0 + i * h_str_1 + h->ga.offset;
        out_list[p] = out->ga.data;  /* BUGFIX: was o->ga.data */
        offOut[p] = b * o_str_0 + j * o_str_1 + out->ga.offset;
        W_list[p] = W->ga.data;
        /* inputIdx selects the block row of W, outputIdx the block column. */
        offW[p] = *(DTYPE_INPUT_3 *)PyArray_GETPTR2(inputIdx, b, i) * W_str_0 +
                  *(DTYPE_INPUT_4 *)PyArray_GETPTR2(outputIdx, b, j) * W_str_1 +
                  W->ga.offset;
      }
    }
  }

  /* If a W block's inner stride is 1 element, the block is laid out
   * row-major; request the transposed fortran-order product instead. */
  cb_transpose transA = cb_no_trans;
  size_t lda = PyGpuArray_STRIDES(W)[2] / gpuarray_get_elsize(W->ga.typecode);
  if (lda == 1) {
    transA = cb_trans;
    lda = PyGpuArray_STRIDES(W)[3] / gpuarray_get_elsize(W->ga.typecode);
  }
  /* Element (not byte) increments of the batched vectors. */
  size_t incH = PyGpuArray_STRIDES(h)[2] / gpuarray_get_elsize(h->ga.typecode);
  size_t incOut = PyGpuArray_STRIDES(out)[2] / gpuarray_get_elsize(out->ga.typecode);
  if (out->ga.typecode == GA_FLOAT) {
    err = blas_ops->sgemvBatch(cb_fortran, transA,
                               PyGpuArray_DIMS(out)[2],
                               PyGpuArray_DIMS(h)[2], 1,
                               W_list, offW, lda,
                               inp_list, offInp, incH,
                               1, out_list, offOut, incOut,
                               n, 0);
  } else if (out->ga.typecode == GA_DOUBLE) {
    err = blas_ops->dgemvBatch(cb_fortran, transA,
                               PyGpuArray_DIMS(out)[2],
                               PyGpuArray_DIMS(h)[2], 1,
                               W_list, offW, lda,
                               inp_list, offInp, incH,
                               1, out_list, offOut, incOut,
                               n, 0);
  } else {
    err = GA_DEVSUP_ERROR;
  }
  free(W_list);
  free(offW);
  free(inp_list);
  free(offInp);
  free(out_list);
  free(offOut);
  if (err != GA_NO_ERROR) {
    if (out->ga.typecode != GA_FLOAT && out->ga.typecode != GA_DOUBLE)
      /* Unsupported dtype deserves a distinct, actionable error. */
      PyErr_SetString(PyExc_TypeError,
                      "blockgemv: only float32 and float64 are supported");
    else
      PyErr_SetString(PyExc_RuntimeError, "gemvBatch failed");
    return -1;
  }
  return 0;
}
#section support_code_apply
int APPLY_SPECIFIC(blockger)(PyGpuArrayObject *o, PyGpuArrayObject *x,
                             PyGpuArrayObject *y, PyArrayObject *xIdx,
                             PyArrayObject *yIdx, PyArrayObject *alpha,
                             PyGpuArrayObject **_out,
                             PyGpuContextObject *ctx) {
  /* Batched sparse-block outer product: for every batch b and block pair
   * (i, j), accumulate alpha * outer(y[b,j], x[b,i]) into the output block
   * out[xIdx[b,i], yIdx[b,j]] with a single XgerBatch call.
   *
   * o     - 4d output template (copied unless INPLACE is defined)
   * x, y  - 3d input blocks
   * xIdx  - 2d index array into out's first axis (DTYPE_INPUT_3)
   * yIdx  - 2d index array into out's second axis (DTYPE_INPUT_4)
   * alpha - 0d/1-element scalar; assumed to match out's dtype —
   *         TODO(review): confirm make_node enforces this.
   * _out  - in/out: previous output (reused if possible), receives result
   *
   * Returns 0 on success, -1 with a Python exception set on failure. */
  PyGpuArrayObject *out = *_out;
  gpudata **o_list = NULL;
  gpudata **x_list = NULL;
  gpudata **y_list = NULL;
  size_t *offOut = NULL;
  size_t *offX = NULL;
  size_t *offY = NULL;
  gpuarray_blas_ops *blas_ops;
  int err;

  err = ctx->ops->property(ctx->ctx, NULL, NULL,
                           GA_CTX_PROP_BLAS_OPS, &blas_ops);
  if (err != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "Can't get blas ops");
    return -1;
  }
  err = blas_ops->setup(ctx->ctx);
  if (err != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "Can't setup blas");
    return -1;
  }

#ifdef INPLACE
  /* Accumulate directly into o; take a new reference for the output. */
  Py_XDECREF(out);
  out = o;
  Py_INCREF(out);
#else
  out = theano_try_copy(out, o);
  if (out == NULL)
    return -1;
#endif
  /* Publish the output right away: the old *_out reference was consumed
   * above, so leaving *_out untouched on an error path below would hand
   * the caller a dangling pointer. */
  *_out = out;

  /* Prepare the pointer/offset lists for the batch. */
  size_t maxi = PyGpuArray_DIMS(x)[1];
  size_t maxj = PyGpuArray_DIMS(y)[1];
  size_t maxb = PyGpuArray_DIMS(x)[0];
  size_t n = maxi * maxj * maxb;  /* total number of ger calls */
  ssize_t x_str_0 = PyGpuArray_STRIDES(x)[0];
  ssize_t x_str_1 = PyGpuArray_STRIDES(x)[1];
  ssize_t y_str_0 = PyGpuArray_STRIDES(y)[0];
  ssize_t y_str_1 = PyGpuArray_STRIDES(y)[1];
  ssize_t o_str_0 = PyGpuArray_STRIDES(out)[0];
  ssize_t o_str_1 = PyGpuArray_STRIDES(out)[1];
  o_list = (gpudata **)calloc(n, sizeof(gpudata *));
  offOut = (size_t *)calloc(n, sizeof(size_t));
  x_list = (gpudata **)calloc(n, sizeof(gpudata *));
  offX = (size_t *)calloc(n, sizeof(size_t));
  y_list = (gpudata **)calloc(n, sizeof(gpudata *));
  offY = (size_t *)calloc(n, sizeof(size_t));
  if (o_list == NULL || offOut == NULL ||
      x_list == NULL || offX == NULL ||
      y_list == NULL || offY == NULL) {
    /* free(NULL) is a no-op, so partial allocation is safe to unwind. */
    free(o_list);
    free(offOut);
    free(x_list);
    free(offX);
    free(y_list);
    free(offY);
    PyErr_NoMemory();
    return -1;
  }
  for (size_t i = 0; i < maxi; i++) {
    for (size_t j = 0; j < maxj; j++) {
      for (size_t b = 0; b < maxb; b++) {
        size_t p = i + j * maxi + b * maxi * maxj;
        x_list[p] = x->ga.data;
        offX[p] = b * x_str_0 + i * x_str_1 + x->ga.offset;
        y_list[p] = y->ga.data;
        offY[p] = b * y_str_0 + j * y_str_1 + y->ga.offset;
        o_list[p] = out->ga.data;
        /* xIdx/yIdx pick which output block receives this outer product. */
        offOut[p] = *(DTYPE_INPUT_3 *)PyArray_GETPTR2(xIdx, b, i) * o_str_0 +
                    *(DTYPE_INPUT_4 *)PyArray_GETPTR2(yIdx, b, j) * o_str_1 +
                    out->ga.offset;
      }
    }
  }

  /* Element (not byte) increments of the batched vectors. */
  ssize_t str_y = PyGpuArray_STRIDES(y)[2] / gpuarray_get_elsize(y->ga.typecode);
  ssize_t str_x = PyGpuArray_STRIDES(x)[2] / gpuarray_get_elsize(x->ga.typecode);
  ssize_t str_out = PyGpuArray_STRIDES(out)[2] / gpuarray_get_elsize(out->ga.typecode);
  if (out->ga.typecode == GA_FLOAT) {
    err = blas_ops->sgerBatch(cb_fortran,
                              PyGpuArray_DIMS(y)[2], PyGpuArray_DIMS(x)[2],
                              *(float *)PyArray_GETPTR1(alpha, 0),
                              y_list, offY, str_y, x_list, offX, str_x,
                              o_list, offOut, str_out,
                              n, 0);
  } else if (out->ga.typecode == GA_DOUBLE) {
    err = blas_ops->dgerBatch(cb_fortran,
                              PyGpuArray_DIMS(y)[2], PyGpuArray_DIMS(x)[2],
                              *(double *)PyArray_GETPTR1(alpha, 0),
                              y_list, offY, str_y, x_list, offX, str_x,
                              o_list, offOut, str_out,
                              n, 0);
  } else {
    err = GA_DEVSUP_ERROR;
  }
  free(o_list);
  free(offOut);
  free(x_list);
  free(offX);
  free(y_list);
  free(offY);
  if (err != GA_NO_ERROR) {
    if (out->ga.typecode != GA_FLOAT && out->ga.typecode != GA_DOUBLE)
      /* Unsupported dtype deserves a distinct, actionable error. */
      PyErr_SetString(PyExc_TypeError,
                      "blockger: only float32 and float64 are supported");
    else
      /* BUGFIX: message used to claim "sgerBatch" even for the double path. */
      PyErr_SetString(PyExc_RuntimeError, "gerBatch failed");
    return -1;
  }
  return 0;
}
from __future__ import absolute_import, print_function, division
import logging
import os
import numpy
from theano import Op, Apply, tensor
from theano import Apply, tensor
from theano.gof import COp
from theano.tensor import discrete_dtypes
from theano.gradient import grad_undefined
from .basic_ops import as_gpuarray_variable, GpuKernelBase, Kernel
from .type import gpu_context_type
from .basic_ops import as_gpuarray_variable, infer_context_name
_logger = logging.getLogger('theano.sandbox.gpuarray.blocksparse')
try:
import pygpu
from pygpu import gpuarray
except ImportError:
pass
class GpuSparseBlockGemv(Op):
class GpuSparseBlockGemv(COp):
"""
GPU version of SparseBlockGemv. Check SparseBlockGemv's docstring for more
information.
......@@ -27,14 +25,32 @@ class GpuSparseBlockGemv(Op):
function for a stable interface.
"""
__props__ = ('inplace',)
params_type = gpu_context_type
def __init__(self, inplace=False):
COp.__init__(self, "blockgemv.c", "APPLY_SPECIFIC(blockgemv)")
self.inplace = inplace
if self.inplace:
self.destroy_map = {0: [0]}
def get_params(self, node):
return node.inputs[0].type.context
def get_op_params(self):
if self.inplace:
return [('INPLACE', '1')]
else:
return []
def c_header_dirs(self):
return [os.path.dirname(__file__)]
def c_headers(self):
return ['<gpuarray/buffer_blas.h>', '<gpuarray/buffer.h>',
'<gpuarray_helper.h>']
def make_node(self, o, W, h, inputIdx, outputIdx):
ctx = infer_context(o, W, h)
ctx = infer_context_name(o, W, h)
o = as_gpuarray_variable(o, ctx)
W = as_gpuarray_variable(W, ctx)
h = as_gpuarray_variable(h, ctx)
......@@ -53,123 +69,6 @@ class GpuSparseBlockGemv(Op):
def infer_shape(self, node, input_shapes):
return [input_shapes[0]]
def c_code(self, node, nodename, inputs, outputs, sub):
o, W, h, inputIdx, outputIdx = inputs
typecode = o.type.typecode
out = outputs[0]
if self.inplace:
res = """
Py_XDECREF(%(out)s);
%(out)s = %(o)s;
Py_INCREF(%(out)s);
""" % dict(out=out, o=o)
else:
res = """
%(out)s = theano_try_copy(%(out)s, %(o)s);
if (%(out)s == NULL) {
// Error already set
%(fail)s
}
""" % dict(out=out, o=o, typecode=typecode, fail=sub['fail'], ctx=sub['params'])
return res + """{
gpudata **W_list = NULL;
gpudata **inp_list = NULL;
gpudata **out_list = NULL;
size_t *offW = NULL;
size_t *offInp = NULL;
size_t *offOut = NULL;
{ /* Prepare lists for the batch */
size_t maxi = PyGpuArray_DIMS(%(h)s)[1];
size_t maxj = PyGpuArray_DIMS(%(o)s)[1];
size_t maxb = PyGpuArray_DIMS(%(o)s)[0];
ssize_t h_str_0 = PyGpuArray_STRIDES(%(h)s)[0];
ssize_t h_str_1 = PyGpuArray_STRIDES(%(h)s)[1];
ssize_t o_str_0 = PyGpuArray_STRIDES(%(o)s)[0];
ssize_t o_str_1 = PyGpuArray_STRIDES(%(o)s)[1];
ssize_t W_str_0 = PyGpuArray_STRIDES(%(W)s)[0];
ssize_t W_str_1 = PyGpuArray_STRIDES(%(W)s)[1];
W_list = calloc(sizof(gpudata *), maxi * maxj * maxb);
offW = calloc(sizof(size_t), maxi * maxj * maxb);
inp_list = calloc(sizof(gpudata *), maxi * maxj * maxb);
offInp = calloc(sizof(size_t), maxi * maxj * maxb);
out_list = calloc(sizof(gpudata *), maxi * maxj * maxb);
offOut = calloc(sizof(size_t), maxi * maxj * maxb);
if (W_list == NULL || offW == NULL ||
inp_list == NULL || offInp == NULL ||
out_list == NULL || offOut == NULL) {
free(W_list);
free(offW);
free(inp_list);
free(offInp);
free(out_list);
free(offOut);
PyErr_NoMemory();
%(fail)s
}
for (size_t i = 0; i < maxi; i++) {
for (size_t j = 0; j < maxj; j++) {
for (size_t b = 0; b < maxb; b++) {
size_t p = i + j * maxi + b * maxi * maxj;
inp_list[p] = %(h)s->ga.data;
offInp[p] = b * h_str_0 + i * h_str_1 + %(h)s->ga.offset;
out_list[p] = %(o)s->ga.data;
outInp[p] = b * o_str_0 + j * o_str_1 + %(o)s->ga.offset;
W_list[p] = %(W)s->ga.data;
offW[p] = *(%(inputIdx)s_DTYPE *)PyArray_GETPTR2(%(inputIdx)s, b, i) * W_str_0 + *(%(outputIdx)s_DTYPE *)PyArray_GETPTR2(%(outputIdx)s, b, j) * W_str_1 + %(W)s->ga.offset;
}
}
}
}
{ /* Run XgemvBatched */
int err;
cb_transpose transA = cb_no_trans;
size_t lda = PyGpuArray_STRIDES(%(W)s)[2];
if (lda == sizeof(float)) {
transA = cb_trans;
lda = PyGpuArray_STRIDES(%(W)s)[3];
}
if (%(typecode)s == GA_FLOAT) {
err = blas_ops->sgemvBatch(cb_c, transA,
PyGpuArray_DIMS(%(o)s)[2],
PyGpuArray_DIMS(%(h)s)[2], 1,
W_list, offW, lda,
inp_list, offInp, PyGpuArray_STRIDES(%(h)s)[2],
1, out_list, offOut, PyGpuArray_STRIDES(%(o)s)[2],
PyGpuArray_DIMS(%(o)s)[1] * PyGpuArray_DIMS(%(h)s)[1] * PyGpuArray_DIMS(%(o)s)[0], 0);
} else if (%(typecode)s == GA_DOUBLE) {
err = blas_ops->dgemvBatch(cb_c, transA,
PyGpuArray_DIMS(%(o)s)[2],
PyGpuArray_DIMS(%(h)s)[2], 1,
W_list, offW, lda,
inp_list, offInp, PyGpuArray_STRIDES(%(h)s)[2],
1, out_list, offOut, PyGpuArray_STRIDES(%(o)s)[2],
PyGpuArray_DIMS(%(o)s)[1] * PyGpuArray_DIMS(%(h)s)[1] * PyGpuArray_DIMS(%(o)s)[0], 0);
}
free(W_list);
free(offW);
free(inp_list);
free(offInp);
free(out_list);
free(offOut);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "SgemvBatched failed(%%s)",
cublasGetErrorString(err));
%(fail)s
}
}
// And we're done!
}""" % dict(out=out, h=h, o=o, inputIdx=inputIdx, outputIdx=outputIdx,
W=W, fail=sub['fail'], name=nodename)
def c_code_cache_version(self):
return ()
def grad(self, inputs, grads):
o, W, h, inputIdx, outputIdx = inputs
go = grads[0]
......@@ -191,7 +90,7 @@ gpu_sparse_block_gemv = GpuSparseBlockGemv(False)
gpu_sparse_block_gemv_inplace = GpuSparseBlockGemv(True)
class GpuSparseBlockOuter(GpuOp):
class GpuSparseBlockOuter(COp):
"""
GPU version of SparseBlockOuter. See SparseBlockOuter's docstring for more
information.
......@@ -201,17 +100,29 @@ class GpuSparseBlockOuter(GpuOp):
of GpuSparseBlockGemv. The gradient is not implemented.
"""
__props__ = ('inplace',)
params_type = gpu_context_type
def __init__(self, inplace=False):
COp.__init__(self, ["blockger.c"], "APPLY_SPECIFIC(blockger)")
self.inplace = inplace
if self.inplace:
self.destroy_map = {0: [0]}
def get_params(self, node):
return node.inputs[0].type.context
def get_op_params(self):
if self.inplace:
return [('INPLACE', '1')]
else:
return []
def make_node(self, o, x, y, xIdx, yIdx, alpha=None):
ctx = infer_context_name(o, x, y)
one = tensor.constant(numpy.asarray(1.0, dtype='float32'))
o = basic_ops.as_cuda_ndarray_variable(o)
x = basic_ops.as_cuda_ndarray_variable(x)
y = basic_ops.as_cuda_ndarray_variable(y)
o = as_gpuarray_variable(o, ctx)
x = as_gpuarray_variable(x, ctx)
y = as_gpuarray_variable(y, ctx)
if alpha is None:
alpha = one
return Apply(self, [o, x, y, xIdx, yIdx, alpha],
......@@ -220,121 +131,12 @@ class GpuSparseBlockOuter(GpuOp):
def infer_shape(self, node, input_shapes):
return [input_shapes[0]]
def c_support_code(self):
return """
__global__ void
SparseBlockOuter_fill_lists(
int maxi, int maxj,
const float **x_list,
const float **y_list,
float **out_list,
const float *x, int x_str_0, int x_str_1,
const float *y, int y_str_0, int y_str_1,
float *out, int o_str_0, int o_str_1,
const npy_intp *xIdx, int xI_str_0,
const npy_intp *yIdx, int yI_str_0
) {
int i = threadIdx.x + blockDim.x * blockIdx.x;
int j = threadIdx.y + blockDim.y * blockIdx.y;
int b = blockIdx.z;
if (i >= maxi || j >= maxj) return;
int p = i + j * maxi + b * maxi * maxj;
x_list[p] = &x[b * x_str_0 + i * x_str_1];
y_list[p] = &y[b * y_str_0 + j * y_str_1];
out_list[p] = &out[xIdx[b * xI_str_0 + i] * o_str_0 +
yIdx[b * yI_str_0 + j] * o_str_1];
}
"""
def c_code(self, node, name, inputs, outputs, sub):
o, x, y, xIdx, yIdx, alpha = inputs
out = outputs[0]
if self.inplace:
res = """
Py_XDECREF(%(out)s);
%(out)s = %(o)s;
Py_INCREF(%(out)s);
""" % dict(out=out, o=o)
else:
res = """
%(out)s = theano_try_copy(%(out)s, %(o)s);
if (%(out)s == NULL) {
// Error already set
%(fail)s
}
""" % dict(out=out, o=o, fail=sub['fail'])
return res + """
{
size_t maxi = PyGpuArray_DIMS(%(x)s)[1];
size_t maxj = PyGpuArray_DIMS(%(y)s)[1];
size_t maxb = PyGpuArray_DIMS(%(x)s)[0];
ssize_t x_str_0 = PyGpuArray_STRIDES(%(x)s)[0];
ssize_t x_str_1 = PyGpuArray_STRIDES(%(x)s)[1];
ssize_t y_str_0 = PyGpuArray_STRIDES(%(y)s)[0];
ssize_t y_str_1 = PyGpuArray_STRIDES(%(y)s)[1];
ssize_t o_str_0 = PyGpuArray_STRIDES(%(out)s)[0];
ssize_t o_str_1 = PyGpuArray_STRIDES(%(out)s)[1];
o_list = calloc(sizof(gpudata *), maxi * maxj * maxb);
offOut = calloc(sizof(size_t), maxi * maxj * maxb);
x_list = calloc(sizof(gpudata *), maxi * maxj * maxb);
offX = calloc(sizof(size_t), maxi * maxj * maxb);
y_list = calloc(sizof(gpudata *), maxi * maxj * maxb);
offY = calloc(sizof(size_t), maxi * maxj * maxb);
if (W_list == NULL || offW == NULL ||
inp_list == NULL || offInp == NULL ||
out_list == NULL || offOut == NULL) {
free(o_list);
free(offOut);
free(x_list);
free(offX);
free(y_list);
free(offY);
PyErr_NoMemory();
%(fail)s
}
for (size_t i = 0; i < maxi; i++) {
for (size_t j = 0; j < maxj; j++) {
for (size_t b = 0; b < maxb; b++) {
size_t p = i + j * maxi + b * maxi * maxj;
x_list[p] = %(x)s->ga.data;
offX[p] = b * x_str_0 + i * x_str_1 + %(x)s->ga.offset;
y_list[p] = %(y)s->ga.data;
offY[p] = b * y_str_0 + j * y_str_1 + %(y)s->ga.offset;
out_list[p] = %(out)s->ga.data;
offOut[p] = *(%(xIdx)s_DTYPE *)PyArray_GETPTR2(%(xIdx)s, b, i) * o_str_0 + *(%(yIdx)s_DTYPE *)PyArray_GETPTR2(%(yIdx)s, b, j) * o_str_1 + %(out)s->ga.offset;
}
}
}
{
ga_ssize str_y = CudaNdarray_HOST_STRIDES(%(y)s)[2];
ga_ssize str_x = CudaNdarray_HOST_STRIDES(%(x)s)[2];
ga_ssize str_out = CudaNdarray_HOST_STRIDES(%(out)s)[2];
int err;
err = blas_ops->sgerBatch(cb_fortran,
PyGpuArray_DIMS(%(y)s)[2], PyGpuArray_DIMS(%(x)s)[2],
*(float *)PyArray_GETPTR1(%(alpha)s, 0),
y_list, offY, str_y, x_list, offX, str_x, out_list, offOut, str_out,
PyGpuArray_DIMS(%(x)s)[0] * PyGpuArray_DIMS(%(x)s)[1] * PyGpuArray_DIMS(%(y)s)[1], 0);
free(o_list);
free(offOut);
free(x_list);
free(offX);
free(y_list);
free(offY);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "sgerBatch failed");
%(fail)s
}
}""" % dict(x=x, y=y, out=out, xIdx=xIdx, yIdx=yIdx, name=name,
alpha=alpha, fail=sub['fail'])
def c_code_cache_version(self):
return (11,)
def c_header_dirs(self):
return [os.path.dirname(__file__)]
def c_headers(self):
return ['<gpuarray/buffer_blas.h>', '<gpuarray/buffer.h>',
'<gpuarray_helper.h>']
gpu_sparse_block_outer = GpuSparseBlockOuter(False)
gpu_sparse_block_outer_inplace = GpuSparseBlockOuter(True)
......@@ -8,7 +8,7 @@ import theano
from theano import tensor, scalar, gof
from theano.compile import optdb
from theano.compile.ops import shape_i
from theano.gof import (local_optimizer, EquilibriumDB,
from theano.gof import (local_optimizer, EquilibriumDB, TopoOptimizer,
SequenceDB, Optimizer, toolbox)
from theano.gof.optdb import LocalGroupDB
from theano.ifelse import IfElse
......@@ -17,6 +17,7 @@ from theano.scalar.basic import Scalar, Pow, Cast
from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.tensor.nnet.conv import ConvOp
from theano.tensor.nnet.blocksparse import SparseBlockGemv, SparseBlockOuter
from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs)
......@@ -33,6 +34,7 @@ from .basic_ops import (as_gpuarray_variable, infer_context_name,
GpuEye, gpu_join, GpuJoin)
from .blas import (gpu_dot22, GpuGemv, GpuGemm, GpuGer, GpuGemmBatch,
gpugemm_no_inplace, gpugemmbatch_no_inplace)
from .blocksparse import GpuSparseBlockGemv, GpuSparseBlockOuter
from .nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuCrossentropySoftmax1HotWithBiasDx,
GpuSoftmaxWithBias, GpuSoftmax)
......@@ -73,6 +75,17 @@ def register_opt(*tags, **kwargs):
return local_opt
return f
def register_inplace(*tags, **kwargs):
def f(local_opt):
name = (kwargs and kwargs.pop('name')) or local_opt.__name__
optdb.register(
name, TopoOptimizer(
local_opt, failure_callback=TopoOptimizer.warn_inplace),
60, 'fast_run', 'inplace', 'gpuarray', *tags)
return local_opt
return f
register_opt('fast_compile')(theano.tensor.opt.local_track_shape_i)
register_opt(final_opt=True, name='gpua_constant_folding')(
tensor.opt.constant_folding)
......@@ -619,9 +632,9 @@ def local_gpua_advanced_subtensor(node, context_name):
@register_opt('fast_compile')
@op_lifter([tensor.AdvancedIncSubtensor1])
def local_gpua_advanced_incsubtensor(node, context_name):
context = get_context(context_name)
# This is disabled on non-cuda contexts
if get_context(context_name).kind != 'cuda':
if context.kind != 'cuda':
return None
x, y, ilist = node.inputs
......@@ -635,10 +648,8 @@ def local_gpua_advanced_incsubtensor(node, context_name):
y = tensor.cast(y, dtype)
set_instead_of_inc = node.op.set_instead_of_inc
active_device_no = theano.sandbox.cuda.active_device_number()
device_properties = theano.sandbox.cuda.device_properties
compute_capability = device_properties(active_device_no)['major']
compute_capability = int(context.bin_id[-2])
if (compute_capability < 2 or x.ndim != 2 or y.ndim != 2):
return GpuAdvancedIncSubtensor1(
......@@ -865,6 +876,32 @@ theano.tensor.nnet.conv2d()
"""
@register_opt('fast_compile')
@op_lifter([SparseBlockGemv])
def local_lift_sparseblockgemv(node, context_name):
return GpuSparseBlockGemv(node.op.inplace)
@register_opt('fast_compile')
@op_lifter([SparseBlockOuter])
def local_lift_sparseblockouter(node, context_name):
return GpuSparseBlockOuter(node.op.inplace)
@register_inplace()
@local_optimizer([GpuSparseBlockGemv], inplace=True)
def local_inplace_sparseblockgemv(node):
if isinstance(node.op, GpuSparseBlockGemv) and not node.op.inplace:
return [GpuSparseBlockGemv(inplace=True)(*node.inputs)]
@register_inplace()
@local_optimizer([GpuSparseBlockOuter], inplace=True)
def local_inplace_sparseblockouter(node):
if isinstance(node.op, GpuSparseBlockOuter) and not node.op.inplace:
return [GpuSparseBlockOuter(inplace=True)(*node.inputs)]
# This deals with any abstract convs that have a transfer somewhere
@register_opt('fast_compile')
@op_lifter([AbstractConv2d,
......
......@@ -216,9 +216,7 @@ class BlockSparse_Gemv_and_Outer(utt.InferShapeTester):
utt.verify_grad(op, [b_val, h_val, W_val], mode=self.mode, eps=eps)
def test_sparseblockgemv_grad_1(self):
"""
Test that we correctly handle cases where dimensions are 1.
"""
# Test that we correctly handle cases where dimensions are 1.
h_val = randn(1, 1, 1).astype('float32')
iIdx_val = numpy.random.permutation(1)[:1][None, :]
oIdx_val = numpy.random.permutation(1)[:1][None, :]
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论