提交 2a5fc594 authored 作者: Maxim Kochurov's avatar Maxim Kochurov 提交者: Brandon T. Willard

Remove aesara.gpuarray

上级 b3ce3640
差异被折叠。
差异被折叠。
差异被折叠。
import logging
import numpy as np
from aesara import tensor as at
from aesara.gpuarray.basic_ops import (
as_gpuarray_variable,
gpuarray_helper_inc_dir,
infer_context_name,
)
from aesara.gpuarray.type import gpu_context_type
from aesara.gradient import grad_undefined
from aesara.graph.basic import Apply
from aesara.link.c.op import _NoPythonExternalCOp
from aesara.link.c.params_type import ParamsType
from aesara.scalar import bool as bool_t
from aesara.tensor import as_tensor_variable
from aesara.tensor.type import discrete_dtypes
# Module-level logger, named after the historical module path of this code.
_logger = logging.getLogger("aesara.gpuarray.blocksparse")
class GpuSparseBlockGemv(_NoPythonExternalCOp):
    """
    GPU version of SparseBlockGemv. Check SparseBlockGemv's docstring for more
    information.

    This should not be directly called since the interface is subject
    to change without notice. Use the sandbox.blocksparse.sparse_block_dot()
    function for a stable interface.
    """

    __props__ = ("inplace",)
    params_type = ParamsType(inplace=bool_t, context=gpu_context_type)
    # NB: DTYPE_INPUT_* is used in C code, so I think we should not set check_input to False.

    def __init__(self, inplace=False):
        # Pass the C source as a one-element list for consistency with
        # GpuSparseBlockOuter below (ExternalCOp accepts either form).
        super().__init__(["c_code/blockgemv.c"], "APPLY_SPECIFIC(blockgemv)")
        self.inplace = inplace
        if self.inplace:
            # In-place: output 0 overwrites (destroys) input 0.
            self.destroy_map = {0: [0]}

    def get_params(self, node):
        # Bind the GPU context of the first input so the C code runs on it.
        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def c_header_dirs(self, **kwargs):
        return [gpuarray_helper_inc_dir()]

    def c_headers(self, **kwargs):
        return [
            "<gpuarray/buffer_blas.h>",
            "<gpuarray/buffer.h>",
            "<gpuarray_helper.h>",
        ]

    def make_node(self, o, W, h, inputIdx, outputIdx):
        """
        Build the Apply node.

        ``o`` and ``h`` must be 3d, ``W`` 4d, and both index tensors 2d with a
        discrete (integer/boolean) dtype.  The output has the type of ``o``.
        """
        ctx = infer_context_name(o, W, h)
        o = as_gpuarray_variable(o, ctx)
        W = as_gpuarray_variable(W, ctx)
        h = as_gpuarray_variable(h, ctx)
        inputIdx = as_tensor_variable(inputIdx)
        outputIdx = as_tensor_variable(outputIdx)
        assert o.ndim == 3
        assert W.ndim == 4
        assert h.ndim == 3
        assert inputIdx.ndim == 2
        assert outputIdx.ndim == 2
        assert inputIdx.type.dtype in discrete_dtypes
        assert outputIdx.type.dtype in discrete_dtypes
        return Apply(self, [o, W, h, inputIdx, outputIdx], [o.type()])

    def infer_shape(self, fgraph, node, input_shapes):
        # The output has exactly the shape of the first input (o).
        return [input_shapes[0]]

    def grad(self, inputs, grads):
        o, W, h, inputIdx, outputIdx = inputs
        go = grads[0]
        # dW: batched outer products of h and the output gradient.
        Wgrad = gpu_sparse_block_outer(W.zeros_like(), h, go, inputIdx, outputIdx)
        # dh: same gemv with transposed weight blocks and swapped index roles.
        hgrad = gpu_sparse_block_gemv(
            h.zeros_like(), W.dimshuffle((1, 0, 3, 2)), go, outputIdx, inputIdx
        )
        return [
            go,
            Wgrad,
            hgrad,
            grad_undefined(self, 3, inputIdx, "grad of inputIdx makes no sense"),
            grad_undefined(self, 4, outputIdx, "grad of outputIdx makes no sense"),
        ]
# Module-level singletons.  The inplace variant overwrites its first input
# (destroy_map is set in __init__); the non-inplace one is always safe.
gpu_sparse_block_gemv = GpuSparseBlockGemv(False)
gpu_sparse_block_gemv_inplace = GpuSparseBlockGemv(True)
class GpuSparseBlockOuter(_NoPythonExternalCOp):
    """
    GPU version of SparseBlockOuter. See SparseBlockOuter's docstring for more
    information.

    This op should not be called directly since its interface is
    subject to change without notice. It is involved in the gradient
    of GpuSparseBlockGemv. The gradient is not implemented.
    """

    __props__ = ("inplace",)
    params_type = ParamsType(inplace=bool_t, context=gpu_context_type)

    def __init__(self, inplace=False):
        super().__init__(["c_code/blockger.c"], "APPLY_SPECIFIC(blockger)")
        self.inplace = inplace
        if inplace:
            # In-place: output 0 overwrites (destroys) input 0.
            self.destroy_map = {0: [0]}

    def get_params(self, node):
        # Run the C code on the GPU context of the first input.
        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def c_header_dirs(self, **kwargs):
        return [gpuarray_helper_inc_dir()]

    def c_headers(self, **kwargs):
        return [
            "<gpuarray/buffer_blas.h>",
            "<gpuarray/buffer.h>",
            "<gpuarray_helper.h>",
        ]

    def make_node(self, o, x, y, xIdx, yIdx, alpha=None):
        # Move the GPU inputs onto a common context; the index tensors stay
        # on the host as plain tensor variables.
        ctx = infer_context_name(o, x, y)
        o, x, y = (as_gpuarray_variable(v, ctx) for v in (o, x, y))
        xIdx = as_tensor_variable(xIdx)
        yIdx = as_tensor_variable(yIdx)
        if alpha is None:
            # Default scaling factor: float32 one.
            alpha = at.constant(np.asarray(1.0, dtype="float32"))
        return Apply(self, [o, x, y, xIdx, yIdx, alpha], [o.type()])

    def infer_shape(self, fgraph, node, input_shapes):
        # The output takes the shape of the accumulator input ``o``.
        first = input_shapes[0]
        return [first]
# Module-level singletons mirroring the gemv pair above; the inplace variant
# destroys its first input.
gpu_sparse_block_outer = GpuSparseBlockOuter(False)
gpu_sparse_block_outer_inplace = GpuSparseBlockOuter(True)
#section support_code_apply

/* Batched block-sparse gemv.
 *
 * For every (batch b, input block i, output block j) triple, apply the
 * weight block selected by (inputIdx[b,i], outputIdx[b,j]) to the input
 * vector h[b,i] and accumulate into out[b,j], via one batched BLAS call.
 *
 * Returns 0 on success, -1 with a Python exception set on failure.
 */
int APPLY_SPECIFIC(blockgemv)(PyGpuArrayObject *o, PyGpuArrayObject *W,
                              PyGpuArrayObject *h, PyArrayObject *inputIdx,
                              PyArrayObject *outputIdx,
                              PyGpuArrayObject **_out,
                              PyArrayObject **_unused_placeholder_do_not_use,
                              PARAMS_TYPE* params);
int APPLY_SPECIFIC(blockgemv)(PyGpuArrayObject *o, PyGpuArrayObject *W,
                              PyGpuArrayObject *h, PyArrayObject *inputIdx,
                              PyArrayObject *outputIdx,
                              PyGpuArrayObject **_out,
                              PARAMS_TYPE* params) {
  PyGpuArrayObject *out = *_out;
  if (params->inplace) {
    /* Reuse o directly as the output buffer (destroy_map covers this on
       the Python side). */
    Py_XDECREF(out);
    out = o;
    Py_INCREF(out);
  } else {
    /* Copy o into (possibly reused) out. */
    out = aesara_try_copy(out, o);
    if (out == NULL) {
      // Error already set
      return -1;
    }
  }
  /* Per-element buffer/offset lists for the batched gemv call. */
  gpudata **W_list = NULL;
  gpudata **inp_list = NULL;
  gpudata **out_list = NULL;
  size_t *offW = NULL;
  size_t *offInp = NULL;
  size_t *offOut = NULL;
  int err;
  err = gpublas_setup(params->context->ctx);
  if (err != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "Can't setup blas");
    return -1;
  }
  /* Prepare lists for the batch */
  size_t maxi = PyGpuArray_DIMS(h)[1];    /* number of input blocks  */
  size_t maxj = PyGpuArray_DIMS(out)[1];  /* number of output blocks */
  size_t maxb = PyGpuArray_DIMS(out)[0];  /* batch size              */
  ssize_t h_str_0 = PyGpuArray_STRIDES(h)[0];
  ssize_t h_str_1 = PyGpuArray_STRIDES(h)[1];
  ssize_t o_str_0 = PyGpuArray_STRIDES(out)[0];
  ssize_t o_str_1 = PyGpuArray_STRIDES(out)[1];
  ssize_t W_str_0 = PyGpuArray_STRIDES(W)[0];
  ssize_t W_str_1 = PyGpuArray_STRIDES(W)[1];
  W_list = (gpudata **)calloc(sizeof(gpudata *), maxi * maxj * maxb);
  offW = (size_t *)calloc(sizeof(size_t), maxi * maxj * maxb);
  inp_list = (gpudata **)calloc(sizeof(gpudata *), maxi * maxj * maxb);
  offInp = (size_t *)calloc(sizeof(size_t), maxi * maxj * maxb);
  out_list = (gpudata **)calloc(sizeof(gpudata *), maxi * maxj * maxb);
  offOut = (size_t *)calloc(sizeof(size_t), maxi * maxj * maxb);
  if (W_list == NULL || offW == NULL ||
      inp_list == NULL || offInp == NULL ||
      out_list == NULL || offOut == NULL) {
    /* free(NULL) is a no-op, so freeing all six unconditionally is safe. */
    free(W_list);
    free(offW);
    free(inp_list);
    free(offInp);
    free(out_list);
    free(offOut);
    PyErr_NoMemory();
    return -1;
  }
  for (size_t i = 0; i < maxi; i++) {
    for (size_t j = 0; j < maxj; j++) {
      for (size_t b = 0; b < maxb; b++) {
        size_t p = i + j * maxi + b * maxi * maxj;
        /* All entries alias the same gpudata; only byte offsets differ. */
        inp_list[p] = h->ga.data;
        offInp[p] = b * h_str_0 + i * h_str_1 + h->ga.offset;
        out_list[p] = out->ga.data;
        offOut[p] = b * o_str_0 + j * o_str_1 + out->ga.offset;
        W_list[p] = W->ga.data;
        /* DTYPE_INPUT_3 / DTYPE_INPUT_4 are generated from the dtypes of
           inputIdx / outputIdx by the COp machinery. */
        offW[p] = *(DTYPE_INPUT_3 *)PyArray_GETPTR2(inputIdx, b, i) * W_str_0 +
                  *(DTYPE_INPUT_4 *)PyArray_GETPTR2(outputIdx, b, j) * W_str_1 +
                  W->ga.offset;
      }
    }
  }
  /* Pick the leading dimension from whichever of the last two W axes is
     contiguous; transpose when axis 2 is the unit-stride one. */
  cb_transpose transA = cb_no_trans;
  size_t lda = PyGpuArray_STRIDES(W)[2] / gpuarray_get_elsize(W->ga.typecode);
  if (lda == 1) {
    transA = cb_trans;
    lda = PyGpuArray_STRIDES(W)[3] / gpuarray_get_elsize(W->ga.typecode);
  }
  if (out->ga.typecode == GA_FLOAT) {
    err = gpublas_sgemvBatch(cb_fortran, transA,
                             PyGpuArray_DIMS(out)[2],
                             PyGpuArray_DIMS(h)[2], 1,
                             W_list, offW, lda,
                             inp_list, offInp, PyGpuArray_STRIDES(h)[2] / gpuarray_get_elsize(h->ga.typecode),
                             1, out_list, offOut, PyGpuArray_STRIDES(out)[2] / gpuarray_get_elsize(out->ga.typecode),
                             PyGpuArray_DIMS(out)[1] * PyGpuArray_DIMS(h)[1] * PyGpuArray_DIMS(out)[0], 0);
  } else if (out->ga.typecode == GA_DOUBLE) {
    err = gpublas_dgemvBatch(cb_fortran, transA,
                             PyGpuArray_DIMS(out)[2],
                             PyGpuArray_DIMS(h)[2], 1,
                             W_list, offW, lda,
                             inp_list, offInp, PyGpuArray_STRIDES(h)[2] / gpuarray_get_elsize(h->ga.typecode),
                             1, out_list, offOut, PyGpuArray_STRIDES(out)[2] / gpuarray_get_elsize(out->ga.typecode),
                             PyGpuArray_DIMS(out)[1] * PyGpuArray_DIMS(h)[1] * PyGpuArray_DIMS(out)[0], 0);
  } else if (out->ga.typecode == GA_HALF) {
    /* NOTE(review): this half-precision branch calls the single-precision
       sgemvBatch, while blockger below uses hgerBatch for GA_HALF —
       confirm against the libgpuarray BLAS API that this is intentional. */
    err = gpublas_sgemvBatch(cb_fortran, transA,
                             PyGpuArray_DIMS(out)[2],
                             PyGpuArray_DIMS(h)[2], 1,
                             W_list, offW, lda,
                             inp_list, offInp, PyGpuArray_STRIDES(h)[2] / gpuarray_get_elsize(h->ga.typecode),
                             1, out_list, offOut, PyGpuArray_STRIDES(out)[2] / gpuarray_get_elsize(out->ga.typecode),
                             PyGpuArray_DIMS(out)[1] * PyGpuArray_DIMS(h)[1] * PyGpuArray_DIMS(out)[0], 0);
  } else {
    err = GA_INVALID_ERROR;
  }
  free(W_list);
  free(offW);
  free(inp_list);
  free(offInp);
  free(out_list);
  free(offOut);
  if (err != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "gemvBatch failed");
    return -1;
  }
  *_out = out;
  return 0;
}
#section support_code_apply

/* Batched block-sparse rank-1 update (ger).
 *
 * For every (batch b, block i, block j) triple, accumulate
 * alpha * outer(y[b,j], x[b,i]) into the output block selected by
 * (xIdx[b,i], yIdx[b,j]), via one batched BLAS ger call.
 *
 * Returns 0 on success, -1 with a Python exception set on failure.
 */
int APPLY_SPECIFIC(blockger)(PyGpuArrayObject *o, PyGpuArrayObject *x,
                             PyGpuArrayObject *y, PyArrayObject *xIdx,
                             PyArrayObject *yIdx, PyArrayObject *alpha,
                             PyGpuArrayObject **_out,
                             PARAMS_TYPE* params) {
  PyGpuArrayObject *out = *_out;
  /* Per-element buffer/offset lists for the batched ger call. */
  gpudata **o_list = NULL;
  gpudata **x_list = NULL;
  gpudata **y_list = NULL;
  size_t *offOut = NULL;
  size_t *offX = NULL;
  size_t *offY = NULL;
  int err;
  err = gpublas_setup(params->context->ctx);
  if (err != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "Can't setup blas");
    return -1;
  }
  if (params->inplace) {
    /* Reuse o directly as the output buffer (destroy_map covers this on
       the Python side). */
    Py_XDECREF(out);
    out = o;
    Py_INCREF(out);
  } else {
    /* Copy o into (possibly reused) out; error already set on failure. */
    out = aesara_try_copy(out, o);
    if (out == NULL)
      return -1;
  }
  size_t maxi = PyGpuArray_DIMS(x)[1];  /* number of x blocks */
  size_t maxj = PyGpuArray_DIMS(y)[1];  /* number of y blocks */
  size_t maxb = PyGpuArray_DIMS(x)[0];  /* batch size         */
  ssize_t x_str_0 = PyGpuArray_STRIDES(x)[0];
  ssize_t x_str_1 = PyGpuArray_STRIDES(x)[1];
  ssize_t y_str_0 = PyGpuArray_STRIDES(y)[0];
  ssize_t y_str_1 = PyGpuArray_STRIDES(y)[1];
  ssize_t o_str_0 = PyGpuArray_STRIDES(out)[0];
  ssize_t o_str_1 = PyGpuArray_STRIDES(out)[1];
  o_list = (gpudata **)calloc(sizeof(gpudata *), maxi * maxj * maxb);
  offOut = (size_t *)calloc(sizeof(size_t), maxi * maxj * maxb);
  x_list = (gpudata **)calloc(sizeof(gpudata *), maxi * maxj * maxb);
  offX = (size_t *)calloc(sizeof(size_t), maxi * maxj * maxb);
  y_list = (gpudata **)calloc(sizeof(gpudata *), maxi * maxj * maxb);
  offY = (size_t *)calloc(sizeof(size_t), maxi * maxj * maxb);
  if (o_list == NULL || offOut == NULL ||
      x_list == NULL || offX == NULL ||
      y_list == NULL || offY == NULL) {
    /* free(NULL) is a no-op, so freeing all six unconditionally is safe. */
    free(o_list);
    free(offOut);
    free(x_list);
    free(offX);
    free(y_list);
    free(offY);
    PyErr_NoMemory();
    return -1;
  }
  for (size_t i = 0; i < maxi; i++) {
    for (size_t j = 0; j < maxj; j++) {
      for (size_t b = 0; b < maxb; b++) {
        size_t p = i + j * maxi + b * maxi * maxj;
        /* All entries alias the same gpudata; only byte offsets differ. */
        x_list[p] = x->ga.data;
        offX[p] = b * x_str_0 + i * x_str_1 + x->ga.offset;
        y_list[p] = y->ga.data;
        offY[p] = b * y_str_0 + j * y_str_1 + y->ga.offset;
        o_list[p] = out->ga.data;
        /* DTYPE_INPUT_3 / DTYPE_INPUT_4 are generated from the dtypes of
           xIdx / yIdx by the COp machinery. */
        offOut[p] = *(DTYPE_INPUT_3 *)PyArray_GETPTR2(xIdx, b, i) * o_str_0 + *(DTYPE_INPUT_4 *)PyArray_GETPTR2(yIdx, b, j) * o_str_1 + out->ga.offset;
      }
    }
  }
  /* Element strides along the innermost axis of each operand. */
  ssize_t str_y = PyGpuArray_STRIDES(y)[2] / gpuarray_get_elsize(y->ga.typecode);
  ssize_t str_x = PyGpuArray_STRIDES(x)[2] / gpuarray_get_elsize(x->ga.typecode);
  ssize_t str_out = PyGpuArray_STRIDES(out)[2] / gpuarray_get_elsize(out->ga.typecode);
  if (out->ga.typecode == GA_FLOAT) {
    err = gpublas_sgerBatch(cb_fortran,
                            PyGpuArray_DIMS(y)[2], PyGpuArray_DIMS(x)[2],
                            *(float *)PyArray_GETPTR1(alpha, 0),
                            y_list, offY, str_y, x_list, offX, str_x,
                            o_list, offOut, str_out,
                            PyGpuArray_DIMS(x)[0] * PyGpuArray_DIMS(x)[1] * PyGpuArray_DIMS(y)[1], 0);
  } else if (out->ga.typecode == GA_DOUBLE) {
    err = gpublas_dgerBatch(cb_fortran,
                            PyGpuArray_DIMS(y)[2], PyGpuArray_DIMS(x)[2],
                            *(double *)PyArray_GETPTR1(alpha, 0),
                            y_list, offY, str_y, x_list, offX, str_x,
                            o_list, offOut, str_out,
                            PyGpuArray_DIMS(x)[0] * PyGpuArray_DIMS(x)[1] * PyGpuArray_DIMS(y)[1], 0);
  } else if (out->ga.typecode == GA_HALF) {
    /* Half-precision ger; alpha is read as float here — presumably the
       host-side alpha array is float32 in this case (TODO confirm). */
    err = gpublas_hgerBatch(cb_fortran,
                            PyGpuArray_DIMS(y)[2], PyGpuArray_DIMS(x)[2],
                            *(float *)PyArray_GETPTR1(alpha, 0),
                            y_list, offY, str_y, x_list, offX, str_x,
                            o_list, offOut, str_out,
                            PyGpuArray_DIMS(x)[0] * PyGpuArray_DIMS(x)[1] * PyGpuArray_DIMS(y)[1], 0);
  } else {
    err = GA_INVALID_ERROR;
  }
  free(o_list);
  free(offOut);
  free(x_list);
  free(offX);
  free(y_list);
  free(offY);
  if (err != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "gerBatch failed");
    return -1;
  }
  *_out = out;
  return 0;
}
#section support_code_apply

/* Set the group count on a cuDNN convolution descriptor.
 *
 * Grouped convolutions only exist from cuDNN 7 onward; on older versions
 * this compiles to a no-op.  Returns 0 on success, -1 with a Python
 * exception set on failure. */
static int c_set_groups_for_conv(cudnnConvolutionDescriptor_t desc, int groups) {
#if CUDNN_MAJOR >= 7
  cudnnStatus_t status = cudnnSetConvolutionGroupCount(desc, groups);
  if (status != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "error setting groups for convolution : %s",
                 cudnnGetErrorString(status));
    return -1;
  }
#endif
  return 0;
}
/* Build a cuDNN Nd convolution descriptor from the op parameters and the
 * (host-side, int64) filter shape array.
 *
 * On success *desc holds a fully configured descriptor and 0 is returned;
 * on failure -1 is returned with a Python exception set and no descriptor
 * is leaked. */
int APPLY_SPECIFIC(conv_desc)(PyArrayObject *filt_shp,
                              cudnnConvolutionDescriptor_t *desc,
                              PARAMS_TYPE* params) {
  cudnnStatus_t err;
  int pad[3] = {params->pad0, params->pad1, params->pad2};
  int strides[3] = {params->sub0, params->sub1, params->sub2};
  int dilation[3] = {params->dil0, params->dil1, params->dil2};

  /* Validate the filter-shape length FIRST: the border-mode code below
     indexes filt_shp up to entry nb_dims + 1, so checking afterwards (as
     the original did) could read past the end of the array. */
  if (PyArray_DIM(filt_shp, 0) - 2 != params->nb_dims) {
    PyErr_Format(PyExc_ValueError, "Filter shape has too many dimensions: "
                 "expected %d, got %lld.", params->nb_dims,
                 (long long)PyArray_DIM(filt_shp, 0));
    return -1;
  }

  if (params->bmode == BORDER_MODE_FULL) {
    /* "full": pad by the full dilated filter extent minus one. */
    pad[0] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 2) - 1) * dilation[0];
    pad[1] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 3) - 1) * dilation[1];
    if (params->nb_dims > 2) {
      pad[2] = (*(npy_int64 *)PyArray_GETPTR1(filt_shp, 4) - 1) * dilation[2];
    }
  } else if(params->bmode == BORDER_MODE_HALF) {
    /* "half" ("same"): pad by half the dilated filter extent, rounded. */
    pad[0] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 2) - 1) * dilation[0] + 1) / 2;
    pad[1] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 3) - 1) * dilation[1] + 1) / 2;
    if (params->nb_dims > 2) {
      pad[2] = ((*(npy_int64 *)PyArray_GETPTR1(filt_shp, 4) - 1) * dilation[2] + 1) / 2;
    }
  }

  err = cudnnCreateConvolutionDescriptor(desc);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError, "could not allocate convolution "
                 "descriptor: %s", cudnnGetErrorString(err));
    return -1;
  }
  err = cudnnSetConvolutionNdDescriptor(*desc, params->nb_dims, pad, strides,
                                        dilation, params->conv_mode, params->precision);
  if (err != CUDNN_STATUS_SUCCESS) {
    /* Don't leak the descriptor we just created. */
    cudnnDestroyConvolutionDescriptor(*desc);
    *desc = NULL;
    PyErr_Format(PyExc_MemoryError, "could not set convolution "
                 "descriptor: %s", cudnnGetErrorString(err));
    return -1;
  }
  if (c_set_groups_for_conv(*desc, params->num_groups) == -1) {
    /* Error already set by the helper; release the descriptor. */
    cudnnDestroyConvolutionDescriptor(*desc);
    *desc = NULL;
    return -1;
  }
  return 0;
}
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论