提交 64de6998 authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #4367 from abergeron/gpua_blocksparse

Blocksparse for gpuarray
...@@ -42,7 +42,7 @@ register_transfer(transfer) ...@@ -42,7 +42,7 @@ register_transfer(transfer)
def init_dev(dev, name=None): def init_dev(dev, name=None):
v = pygpu.gpuarray.api_version() v = pygpu.gpuarray.api_version()
if v[0] != -9999: if v[0] != -9998:
raise RuntimeError("Wrong major API version for gpuarray:", v[0], raise RuntimeError("Wrong major API version for gpuarray:", v[0],
"Make sure Theano and libgpuarray/pygpu " "Make sure Theano and libgpuarray/pygpu "
"are in sync.") "are in sync.")
......
#section support_code_apply
/*
 * Sparse block GEMV on the GPU, issued as a single batched BLAS call.
 *
 * The result buffer starts out as `o`: aliased when INPLACE is defined,
 * copied with theano_try_copy() otherwise.  For every triple (b, i, j) with
 *   b < out.shape[0], i < h.shape[1], j < out.shape[1]
 * one GEMV is queued.  It uses the block of W selected by inputIdx[b, i]
 * along axis 0 and outputIdx[b, j] along axis 1, the vector h[b, i], and
 * writes into out[b, j].
 *
 * Returns 0 on success with the result in *_out.  On failure a Python
 * exception is set and -1 is returned; *_out then still holds a reference
 * the caller may release (or NULL).
 */
int APPLY_SPECIFIC(blockgemv)(PyGpuArrayObject *o, PyGpuArrayObject *W,
                              PyGpuArrayObject *h, PyArrayObject *inputIdx,
                              PyArrayObject *outputIdx,
                              PyGpuArrayObject **_out,
                              PyGpuContextObject *ctx) {
  PyGpuArrayObject *out = *_out;

#ifdef INPLACE
  Py_XDECREF(out);
  out = o;
  Py_INCREF(out);
#else
  out = theano_try_copy(out, o);
#endif
  /*
   * Publish the new reference immediately.  The previous value of *_out has
   * been released above (explicitly for INPLACE; presumably inside
   * theano_try_copy() otherwise -- confirm against gpuarray_helper.h), so
   * leaving it in place across an error return would hand the caller a
   * pointer that was already released.
   */
  *_out = out;
  if (out == NULL) {
    // Error already set
    return -1;
  }

  gpudata **W_list = NULL;
  gpudata **inp_list = NULL;
  gpudata **out_list = NULL;
  size_t *offW = NULL;
  size_t *offInp = NULL;
  size_t *offOut = NULL;
  gpuarray_blas_ops *blas_ops;
  int err;

  err = ctx->ops->property(ctx->ctx, NULL, NULL,
                           GA_CTX_PROP_BLAS_OPS, &blas_ops);
  if (err != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "Can't get blas ops");
    return -1;
  }
  err = blas_ops->setup(ctx->ctx);
  if (err != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "Can't setup blas");
    return -1;
  }

  /* Prepare lists for the batch */
  size_t maxi = PyGpuArray_DIMS(h)[1];
  size_t maxj = PyGpuArray_DIMS(out)[1];
  size_t maxb = PyGpuArray_DIMS(out)[0];
  /* One batch entry per (b, i, j) triple. */
  size_t batch = maxi * maxj * maxb;

  ssize_t h_str_0 = PyGpuArray_STRIDES(h)[0];
  ssize_t h_str_1 = PyGpuArray_STRIDES(h)[1];
  ssize_t o_str_0 = PyGpuArray_STRIDES(out)[0];
  ssize_t o_str_1 = PyGpuArray_STRIDES(out)[1];
  ssize_t W_str_0 = PyGpuArray_STRIDES(W)[0];
  ssize_t W_str_1 = PyGpuArray_STRIDES(W)[1];

  /* calloc takes (count, element size). */
  W_list = (gpudata **)calloc(batch, sizeof(gpudata *));
  offW = (size_t *)calloc(batch, sizeof(size_t));
  inp_list = (gpudata **)calloc(batch, sizeof(gpudata *));
  offInp = (size_t *)calloc(batch, sizeof(size_t));
  out_list = (gpudata **)calloc(batch, sizeof(gpudata *));
  offOut = (size_t *)calloc(batch, sizeof(size_t));
  if (W_list == NULL || offW == NULL ||
      inp_list == NULL || offInp == NULL ||
      out_list == NULL || offOut == NULL) {
    free(W_list);
    free(offW);
    free(inp_list);
    free(offInp);
    free(out_list);
    free(offOut);
    PyErr_NoMemory();
    return -1;
  }

  for (size_t i = 0; i < maxi; i++) {
    for (size_t j = 0; j < maxj; j++) {
      for (size_t b = 0; b < maxb; b++) {
        /* Flat position of the (b, i, j) entry in the batch lists. */
        size_t p = i + j * maxi + b * maxi * maxj;
        inp_list[p] = h->ga.data;
        offInp[p] = b * h_str_0 + i * h_str_1 + h->ga.offset;
        out_list[p] = out->ga.data;
        offOut[p] = b * o_str_0 + j * o_str_1 + out->ga.offset;
        W_list[p] = W->ga.data;
        /* The index arrays pick the W block along axes 0 and 1.
           NOTE(review): the index values are not range-checked here. */
        offW[p] = *(DTYPE_INPUT_3 *)PyArray_GETPTR2(inputIdx, b, i) * W_str_0 +
          *(DTYPE_INPUT_4 *)PyArray_GETPTR2(outputIdx, b, j) * W_str_1 +
          W->ga.offset;
      }
    }
  }

  /* Leading dimension of a W block, in elements.  When axis 2 is the
     unit-stride axis, switch to the transposed form and take lda from
     axis 3 instead. */
  cb_transpose transA = cb_no_trans;
  size_t lda = PyGpuArray_STRIDES(W)[2] / gpuarray_get_elsize(W->ga.typecode);
  if (lda == 1) {
    transA = cb_trans;
    lda = PyGpuArray_STRIDES(W)[3] / gpuarray_get_elsize(W->ga.typecode);
  }

  /* Element strides of the vectors along their last axis. */
  size_t inc_h = PyGpuArray_STRIDES(h)[2] / gpuarray_get_elsize(h->ga.typecode);
  size_t inc_out = PyGpuArray_STRIDES(out)[2] / gpuarray_get_elsize(out->ga.typecode);

  /* The two literal 1 arguments are presumably alpha and beta, i.e. the
     product is accumulated onto the copy of `o` -- confirm against
     <gpuarray/buffer_blas.h>. */
  if (out->ga.typecode == GA_FLOAT) {
    err = blas_ops->sgemvBatch(cb_fortran, transA,
                               PyGpuArray_DIMS(out)[2],
                               PyGpuArray_DIMS(h)[2], 1,
                               W_list, offW, lda,
                               inp_list, offInp, inc_h,
                               1, out_list, offOut, inc_out,
                               batch, 0);
  } else if (out->ga.typecode == GA_DOUBLE) {
    err = blas_ops->dgemvBatch(cb_fortran, transA,
                               PyGpuArray_DIMS(out)[2],
                               PyGpuArray_DIMS(h)[2], 1,
                               W_list, offW, lda,
                               inp_list, offInp, inc_h,
                               1, out_list, offOut, inc_out,
                               batch, 0);
  } else if (out->ga.typecode == GA_HALF) {
    /* NOTE(review): this dispatches half-precision data to the
       single-precision entry point, whereas blockger uses hgerBatch for
       GA_HALF.  Looks like it should be a half-precision gemvBatch if the
       BLAS ops table provides one -- confirm before changing. */
    err = blas_ops->sgemvBatch(cb_fortran, transA,
                               PyGpuArray_DIMS(out)[2],
                               PyGpuArray_DIMS(h)[2], 1,
                               W_list, offW, lda,
                               inp_list, offInp, inc_h,
                               1, out_list, offOut, inc_out,
                               batch, 0);
  } else {
    err = GA_INVALID_ERROR;
  }

  free(W_list);
  free(offW);
  free(inp_list);
  free(offInp);
  free(out_list);
  free(offOut);

  if (err != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "gemvBatch failed");
    return -1;
  }
  return 0;
}
#section support_code_apply
/*
 * Sparse block outer product (GER) on the GPU, issued as a single batched
 * BLAS call.
 *
 * The result buffer starts out as `o`: aliased when INPLACE is defined,
 * copied with theano_try_copy() otherwise.  For every triple (b, i, j) with
 *   b < x.shape[0], i < x.shape[1], j < y.shape[1]
 * one rank-1 update is queued.  It uses the vectors x[b, i] and y[b, j],
 * scaled by `alpha`, and writes into the block of `out` selected by
 * xIdx[b, i] along axis 0 and yIdx[b, j] along axis 1.
 *
 * Returns 0 on success with the result in *_out.  On failure a Python
 * exception is set and -1 is returned; *_out then still holds a reference
 * the caller may release (or NULL).
 */
int APPLY_SPECIFIC(blockger)(PyGpuArrayObject *o, PyGpuArrayObject *x,
                             PyGpuArrayObject *y, PyArrayObject *xIdx,
                             PyArrayObject *yIdx, PyArrayObject *alpha,
                             PyGpuArrayObject **_out,
                             PyGpuContextObject *ctx) {
  PyGpuArrayObject *out = *_out;
  gpudata **o_list = NULL;
  gpudata **x_list = NULL;
  gpudata **y_list = NULL;
  size_t *offOut = NULL;
  size_t *offX = NULL;
  size_t *offY = NULL;
  gpuarray_blas_ops *blas_ops;
  int err;

  err = ctx->ops->property(ctx->ctx, NULL, NULL,
                           GA_CTX_PROP_BLAS_OPS, &blas_ops);
  if (err != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "Can't get blas ops");
    return -1;
  }
  err = blas_ops->setup(ctx->ctx);
  if (err != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "Can't setup blas");
    return -1;
  }

#ifdef INPLACE
  Py_XDECREF(out);
  out = o;
  Py_INCREF(out);
#else
  out = theano_try_copy(out, o);
#endif
  /*
   * Publish the new reference immediately.  The previous value of *_out has
   * been released above (explicitly for INPLACE; presumably inside
   * theano_try_copy() otherwise -- confirm against gpuarray_helper.h), so
   * leaving it in place across an error return would hand the caller a
   * pointer that was already released.
   */
  *_out = out;
  if (out == NULL)
    return -1;

  size_t maxi = PyGpuArray_DIMS(x)[1];
  size_t maxj = PyGpuArray_DIMS(y)[1];
  size_t maxb = PyGpuArray_DIMS(x)[0];
  /* One batch entry per (b, i, j) triple. */
  size_t batch = maxi * maxj * maxb;

  ssize_t x_str_0 = PyGpuArray_STRIDES(x)[0];
  ssize_t x_str_1 = PyGpuArray_STRIDES(x)[1];
  ssize_t y_str_0 = PyGpuArray_STRIDES(y)[0];
  ssize_t y_str_1 = PyGpuArray_STRIDES(y)[1];
  ssize_t o_str_0 = PyGpuArray_STRIDES(out)[0];
  ssize_t o_str_1 = PyGpuArray_STRIDES(out)[1];

  /* calloc takes (count, element size). */
  o_list = (gpudata **)calloc(batch, sizeof(gpudata *));
  offOut = (size_t *)calloc(batch, sizeof(size_t));
  x_list = (gpudata **)calloc(batch, sizeof(gpudata *));
  offX = (size_t *)calloc(batch, sizeof(size_t));
  y_list = (gpudata **)calloc(batch, sizeof(gpudata *));
  offY = (size_t *)calloc(batch, sizeof(size_t));
  if (o_list == NULL || offOut == NULL ||
      x_list == NULL || offX == NULL ||
      y_list == NULL || offY == NULL) {
    free(o_list);
    free(offOut);
    free(x_list);
    free(offX);
    free(y_list);
    free(offY);
    PyErr_NoMemory();
    return -1;
  }

  for (size_t i = 0; i < maxi; i++) {
    for (size_t j = 0; j < maxj; j++) {
      for (size_t b = 0; b < maxb; b++) {
        /* Flat position of the (b, i, j) entry in the batch lists. */
        size_t p = i + j * maxi + b * maxi * maxj;
        x_list[p] = x->ga.data;
        offX[p] = b * x_str_0 + i * x_str_1 + x->ga.offset;
        y_list[p] = y->ga.data;
        offY[p] = b * y_str_0 + j * y_str_1 + y->ga.offset;
        o_list[p] = out->ga.data;
        /* The index arrays pick the destination block along axes 0 and 1.
           NOTE(review): the index values are not range-checked here. */
        offOut[p] = *(DTYPE_INPUT_3 *)PyArray_GETPTR2(xIdx, b, i) * o_str_0 +
          *(DTYPE_INPUT_4 *)PyArray_GETPTR2(yIdx, b, j) * o_str_1 +
          out->ga.offset;
      }
    }
  }

  /* Element strides along the last axis of each operand. */
  ssize_t str_y = PyGpuArray_STRIDES(y)[2] / gpuarray_get_elsize(y->ga.typecode);
  ssize_t str_x = PyGpuArray_STRIDES(x)[2] / gpuarray_get_elsize(x->ga.typecode);
  ssize_t str_out = PyGpuArray_STRIDES(out)[2] / gpuarray_get_elsize(out->ga.typecode);

  /*
   * Read alpha using its own element type rather than one derived from the
   * output typecode: reinterpreting a float32 scalar as double (or the
   * reverse) reads the wrong number of bytes.  alpha is input number 5, so
   * DTYPE_INPUT_5 is assumed to name its C type, following the
   * DTYPE_INPUT_3 / DTYPE_INPUT_4 convention used for the index arrays.
   * PyArray_DATA is the same address PyArray_GETPTR1(alpha, 0) yields,
   * without touching the strides of what may be a 0-d array.
   */
  const double alpha_val = (double)(*(DTYPE_INPUT_5 *)PyArray_DATA(alpha));

  if (out->ga.typecode == GA_FLOAT) {
    err = blas_ops->sgerBatch(cb_fortran,
                              PyGpuArray_DIMS(y)[2], PyGpuArray_DIMS(x)[2],
                              (float)alpha_val,
                              y_list, offY, str_y, x_list, offX, str_x,
                              o_list, offOut, str_out,
                              batch, 0);
  } else if (out->ga.typecode == GA_DOUBLE) {
    err = blas_ops->dgerBatch(cb_fortran,
                              PyGpuArray_DIMS(y)[2], PyGpuArray_DIMS(x)[2],
                              alpha_val,
                              y_list, offY, str_y, x_list, offX, str_x,
                              o_list, offOut, str_out,
                              batch, 0);
  } else if (out->ga.typecode == GA_HALF) {
    err = blas_ops->hgerBatch(cb_fortran,
                              PyGpuArray_DIMS(y)[2], PyGpuArray_DIMS(x)[2],
                              (float)alpha_val,
                              y_list, offY, str_y, x_list, offX, str_x,
                              o_list, offOut, str_out,
                              batch, 0);
  } else {
    err = GA_INVALID_ERROR;
  }

  free(o_list);
  free(offOut);
  free(x_list);
  free(offX);
  free(y_list);
  free(offY);

  if (err != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "gerBatch failed");
    return -1;
  }
  return 0;
}
from __future__ import absolute_import, print_function, division
import logging
import os
import numpy
from theano import Apply, tensor
from theano.gof import COp
from theano.tensor import discrete_dtypes, as_tensor_variable
from theano.gradient import grad_undefined
from .type import gpu_context_type
from .basic_ops import as_gpuarray_variable, infer_context_name
_logger = logging.getLogger('theano.sandbox.gpuarray.blocksparse')
class GpuSparseBlockGemv(COp):
    """
    gpuarray counterpart of SparseBlockGemv; refer to SparseBlockGemv's
    docstring for a description of the computation.

    Do not call this op directly: its interface may change without
    notice. The stable entry point is the
    sandbox.blocksparse.sparse_block_dot() function.

    """
    __props__ = ('inplace',)
    params_type = gpu_context_type

    def __init__(self, inplace=False):
        COp.__init__(self, "blockgemv.c", "APPLY_SPECIFIC(blockgemv)")
        self.inplace = inplace
        # The inplace variant declares that output 0 destroys input 0.
        if inplace:
            self.destroy_map = {0: [0]}

    def c_headers(self):
        return ['<gpuarray/buffer_blas.h>', '<gpuarray/buffer.h>',
                '<gpuarray_helper.h>']

    def c_header_dirs(self):
        # Headers are looked up next to this module.
        here = os.path.dirname(__file__)
        return [here]

    def get_params(self, node):
        # The context is taken from the type of the first input.
        first_input = node.inputs[0]
        return first_input.type.context

    def get_op_params(self):
        # INPLACE is only defined for the inplace variant of the op.
        return [('INPLACE', '1')] if self.inplace else []

    def make_node(self, o, W, h, inputIdx, outputIdx):
        ctx = infer_context_name(o, W, h)
        o = as_gpuarray_variable(o, ctx)
        W = as_gpuarray_variable(W, ctx)
        h = as_gpuarray_variable(h, ctx)
        inputIdx = as_tensor_variable(inputIdx)
        outputIdx = as_tensor_variable(outputIdx)

        # Expected number of dimensions of every input, in argument order.
        expected_ndim = ((o, 3), (W, 4), (h, 3),
                         (inputIdx, 2), (outputIdx, 2))
        for var, ndim in expected_ndim:
            assert var.ndim == ndim

        # Both index inputs must have a discrete dtype.
        for idx in (inputIdx, outputIdx):
            assert idx.type.dtype in discrete_dtypes

        node_inputs = [o, W, h, inputIdx, outputIdx]
        return Apply(self, node_inputs, [o.type()])

    def infer_shape(self, node, input_shapes):
        # The output has the shape of the first input.
        return [input_shapes[0]]

    def grad(self, inputs, grads):
        o, W, h, inputIdx, outputIdx = inputs
        go = grads[0]

        grad_W = gpu_sparse_block_outer(W.zeros_like(),
                                        h, go, inputIdx, outputIdx)
        # Same op applied to W with its axes permuted and the two index
        # inputs swapped.
        grad_h = gpu_sparse_block_gemv(h.zeros_like(),
                                       W.dimshuffle((1, 0, 3, 2)),
                                       go,
                                       outputIdx, inputIdx)
        grad_inputIdx = grad_undefined(self, 3, inputIdx,
                                       "grad of inputIdx makes no sense")
        grad_outputIdx = grad_undefined(self, 4, outputIdx,
                                        "grad of outputIdx makes no sense")
        return [go, grad_W, grad_h, grad_inputIdx, grad_outputIdx]
# Shared module-level instances of the op: the first is built with
# inplace=False, the second with inplace=True.
gpu_sparse_block_gemv = GpuSparseBlockGemv(False)
gpu_sparse_block_gemv_inplace = GpuSparseBlockGemv(True)
class GpuSparseBlockOuter(COp):
    """
    gpuarray counterpart of SparseBlockOuter; refer to SparseBlockOuter's
    docstring for a description of the computation.

    Do not call this op directly: its interface may change without
    notice. It is used to build the gradient of GpuSparseBlockGemv.
    No gradient is implemented for this op itself.

    """
    __props__ = ('inplace',)
    params_type = gpu_context_type

    def __init__(self, inplace=False):
        COp.__init__(self, ["blockger.c"], "APPLY_SPECIFIC(blockger)")
        self.inplace = inplace
        # The inplace variant declares that output 0 destroys input 0.
        if inplace:
            self.destroy_map = {0: [0]}

    def c_headers(self):
        return ['<gpuarray/buffer_blas.h>', '<gpuarray/buffer.h>',
                '<gpuarray_helper.h>']

    def c_header_dirs(self):
        # Headers are looked up next to this module.
        here = os.path.dirname(__file__)
        return [here]

    def get_params(self, node):
        # The context is taken from the type of the first input.
        first_input = node.inputs[0]
        return first_input.type.context

    def get_op_params(self):
        # INPLACE is only defined for the inplace variant of the op.
        return [('INPLACE', '1')] if self.inplace else []

    def make_node(self, o, x, y, xIdx, yIdx, alpha=None):
        ctx = infer_context_name(o, x, y)
        o = as_gpuarray_variable(o, ctx)
        x = as_gpuarray_variable(x, ctx)
        y = as_gpuarray_variable(y, ctx)
        xIdx = as_tensor_variable(xIdx)
        yIdx = as_tensor_variable(yIdx)
        if alpha is None:
            # Default scaling factor: a float32 constant equal to 1.
            alpha = tensor.constant(numpy.asarray(1.0, dtype='float32'))
        node_inputs = [o, x, y, xIdx, yIdx, alpha]
        return Apply(self, node_inputs, [o.type()])

    def infer_shape(self, node, input_shapes):
        # The output has the shape of the first input.
        return [input_shapes[0]]
# Shared module-level instances of the op: the first is built with
# inplace=False, the second with inplace=True.
gpu_sparse_block_outer = GpuSparseBlockOuter(False)
gpu_sparse_block_outer_inplace = GpuSparseBlockOuter(True)
...@@ -8,7 +8,7 @@ import theano ...@@ -8,7 +8,7 @@ import theano
from theano import tensor, scalar, gof from theano import tensor, scalar, gof
from theano.compile import optdb from theano.compile import optdb
from theano.compile.ops import shape_i from theano.compile.ops import shape_i
from theano.gof import (local_optimizer, EquilibriumDB, from theano.gof import (local_optimizer, EquilibriumDB, TopoOptimizer,
SequenceDB, Optimizer, toolbox) SequenceDB, Optimizer, toolbox)
from theano.gof.optdb import LocalGroupDB from theano.gof.optdb import LocalGroupDB
from theano.ifelse import IfElse from theano.ifelse import IfElse
...@@ -17,6 +17,7 @@ from theano.scalar.basic import Scalar, Pow, Cast ...@@ -17,6 +17,7 @@ from theano.scalar.basic import Scalar, Pow, Cast
from theano.scan_module import scan_utils, scan_op, scan_opt from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.tensor.nnet.conv import ConvOp from theano.tensor.nnet.conv import ConvOp
from theano.tensor.nnet.blocksparse import SparseBlockGemv, SparseBlockOuter
from theano.tensor.nnet.abstract_conv import (AbstractConv2d, from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
AbstractConv2d_gradWeights, AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs) AbstractConv2d_gradInputs)
...@@ -33,6 +34,7 @@ from .basic_ops import (as_gpuarray_variable, infer_context_name, ...@@ -33,6 +34,7 @@ from .basic_ops import (as_gpuarray_variable, infer_context_name,
GpuEye, gpu_join, GpuJoin) GpuEye, gpu_join, GpuJoin)
from .blas import (gpu_dot22, GpuGemv, GpuGemm, GpuGer, GpuGemmBatch, from .blas import (gpu_dot22, GpuGemv, GpuGemm, GpuGer, GpuGemmBatch,
gpugemm_no_inplace, gpugemmbatch_no_inplace) gpugemm_no_inplace, gpugemmbatch_no_inplace)
from .blocksparse import GpuSparseBlockGemv, GpuSparseBlockOuter
from .nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias, from .nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuCrossentropySoftmax1HotWithBiasDx, GpuCrossentropySoftmax1HotWithBiasDx,
GpuSoftmaxWithBias, GpuSoftmax) GpuSoftmaxWithBias, GpuSoftmax)
...@@ -73,6 +75,17 @@ def register_opt(*tags, **kwargs): ...@@ -73,6 +75,17 @@ def register_opt(*tags, **kwargs):
return local_opt return local_opt
return f return f
def register_inplace(*tags, **kwargs):
    """
    Build a decorator that registers a local optimizer in `optdb` as an
    inplace optimization.

    The optimizer is wrapped in a TopoOptimizer whose failure callback is
    TopoOptimizer.warn_inplace and registered at position 60 with the tags
    'fast_run', 'inplace' and 'gpuarray', followed by any extra `tags`.

    Parameters
    ----------
    tags
        Additional tags passed on to optdb.register().
    kwargs
        Only `name` is used: the name to register under. When it is
        missing or empty, the optimizer's __name__ is used instead.

    Returns
    -------
    A decorator that registers its argument and returns it unchanged.

    """
    def f(local_opt):
        # Default to None so that keyword arguments without a 'name' entry
        # fall back to the function name instead of raising KeyError.
        name = kwargs.pop('name', None) or local_opt.__name__
        optdb.register(
            name, TopoOptimizer(
                local_opt, failure_callback=TopoOptimizer.warn_inplace),
            60, 'fast_run', 'inplace', 'gpuarray', *tags)
        return local_opt
    return f
register_opt('fast_compile')(theano.tensor.opt.local_track_shape_i) register_opt('fast_compile')(theano.tensor.opt.local_track_shape_i)
register_opt(final_opt=True, name='gpua_constant_folding')( register_opt(final_opt=True, name='gpua_constant_folding')(
tensor.opt.constant_folding) tensor.opt.constant_folding)
...@@ -619,9 +632,9 @@ def local_gpua_advanced_subtensor(node, context_name): ...@@ -619,9 +632,9 @@ def local_gpua_advanced_subtensor(node, context_name):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.AdvancedIncSubtensor1]) @op_lifter([tensor.AdvancedIncSubtensor1])
def local_gpua_advanced_incsubtensor(node, context_name): def local_gpua_advanced_incsubtensor(node, context_name):
context = get_context(context_name)
# This is disabled on non-cuda contexts # This is disabled on non-cuda contexts
if get_context(context_name).kind != 'cuda': if context.kind != 'cuda':
return None return None
x, y, ilist = node.inputs x, y, ilist = node.inputs
...@@ -635,10 +648,8 @@ def local_gpua_advanced_incsubtensor(node, context_name): ...@@ -635,10 +648,8 @@ def local_gpua_advanced_incsubtensor(node, context_name):
y = tensor.cast(y, dtype) y = tensor.cast(y, dtype)
set_instead_of_inc = node.op.set_instead_of_inc set_instead_of_inc = node.op.set_instead_of_inc
active_device_no = theano.sandbox.cuda.active_device_number()
device_properties = theano.sandbox.cuda.device_properties
compute_capability = device_properties(active_device_no)['major'] compute_capability = int(context.bin_id[-2])
if (compute_capability < 2 or x.ndim != 2 or y.ndim != 2): if (compute_capability < 2 or x.ndim != 2 or y.ndim != 2):
return GpuAdvancedIncSubtensor1( return GpuAdvancedIncSubtensor1(
...@@ -865,6 +876,32 @@ theano.tensor.nnet.conv2d() ...@@ -865,6 +876,32 @@ theano.tensor.nnet.conv2d()
""" """
@register_opt('fast_compile')
@op_lifter([SparseBlockGemv])
def local_lift_sparseblockgemv(node, context_name):
    """
    Return the GpuSparseBlockGemv op for a SparseBlockGemv node, keeping
    the `inplace` setting of the original op.
    """
    host_op = node.op
    return GpuSparseBlockGemv(host_op.inplace)
@register_opt('fast_compile')
@op_lifter([SparseBlockOuter])
def local_lift_sparseblockouter(node, context_name):
    """
    Return the GpuSparseBlockOuter op for a SparseBlockOuter node, keeping
    the `inplace` setting of the original op.
    """
    host_op = node.op
    return GpuSparseBlockOuter(host_op.inplace)
@register_inplace()
@local_optimizer([GpuSparseBlockGemv], inplace=True)
def local_inplace_sparseblockgemv(node):
    """
    Replace a non-inplace GpuSparseBlockGemv node by the inplace version
    applied to the same inputs; return None for any other node.
    """
    op = node.op
    if not isinstance(op, GpuSparseBlockGemv) or op.inplace:
        return None
    inplace_op = GpuSparseBlockGemv(inplace=True)
    return [inplace_op(*node.inputs)]
@register_inplace()
@local_optimizer([GpuSparseBlockOuter], inplace=True)
def local_inplace_sparseblockouter(node):
    """
    Replace a non-inplace GpuSparseBlockOuter node by the inplace version
    applied to the same inputs; return None for any other node.
    """
    op = node.op
    if not isinstance(op, GpuSparseBlockOuter) or op.inplace:
        return None
    inplace_op = GpuSparseBlockOuter(inplace=True)
    return [inplace_op(*node.inputs)]
# This deals with any abstract convs that have a transfer somewhere # This deals with any abstract convs that have a transfer somewhere
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([AbstractConv2d, @op_lifter([AbstractConv2d,
......
from __future__ import absolute_import, print_function, division
import numpy
import theano
from theano import tensor
import theano.tests.unittest_tools as utt
from theano.tensor.nnet.tests import test_blocksparse
from .config import mode_with_gpu, test_ctx_name
from ..type import gpuarray_shared_constructor
from ..blocksparse import (GpuSparseBlockGemv,
GpuSparseBlockOuter,
gpu_sparse_block_gemv,
gpu_sparse_block_outer)
class BlockSparse_Gemv_and_Outer(test_blocksparse.BlockSparse_Gemv_and_Outer):
    # Re-runs the tests inherited from the base class with the gpuarray
    # ops and a GPU mode plugged in through the attributes set in setUp().

    def setUp(self):
        utt.seed_rng()
        self.mode = mode_with_gpu.excluding('constant_folding')
        self.gemv_class = GpuSparseBlockGemv
        self.outer_class = GpuSparseBlockOuter
        self.gemv_op = gpu_sparse_block_gemv
        self.outer_op = gpu_sparse_block_outer

    # This test is temporarily disabled since we disabled the output_merge
    # and alpha_merge optimizations for blocksparse due to brokenness.
    # Re-enable when those are re-added.
    def Xtest_blocksparse_grad_merge(self):
        b = tensor.fmatrix()
        h = tensor.ftensor3()
        iIdx = tensor.lmatrix()
        oIdx = tensor.lmatrix()

        W_val, h_val, iIdx_val, b_val, oIdx_val = self.gemv_data()
        W = gpuarray_shared_constructor(W_val, context=test_ctx_name)

        out = gpu_sparse_block_gemv(b.take(oIdx, axis=0), W, h, iIdx, oIdx)
        grad_W = theano.grad(out.sum(), W)

        learning_rate = numpy.asarray(0.05, dtype='float32')
        new_W = W - learning_rate * grad_W

        merged_fn = theano.function([h, iIdx, b, oIdx],
                                    updates=[(W, new_W)],
                                    mode=mode_with_gpu)

        # Make sure the lr update was merged.
        merged_op = merged_fn.maker.fgraph.outputs[0].owner.op
        assert isinstance(merged_op, GpuSparseBlockOuter)

        # Exclude the merge optimizations.
        unmerged_mode = mode_with_gpu.excluding(
            'local_merge_blocksparse_alpha').excluding(
            'local_merge_blocksparse_output')

        unmerged_fn = theano.function([h, iIdx, b, oIdx],
                                      updates=[(W, new_W)],
                                      mode=unmerged_mode)

        # Make sure the lr update is not merged.
        unmerged_op = unmerged_fn.maker.fgraph.outputs[0].owner.op
        assert not isinstance(unmerged_op, GpuSparseBlockOuter)

        # Reference value: one step of the unmerged function.
        unmerged_fn(h_val, iIdx_val, b_val, oIdx_val)
        W_ref = W.get_value()

        # reset the var
        W.set_value(W_val)

        # One step of the merged function from the same starting point.
        merged_fn(h_val, iIdx_val, b_val, oIdx_val)
        W_opt = W.get_value()

        utt.assert_allclose(W_ref, W_opt)
...@@ -216,9 +216,7 @@ class BlockSparse_Gemv_and_Outer(utt.InferShapeTester): ...@@ -216,9 +216,7 @@ class BlockSparse_Gemv_and_Outer(utt.InferShapeTester):
utt.verify_grad(op, [b_val, h_val, W_val], mode=self.mode, eps=eps) utt.verify_grad(op, [b_val, h_val, W_val], mode=self.mode, eps=eps)
def test_sparseblockgemv_grad_1(self): def test_sparseblockgemv_grad_1(self):
""" # Test that we correctly handle cases where dimensions are 1.
Test that we correctly handle cases where dimensions are 1.
"""
h_val = randn(1, 1, 1).astype('float32') h_val = randn(1, 1, 1).astype('float32')
iIdx_val = numpy.random.permutation(1)[:1][None, :] iIdx_val = numpy.random.permutation(1)[:1][None, :]
oIdx_val = numpy.random.permutation(1)[:1][None, :] oIdx_val = numpy.random.permutation(1)[:1][None, :]
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论