Merge pull request #4367 from abergeron/gpua_blocksparse

Blocksparse for gpuarray

Merge pull request #4367 from abergeron/gpua_blocksparse
64de6998 · Frédéric Bastien · 2eba08b2 · 565a6d91 · 64de6998 · 64de6998
--- a/theano/sandbox/gpuarray/__init__.py
+++ b/theano/sandbox/gpuarray/__init__.py
@@ -42,7 +42,7 @@ register_transfer(transfer)
 def init_dev(dev, name=None):
    v = pygpu.gpuarray.api_version()
-    if v[0] != -9999:
+    if v[0] != -9998:
        raise RuntimeError("Wrong major API version for gpuarray:", v[0],
                           "Make sure Theano and libgpuarray/pygpu "
                           "are in sync.")

--- a/theano/sandbox/gpuarray/blockgemv.c
+++ b/theano/sandbox/gpuarray/blockgemv.c
+#section support_code_apply
+int APPLY_SPECIFIC(blockgemv)(PyGpuArrayObject *o, PyGpuArrayObject *W,
+                              PyGpuArrayObject *h, PyArrayObject *inputIdx,
+                              PyArrayObject *outputIdx,
+                              PyGpuArrayObject **_out,
+                              PyGpuContextObject *ctx) {
+  PyGpuArrayObject *out = *_out;
+#ifdef INPLACE
+  Py_XDECREF(out);
+  out = o;
+  Py_INCREF(out);
+#else
+  out = theano_try_copy(out, o);
+  if (out == NULL) {
+    // Error already set
+    return -1;
+  }
+#endif
+  gpudata **W_list = NULL;
+  gpudata **inp_list = NULL;
+  gpudata **out_list = NULL;
+  size_t *offW = NULL;
+  size_t *offInp = NULL;
+  size_t *offOut = NULL;
+  gpuarray_blas_ops *blas_ops;
+  int err;
+  err = ctx->ops->property(ctx->ctx, NULL, NULL,
+                           GA_CTX_PROP_BLAS_OPS, &blas_ops);
+  if (err != GA_NO_ERROR) {
+    PyErr_SetString(PyExc_RuntimeError, "Can't get blas ops");
+    return -1;
+  }
+  err = blas_ops->setup(ctx->ctx);
+  if (err != GA_NO_ERROR) {
+    PyErr_SetString(PyExc_RuntimeError, "Can't setup blas");
+    return -1;
+  }
+  /* Prepare lists for the batch */
+  size_t maxi = PyGpuArray_DIMS(h)[1];
+  size_t maxj = PyGpuArray_DIMS(out)[1];
+  size_t maxb = PyGpuArray_DIMS(out)[0];
+  ssize_t h_str_0 = PyGpuArray_STRIDES(h)[0];
+  ssize_t h_str_1 = PyGpuArray_STRIDES(h)[1];
+  ssize_t o_str_0 = PyGpuArray_STRIDES(out)[0];
+  ssize_t o_str_1 = PyGpuArray_STRIDES(out)[1];
+  ssize_t W_str_0 = PyGpuArray_STRIDES(W)[0];
+  ssize_t W_str_1 = PyGpuArray_STRIDES(W)[1];
+  W_list = (gpudata **)calloc(sizeof(gpudata *), maxi * maxj * maxb);
+  offW = (size_t *)calloc(sizeof(size_t), maxi * maxj * maxb);
+  inp_list = (gpudata **)calloc(sizeof(gpudata *), maxi * maxj * maxb);
+  offInp = (size_t *)calloc(sizeof(size_t), maxi * maxj * maxb);
+  out_list = (gpudata **)calloc(sizeof(gpudata *), maxi * maxj * maxb);
+  offOut = (size_t *)calloc(sizeof(size_t), maxi * maxj * maxb);
+  if (W_list == NULL || offW == NULL ||
+      inp_list == NULL || offInp == NULL ||
+      out_list == NULL || offOut == NULL) {
+    free(W_list);
+    free(offW);
+    free(inp_list);
+    free(offInp);
+    free(out_list);
+    free(offOut);
+    PyErr_NoMemory();
+    return -1;
+  }
+  for (size_t i = 0; i < maxi; i++) {
+    for (size_t j = 0; j < maxj; j++) {
+      for (size_t b = 0; b < maxb; b++) {
+        size_t p = i + j * maxi + b * maxi * maxj;
+        inp_list[p] = h->ga.data;
+        offInp[p] = b * h_str_0 + i * h_str_1 + h->ga.offset;
+        out_list[p] = out->ga.data;
+        offOut[p] = b * o_str_0 + j * o_str_1 + out->ga.offset;
+        W_list[p] = W->ga.data;
+        offW[p] = *(DTYPE_INPUT_3 *)PyArray_GETPTR2(inputIdx, b, i) * W_str_0 +
+          *(DTYPE_INPUT_4 *)PyArray_GETPTR2(outputIdx, b, j) * W_str_1 +
+          W->ga.offset;
+      }
+    }
+  }
+  cb_transpose transA = cb_no_trans;
+  size_t lda = PyGpuArray_STRIDES(W)[2] / gpuarray_get_elsize(W->ga.typecode);
+  if (lda == 1) {
+    transA = cb_trans;
+    lda = PyGpuArray_STRIDES(W)[3] / gpuarray_get_elsize(W->ga.typecode);
+  }
+  if (out->ga.typecode == GA_FLOAT) {
+    err = blas_ops->sgemvBatch(cb_fortran, transA,
+                               PyGpuArray_DIMS(out)[2],
+                               PyGpuArray_DIMS(h)[2], 1,
+                               W_list, offW, lda,
+                               inp_list, offInp, PyGpuArray_STRIDES(h)[2] / gpuarray_get_elsize(h->ga.typecode),
+                               1, out_list, offOut, PyGpuArray_STRIDES(out)[2] / gpuarray_get_elsize(out->ga.typecode),
+                               PyGpuArray_DIMS(out)[1] * PyGpuArray_DIMS(h)[1] * PyGpuArray_DIMS(out)[0], 0);
+  } else if (out->ga.typecode == GA_DOUBLE) {
+    err = blas_ops->dgemvBatch(cb_fortran, transA,
+                               PyGpuArray_DIMS(out)[2],
+                               PyGpuArray_DIMS(h)[2], 1,
+                               W_list, offW, lda,
+                               inp_list, offInp, PyGpuArray_STRIDES(h)[2] / gpuarray_get_elsize(h->ga.typecode),
+                               1, out_list, offOut, PyGpuArray_STRIDES(out)[2] / gpuarray_get_elsize(out->ga.typecode),
+                               PyGpuArray_DIMS(out)[1] * PyGpuArray_DIMS(h)[1] * PyGpuArray_DIMS(out)[0], 0);
+  } else if (out->ga.typecode == GA_HALF) {
+    err = blas_ops->sgemvBatch(cb_fortran, transA,
+                               PyGpuArray_DIMS(out)[2],
+                               PyGpuArray_DIMS(h)[2], 1,
+                               W_list, offW, lda,
+                               inp_list, offInp, PyGpuArray_STRIDES(h)[2] / gpuarray_get_elsize(h->ga.typecode),
+                               1, out_list, offOut, PyGpuArray_STRIDES(out)[2] / gpuarray_get_elsize(out->ga.typecode),
+                               PyGpuArray_DIMS(out)[1] * PyGpuArray_DIMS(h)[1] * PyGpuArray_DIMS(out)[0], 0);
+  } else {
+    err = GA_INVALID_ERROR;
+  }
+  free(W_list);
+  free(offW);
+  free(inp_list);
+  free(offInp);
+  free(out_list);
+  free(offOut);
+  if (err != GA_NO_ERROR) {
+    PyErr_SetString(PyExc_RuntimeError, "gemvBatch failed");
+    return -1;
+  }
+  *_out = out;
+  return 0;
+}
--- a/theano/sandbox/gpuarray/blockger.c
+++ b/theano/sandbox/gpuarray/blockger.c
+#section support_code_apply
+int APPLY_SPECIFIC(blockger)(PyGpuArrayObject *o, PyGpuArrayObject *x,
+                             PyGpuArrayObject *y, PyArrayObject *xIdx,
+                             PyArrayObject *yIdx, PyArrayObject *alpha,
+                             PyGpuArrayObject **_out,
+                             PyGpuContextObject *ctx) {
+  PyGpuArrayObject *out = *_out;
+  gpudata **o_list = NULL;
+  gpudata **x_list = NULL;
+  gpudata **y_list = NULL;
+  size_t *offOut = NULL;
+  size_t *offX = NULL;
+  size_t *offY = NULL;
+  gpuarray_blas_ops *blas_ops;
+  int err;
+  err = ctx->ops->property(ctx->ctx, NULL, NULL,
+                           GA_CTX_PROP_BLAS_OPS, &blas_ops);
+  if (err != GA_NO_ERROR) {
+    PyErr_SetString(PyExc_RuntimeError, "Can't get blas ops");
+    return -1;
+  }
+  err = blas_ops->setup(ctx->ctx);
+  if (err != GA_NO_ERROR) {
+    PyErr_SetString(PyExc_RuntimeError, "Can't setup blas");
+    return -1;
+  }
+#ifdef INPLACE
+  Py_XDECREF(out);
+  out = o;
+  Py_INCREF(out);
+#else
+  out = theano_try_copy(out, o);
+  if (out == NULL)
+    return -1;
+#endif
+  size_t maxi = PyGpuArray_DIMS(x)[1];
+  size_t maxj = PyGpuArray_DIMS(y)[1];
+  size_t maxb = PyGpuArray_DIMS(x)[0];
+  ssize_t x_str_0 = PyGpuArray_STRIDES(x)[0];
+  ssize_t x_str_1 = PyGpuArray_STRIDES(x)[1];
+  ssize_t y_str_0 = PyGpuArray_STRIDES(y)[0];
+  ssize_t y_str_1 = PyGpuArray_STRIDES(y)[1];
+  ssize_t o_str_0 = PyGpuArray_STRIDES(out)[0];
+  ssize_t o_str_1 = PyGpuArray_STRIDES(out)[1];
+  o_list = (gpudata **)calloc(sizeof(gpudata *), maxi * maxj * maxb);
+  offOut = (size_t *)calloc(sizeof(size_t), maxi * maxj * maxb);
+  x_list = (gpudata **)calloc(sizeof(gpudata *), maxi * maxj * maxb);
+  offX = (size_t *)calloc(sizeof(size_t), maxi * maxj * maxb);
+  y_list = (gpudata **)calloc(sizeof(gpudata *), maxi * maxj * maxb);
+  offY = (size_t *)calloc(sizeof(size_t), maxi * maxj * maxb);
+  if (o_list == NULL || offOut == NULL ||
+      x_list == NULL || offX == NULL ||
+      y_list == NULL || offY == NULL) {
+    free(o_list);
+    free(offOut);
+    free(x_list);
+    free(offX);
+    free(y_list);
+    free(offY);
+    PyErr_NoMemory();
+    return -1;
+  }
+  for (size_t i = 0; i < maxi; i++) {
+    for (size_t j = 0; j < maxj; j++) {
+      for (size_t b = 0; b < maxb; b++) {
+        size_t p = i + j * maxi + b * maxi * maxj;
+        x_list[p] = x->ga.data;
+        offX[p] = b * x_str_0 + i * x_str_1 + x->ga.offset;
+        y_list[p] = y->ga.data;
+        offY[p] = b * y_str_0 + j * y_str_1 + y->ga.offset;
+        o_list[p] = out->ga.data;
+        offOut[p] = *(DTYPE_INPUT_3 *)PyArray_GETPTR2(xIdx, b, i) * o_str_0 + *(DTYPE_INPUT_4 *)PyArray_GETPTR2(yIdx, b, j) * o_str_1 + out->ga.offset;
+      }
+    }
+  }
+  ssize_t str_y = PyGpuArray_STRIDES(y)[2] / gpuarray_get_elsize(y->ga.typecode);
+  ssize_t str_x = PyGpuArray_STRIDES(x)[2] / gpuarray_get_elsize(x->ga.typecode);
+  ssize_t str_out = PyGpuArray_STRIDES(out)[2] / gpuarray_get_elsize(out->ga.typecode);
+  if (out->ga.typecode == GA_FLOAT) {
+    err = blas_ops->sgerBatch(cb_fortran,
+                              PyGpuArray_DIMS(y)[2], PyGpuArray_DIMS(x)[2],
+                              *(float *)PyArray_GETPTR1(alpha, 0),
+                              y_list, offY, str_y, x_list, offX, str_x,
+                              o_list, offOut, str_out,
+                              PyGpuArray_DIMS(x)[0] * PyGpuArray_DIMS(x)[1] * PyGpuArray_DIMS(y)[1], 0);
+  } else if (out->ga.typecode == GA_DOUBLE) {
+    err = blas_ops->dgerBatch(cb_fortran,
+                              PyGpuArray_DIMS(y)[2], PyGpuArray_DIMS(x)[2],
+                              *(double *)PyArray_GETPTR1(alpha, 0),
+                              y_list, offY, str_y, x_list, offX, str_x,
+                              o_list, offOut, str_out,
+                              PyGpuArray_DIMS(x)[0] * PyGpuArray_DIMS(x)[1] * PyGpuArray_DIMS(y)[1], 0);
+  } else if (out->ga.typecode == GA_HALF) {
+    err = blas_ops->hgerBatch(cb_fortran,
+                              PyGpuArray_DIMS(y)[2], PyGpuArray_DIMS(x)[2],
+                              *(float *)PyArray_GETPTR1(alpha, 0),
+                              y_list, offY, str_y, x_list, offX, str_x,
+                              o_list, offOut, str_out,
+                              PyGpuArray_DIMS(x)[0] * PyGpuArray_DIMS(x)[1] * PyGpuArray_DIMS(y)[1], 0);
+  } else {
+    err = GA_INVALID_ERROR;
+  }
+  free(o_list);
+  free(offOut);
+  free(x_list);
+  free(offX);
+  free(y_list);
+  free(offY);
+  if (err != GA_NO_ERROR) {
+    PyErr_SetString(PyExc_RuntimeError, "gerBatch failed");
+    return -1;
+  }
+  *_out = out;
+  return 0;
+}
--- a/theano/sandbox/gpuarray/blocksparse.py
+++ b/theano/sandbox/gpuarray/blocksparse.py
+from __future__ import absolute_import, print_function, division
+import logging
+import os
+import numpy
+from theano import Apply, tensor
+from theano.gof import COp
+from theano.tensor import discrete_dtypes, as_tensor_variable
+from theano.gradient import grad_undefined
+from .type import gpu_context_type
+from .basic_ops import as_gpuarray_variable, infer_context_name
+_logger = logging.getLogger('theano.sandbox.gpuarray.blocksparse')
+class GpuSparseBlockGemv(COp):
+    """
+    GPU version of SparseBlockGemv. Check SparseBlockGemv's docstring for more
+    information.
+    This should not be directly called since the interface is subject
+    to change without notice.  Use the sandbox.blocksparse.sparse_block_dot()
+    function for a stable interface.
+    """
+    __props__ = ('inplace',)
+    params_type = gpu_context_type
+    def __init__(self, inplace=False):
+        COp.__init__(self, "blockgemv.c", "APPLY_SPECIFIC(blockgemv)")
+        self.inplace = inplace
+        if self.inplace:
+            self.destroy_map = {0: [0]}
+    def get_params(self, node):
+        return node.inputs[0].type.context
+    def get_op_params(self):
+        if self.inplace:
+            return [('INPLACE', '1')]
+        else:
+            return []
+    def c_header_dirs(self):
+        return [os.path.dirname(__file__)]
+    def c_headers(self):
+        return ['<gpuarray/buffer_blas.h>', '<gpuarray/buffer.h>',
+                '<gpuarray_helper.h>']
+    def make_node(self, o, W, h, inputIdx, outputIdx):
+        ctx = infer_context_name(o, W, h)
+        o = as_gpuarray_variable(o, ctx)
+        W = as_gpuarray_variable(W, ctx)
+        h = as_gpuarray_variable(h, ctx)
+        inputIdx = as_tensor_variable(inputIdx)
+        outputIdx = as_tensor_variable(outputIdx)
+        assert o.ndim == 3
+        assert W.ndim == 4
+        assert h.ndim == 3
+        assert inputIdx.ndim == 2
+        assert outputIdx.ndim == 2
+        assert inputIdx.type.dtype in discrete_dtypes
+        assert outputIdx.type.dtype in discrete_dtypes
+        return Apply(self, [o, W, h, inputIdx, outputIdx],
+                     [o.type()])
+    def infer_shape(self, node, input_shapes):
+        return [input_shapes[0]]
+    def grad(self, inputs, grads):
+        o, W, h, inputIdx, outputIdx = inputs
+        go = grads[0]
+        Wgrad = gpu_sparse_block_outer(W.zeros_like(),
+                                       h, go, inputIdx, outputIdx)
+        hgrad = gpu_sparse_block_gemv(h.zeros_like(),
+                                      W.dimshuffle((1, 0, 3, 2)),
+                                      go,
+                                      outputIdx, inputIdx)
+        return [go, Wgrad, hgrad,
+                grad_undefined(self, 3, inputIdx,
+                               "grad of inputIdx makes no sense"),
+                grad_undefined(self, 4, outputIdx,
+                               "grad of outputIdx makes no sense")]
+gpu_sparse_block_gemv = GpuSparseBlockGemv(False)
+gpu_sparse_block_gemv_inplace = GpuSparseBlockGemv(True)
+class GpuSparseBlockOuter(COp):
+    """
+    GPU version of SparseBlockOuter. See SparseBlockOuter's docstring for more
+    information.
+    This op should not be called directly since its interface is
+    subject to change without notice.  It is involved in the gradient
+    of GpuSparseBlockGemv. The gradient is not implemented.
+    """
+    __props__ = ('inplace',)
+    params_type = gpu_context_type
+    def __init__(self, inplace=False):
+        COp.__init__(self, ["blockger.c"], "APPLY_SPECIFIC(blockger)")
+        self.inplace = inplace
+        if self.inplace:
+            self.destroy_map = {0: [0]}
+    def get_params(self, node):
+        return node.inputs[0].type.context
+    def get_op_params(self):
+        if self.inplace:
+            return [('INPLACE', '1')]
+        else:
+            return []
+    def make_node(self, o, x, y, xIdx, yIdx, alpha=None):
+        ctx = infer_context_name(o, x, y)
+        one = tensor.constant(numpy.asarray(1.0, dtype='float32'))
+        o = as_gpuarray_variable(o, ctx)
+        x = as_gpuarray_variable(x, ctx)
+        y = as_gpuarray_variable(y, ctx)
+        xIdx = as_tensor_variable(xIdx)
+        yIdx = as_tensor_variable(yIdx)
+        if alpha is None:
+            alpha = one
+        return Apply(self, [o, x, y, xIdx, yIdx, alpha],
+                     [o.type()])
+    def infer_shape(self, node, input_shapes):
+        return [input_shapes[0]]
+    def c_header_dirs(self):
+        return [os.path.dirname(__file__)]
+    def c_headers(self):
+        return ['<gpuarray/buffer_blas.h>', '<gpuarray/buffer.h>',
+                '<gpuarray_helper.h>']
+gpu_sparse_block_outer = GpuSparseBlockOuter(False)
+gpu_sparse_block_outer_inplace = GpuSparseBlockOuter(True)
--- a/theano/sandbox/gpuarray/opt.py
+++ b/theano/sandbox/gpuarray/opt.py
@@ -8,7 +8,7 @@ import theano
 from theano import tensor, scalar, gof
 from theano.compile import optdb
 from theano.compile.ops import shape_i
-from theano.gof import (local_optimizer, EquilibriumDB,
+from theano.gof import (local_optimizer, EquilibriumDB, TopoOptimizer,
                        SequenceDB, Optimizer, toolbox)
 from theano.gof.optdb import LocalGroupDB
 from theano.ifelse import IfElse
@@ -17,6 +17,7 @@ from theano.scalar.basic import Scalar, Pow, Cast
 from theano.scan_module import scan_utils, scan_op, scan_opt
 from theano.tensor.nnet.conv import ConvOp
+from theano.tensor.nnet.blocksparse import SparseBlockGemv, SparseBlockOuter
 from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
                                              AbstractConv2d_gradWeights,
                                              AbstractConv2d_gradInputs)
@@ -33,6 +34,7 @@ from .basic_ops import (as_gpuarray_variable, infer_context_name,
                        GpuEye, gpu_join, GpuJoin)
 from .blas import (gpu_dot22, GpuGemv, GpuGemm, GpuGer, GpuGemmBatch,
                   gpugemm_no_inplace, gpugemmbatch_no_inplace)
+from .blocksparse import GpuSparseBlockGemv, GpuSparseBlockOuter
 from .nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias,
                   GpuCrossentropySoftmax1HotWithBiasDx,
                   GpuSoftmaxWithBias, GpuSoftmax)
@@ -73,6 +75,17 @@ def register_opt(*tags, **kwargs):
        return local_opt
    return f
+def register_inplace(*tags, **kwargs):
+    def f(local_opt):
+        name = (kwargs and kwargs.pop('name')) or local_opt.__name__
+        optdb.register(
+            name, TopoOptimizer(
+                local_opt, failure_callback=TopoOptimizer.warn_inplace),
+            60, 'fast_run', 'inplace', 'gpuarray', *tags)
+        return local_opt
+    return f
 register_opt('fast_compile')(theano.tensor.opt.local_track_shape_i)
 register_opt(final_opt=True, name='gpua_constant_folding')(
    tensor.opt.constant_folding)
@@ -619,9 +632,9 @@ def local_gpua_advanced_subtensor(node, context_name):
 @register_opt('fast_compile')
 @op_lifter([tensor.AdvancedIncSubtensor1])
 def local_gpua_advanced_incsubtensor(node, context_name):
+    context = get_context(context_name)
    # This is disabled on non-cuda contexts
-    if get_context(context_name).kind != 'cuda':
+    if context.kind != 'cuda':
        return None
    x, y, ilist = node.inputs
@@ -635,10 +648,8 @@ def local_gpua_advanced_incsubtensor(node, context_name):
            y = tensor.cast(y, dtype)
    set_instead_of_inc = node.op.set_instead_of_inc
-    active_device_no = theano.sandbox.cuda.active_device_number()
-    device_properties = theano.sandbox.cuda.device_properties
-    compute_capability = device_properties(active_device_no)['major']
+    compute_capability = int(context.bin_id[-2])
    if (compute_capability < 2 or x.ndim != 2 or y.ndim != 2):
        return GpuAdvancedIncSubtensor1(
@@ -865,6 +876,32 @@ theano.tensor.nnet.conv2d()
 """
+@register_opt('fast_compile')
+@op_lifter([SparseBlockGemv])
+def local_lift_sparseblockgemv(node, context_name):
+    return GpuSparseBlockGemv(node.op.inplace)
+@register_opt('fast_compile')
+@op_lifter([SparseBlockOuter])
+def local_lift_sparseblockouter(node, context_name):
+    return GpuSparseBlockOuter(node.op.inplace)
+@register_inplace()
+@local_optimizer([GpuSparseBlockGemv], inplace=True)
+def local_inplace_sparseblockgemv(node):
+    if isinstance(node.op, GpuSparseBlockGemv) and not node.op.inplace:
+        return [GpuSparseBlockGemv(inplace=True)(*node.inputs)]
+@register_inplace()
+@local_optimizer([GpuSparseBlockOuter], inplace=True)
+def local_inplace_sparseblockouter(node):
+    if isinstance(node.op, GpuSparseBlockOuter) and not node.op.inplace:
+        return [GpuSparseBlockOuter(inplace=True)(*node.inputs)]
 # This deals with any abstract convs that have a transfer somewhere
 @register_opt('fast_compile')
 @op_lifter([AbstractConv2d,

--- a/theano/sandbox/gpuarray/tests/test_blocksparse.py
+++ b/theano/sandbox/gpuarray/tests/test_blocksparse.py
+from __future__ import absolute_import, print_function, division
+import numpy
+import theano
+from theano import tensor
+import theano.tests.unittest_tools as utt
+from theano.tensor.nnet.tests import test_blocksparse
+from .config import mode_with_gpu, test_ctx_name
+from ..type import gpuarray_shared_constructor
+from ..blocksparse import (GpuSparseBlockGemv,
+                           GpuSparseBlockOuter,
+                           gpu_sparse_block_gemv,
+                           gpu_sparse_block_outer)
+class BlockSparse_Gemv_and_Outer(test_blocksparse.BlockSparse_Gemv_and_Outer):
+    def setUp(self):
+        utt.seed_rng()
+        self.mode = mode_with_gpu.excluding('constant_folding')
+        self.gemv_op = gpu_sparse_block_gemv
+        self.outer_op = gpu_sparse_block_outer
+        self.gemv_class = GpuSparseBlockGemv
+        self.outer_class = GpuSparseBlockOuter
+    # This test is temporarily disabled since we disabled the output_merge
+    # and alpha_merge optimizations for blocksparse due to brokeness.
+    # Re-enable when those are re-added.
+    def Xtest_blocksparse_grad_merge(self):
+        b = tensor.fmatrix()
+        h = tensor.ftensor3()
+        iIdx = tensor.lmatrix()
+        oIdx = tensor.lmatrix()
+        W_val, h_val, iIdx_val, b_val, oIdx_val = self.gemv_data()
+        W = gpuarray_shared_constructor(W_val, context=test_ctx_name)
+        o = gpu_sparse_block_gemv(b.take(oIdx, axis=0), W, h, iIdx, oIdx)
+        gW = theano.grad(o.sum(), W)
+        lr = numpy.asarray(0.05, dtype='float32')
+        upd = W - lr * gW
+        f1 = theano.function([h, iIdx, b, oIdx], updates=[(W, upd)],
+                             mode=mode_with_gpu)
+        # Make sure the lr update was merged.
+        assert isinstance(f1.maker.fgraph.outputs[0].owner.op,
+                          GpuSparseBlockOuter)
+        # Exclude the merge optimizations.
+        mode = mode_with_gpu.excluding('local_merge_blocksparse_alpha')
+        mode = mode.excluding('local_merge_blocksparse_output')
+        f2 = theano.function([h, iIdx, b, oIdx], updates=[(W, upd)], mode=mode)
+        # Make sure the lr update is not merged.
+        assert not isinstance(f2.maker.fgraph.outputs[0].owner.op,
+                              GpuSparseBlockOuter)
+        f2(h_val, iIdx_val, b_val, oIdx_val)
+        W_ref = W.get_value()
+        # reset the var
+        W.set_value(W_val)
+        f1(h_val, iIdx_val, b_val, oIdx_val)
+        W_opt = W.get_value()
+        utt.assert_allclose(W_ref, W_opt)
--- a/theano/tensor/nnet/tests/test_blocksparse.py
+++ b/theano/tensor/nnet/tests/test_blocksparse.py
@@ -216,9 +216,7 @@ class BlockSparse_Gemv_and_Outer(utt.InferShapeTester):
        utt.verify_grad(op, [b_val, h_val, W_val], mode=self.mode, eps=eps)
    def test_sparseblockgemv_grad_1(self):
-        """
+        # Test that we correctly handle cases where dimensions are 1.
-            Test that we correctly handle cases where dimensions are 1.
-        """
        h_val = randn(1, 1, 1).astype('float32')
        iIdx_val = numpy.random.permutation(1)[:1][None, :]
        oIdx_val = numpy.random.permutation(1)[:1][None, :]