Commit 42f4cb3e authored by Arnaud Bergeron

Add batch support to blocksparse.

Parent 47d59687
@@ -74,24 +74,29 @@ def ger(alpha, x, y, A):

 class SparseBlockGemvSS(GpuOp):
     def __init__(self, inplace=False):
         self.inplace = inplace
+        if self.inplace:
+            self.destroy_map = {0: [0]}

     def __eq__(self, other):
-        return type(self) == type(other)
+        return type(self) == type(other) and self.inplace == other.inplace

     def __hash__(self):
-        return hash(type(self))
+        return hash(type(self)) ^ hash(self.inplace)

     def __str__(self):
-        return "SparseBlockGemvSS"
+        return "SparseBlockGemvSS%s" % ("{inplace}" if self.inplace else "")

     def make_node(self, o, W, h, inputIdx, outputIdx):
         o = basic_ops.as_cuda_ndarray_variable(o)
         W = basic_ops.as_cuda_ndarray_variable(W)
         h = basic_ops.as_cuda_ndarray_variable(h)
-        assert o.ndim == 2
+        assert o.ndim == 3
         assert W.ndim == 4
-        assert h.ndim == 2
-        assert inputIdx.ndim == 1
-        assert outputIdx.ndim == 1
+        assert h.ndim == 3
+        assert inputIdx.ndim == 2
+        assert outputIdx.ndim == 2
         assert 'int' in inputIdx.type.dtype
         assert 'int' in outputIdx.type.dtype
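With the batch dimension added, the shape contract checked by make_node becomes: o is (batchSize, outputWindowSize, outputSize), h is (batchSize, inputWindowSize, inputSize), W stays (nInputBlock, nOutputBlock, inputSize, outputSize), and the index inputs become one row of block indices per sample. A minimal numpy sketch of these shapes (the concrete sizes are illustrative, not taken from the diff):

    import numpy

    batchSize, iWin, oWin = 4, 7, 9
    nInBlock, nOutBlock, iSize, oSize = 128, 64, 40, 30

    o = numpy.zeros((batchSize, oWin, oSize), dtype='float32')        # ndim == 3
    W = numpy.zeros((nInBlock, nOutBlock, iSize, oSize), dtype='float32')
    h = numpy.zeros((batchSize, iWin, iSize), dtype='float32')        # ndim == 3
    inputIdx = numpy.zeros((batchSize, iWin), dtype='int64')          # ndim == 2
    outputIdx = numpy.zeros((batchSize, oWin), dtype='int64')         # ndim == 2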
@@ -101,7 +106,6 @@ class SparseBlockGemvSS(GpuOp):

     def c_support_code(self):
         return """
-// This is NOT batch-ready
 __global__ void
 SparseBlockGemv_fill_lists(
 int n,
@@ -109,18 +113,38 @@ const float **inp_list,
 float **out_list,
 const float **W_list,
 const float *W, int W_str_0, int W_str_1,
-const float *h, int h_str_0,
-float *outB, int o_str_0, int o_str_1,
-const npy_intp *iIdx,
-const npy_intp *oIdx
+const float *h, int h_str_0, int h_str_1,
+float *outB, int o_str_0, int o_str_1, int o_str_2,
+const npy_intp *iIdx, int iI_str_0,
+const npy_intp *oIdx, int oI_str_0
 ) {
   int i = threadIdx.x + blockDim.x * blockIdx.x;
   int j = threadIdx.y + blockDim.y * blockIdx.y;
-  int p = i + j * blockDim.x * gridDim.x;
+  int b = threadIdx.z + blockDim.z * blockIdx.z;
+  int p = i + j * blockDim.x * gridDim.x +
+          b * blockDim.y * gridDim.y * blockDim.x * gridDim.x;
   if (p >= n) return;
-  inp_list[p] = &h[i * h_str_0];
-  out_list[p] = &outB[i * o_str_0 + j * o_str_1];
-  W_list[p] = &W[iIdx[i] * W_str_0 + oIdx[j] * W_str_1];
+  inp_list[p] = &h[b * h_str_0 + i * h_str_1];
+  out_list[p] = &outB[b * o_str_0 + i * o_str_1 + j * o_str_2];
+  W_list[p] = &W[iIdx[b*iI_str_0+i] * W_str_0 +
+                 oIdx[b*oI_str_0+j] * W_str_1];
 }
+
+__global__ void
+SparseBlockGemv_reduce(
+int red_dim,
+float *outB, int i_str_0, int i_str_1, int i_str_2, int i_str_3,
+float *out, int o_str_0, int o_str_1, int o_str_2
+) {
+  int i = threadIdx.x + blockDim.x * blockIdx.x;
+  int j = threadIdx.y + blockDim.y * blockIdx.y;
+  int b = threadIdx.z + blockDim.z * blockIdx.z;
+  float s = 0.0;
+  float *oB = &outB[b * i_str_0 + i * i_str_2 + j * i_str_3];
+  for (int k = 0; k < red_dim; k++) {
+    s += oB[k * i_str_1];
+  }
+  out[b * o_str_0 + i * o_str_1 + j * o_str_2] += s;
+}

 static int SparseBlockGemv_copy(PyArrayObject *a, npy_intp *b) {
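Each thread of the batched SparseBlockGemv_fill_lists handles one (input block, output block, sample) triple and writes the three operand pointers that the matching cublasSgemmBatched entry will consume. A host-side sketch of the same enumeration, with symbolic tuples standing in for device addresses (illustrative only, not the shipped code):

    def fill_lists(batchSize, iWin, oWin, iIdx, oIdx):
        # p runs in the same order as the kernel's flattening:
        # p = i + j * iWin + b * iWin * oWin.
        inp_list, out_list, W_list = [], [], []
        for b in range(batchSize):
            for j in range(oWin):
                for i in range(iWin):
                    inp_list.append(('h', b, i))                  # &h[b*h_str_0 + i*h_str_1]
                    out_list.append(('outB', b, i, j))            # &outB[b, i, j, 0]
                    W_list.append(('W', iIdx[b, i], oIdx[b, j]))  # selected weight block
        return inp_list, out_list, W_list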
@@ -152,7 +176,6 @@ const npy_intp *oIdx
 static npy_intp *%(n)s_oIdx;
 static size_t %(n)s_oIdx_len;

-// This is batch-ready
 static int %(n)s_prep(int b, int i, int j, int outsize) {
   int s = b*i*j;
   if (%(n)s_list_len < s) {
@@ -187,22 +210,25 @@ const npy_intp *oIdx
         o, W, h, inputIdx, outputIdx = inputs
         out = outputs[0]

-        dd = (o.shape[0] * h.shape[0],)
+        dd = (o.shape[0] * o.shape[1] * h.shape[1],)
         weightHostB = numpy.empty(dd, dtype='intp')
         outputHostB = numpy.empty(dd, dtype='intp')
         inputHostB = numpy.empty(dd, dtype='intp')
-        outputBatched = pycuda.gpuarray.GPUArray((h.shape[0], o.shape[0], o.shape[1]), dtype='float32')
+        outputBatched = pycuda.gpuarray.GPUArray((h.shape[0], h.shape[1],
+                                                  o.shape[1], o.shape[2]),
+                                                 dtype='float32')
         k = 0
-        for j in range(o.shape[0]):
-            out_id = outputIdx[j]
-            for i in range(h.shape[0]):
-                inp_id = inputIdx[i]
-                weightHostB[k] = W[inp_id, out_id].gpudata
-                outputHostB[k] = outputBatched[i, j].ptr
-                inputHostB[k] = h[i].gpudata
-                k += 1
+        for b in range(o.shape[0]):
+            for j in range(o.shape[1]):
+                out_id = outputIdx[b, j]
+                for i in range(h.shape[1]):
+                    inp_id = inputIdx[b, i]
+                    weightHostB[k] = W[inp_id, out_id].gpudata
+                    outputHostB[k] = outputBatched[b, i, j].ptr
+                    inputHostB[k] = h[b, i].gpudata
+                    k += 1
         weightB = pycuda.gpuarray.to_gpu(weightHostB)
         inputB = pycuda.gpuarray.to_gpu(inputHostB)
@@ -215,13 +241,13 @@ const npy_intp *oIdx
             lda = W.strides[3]

-        gemm_batched(tA, 'n', o.shape[1], 1, h.shape[1],
-                     weightB, lda, inputB, h.strides[0],
-                     outputB, o.strides[0],
+        gemm_batched(tA, 'n', o.shape[2], 1, h.shape[2],
+                     weightB, lda, inputB, h.strides[1],
+                     outputB, o.strides[1],
                      beta=numpy.asarray(0.0, dtype='float32'))

         outputBatchedG = to_cudandarray(outputBatched)
-        out[0] = o + outputBatchedG.reduce_sum([1, 0, 0])
+        out[0] = o + outputBatchedG.reduce_sum([0, 1, 0, 0])

     def infer_shape(self, node, input_shapes):
         return [input_shapes[0]]
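In numpy terms, the pycuda perform path computes, for every sample b and output block j, the sum over input blocks i of a small gemv against the weight block selected by the index matrices, then adds o. A reference sketch (it mirrors the Op's result, not its batched-gemm implementation):

    def sparse_block_gemv_ref(o, W, h, inputIdx, outputIdx):
        out = o.copy()
        for b in range(o.shape[0]):
            for j in range(o.shape[1]):
                for i in range(h.shape[1]):
                    w = W[inputIdx[b, i], outputIdx[b, j]]
                    out[b, j] += numpy.dot(h[b, i], w)  # one 1 x inputSize gemv
        return out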
@@ -230,27 +256,34 @@ const npy_intp *oIdx
         o, W, h, inputIdx, outputIdx = inputs
         out = outputs[0]

-        return """
-if (%(name)s_prep(1, // NOT batch-ready
-                  CudaNdarray_HOST_DIMS(%(h)s)[0],
-                  CudaNdarray_HOST_DIMS(%(o)s)[0],
-                  CudaNdarray_HOST_DIMS(%(o)s)[1]) == -1) {
+        if self.inplace:
+            res = """
+Py_XDECREF(%(out)s);
+%(out)s = %(o)s;
+Py_INCREF(%(out)s);
+""" % dict(out=out, o=o)
+        else:
+            res = """
+if (CudaNdarray_prep_output(&%(out)s, 3, CudaNdarray_HOST_DIMS(%(o)s)))
+{
+  PyErr_SetString(PyExc_RuntimeError, "Cannot allocate output");
+  %(fail)s
+}
+if (CudaNdarray_CopyFromCudaNdarray(%(out)s, %(o)s)) {
+  PyErr_SetString(PyExc_RuntimeError, "Cannot copy data to output");
+  %(fail)s
+}
+""" % dict(out=out, o=o, fail=sub['fail'])
+        return res + """
+if (%(name)s_prep(CudaNdarray_HOST_DIMS(%(o)s)[0],
+                  CudaNdarray_HOST_DIMS(%(h)s)[1],
+                  CudaNdarray_HOST_DIMS(%(o)s)[1],
+                  CudaNdarray_HOST_DIMS(%(o)s)[2]) == -1) {
   PyErr_SetString(PyExc_RuntimeError,
                   "Could not allocate working memory.");
   %(fail)s
 }
-{
-  // NOT batch-ready
-  int dims[3];
-  dims[0] = 1; // This is to facilitate the reduction at the end.
-  dims[1] = CudaNdarray_HOST_DIMS(%(o)s)[0];
-  dims[2] = CudaNdarray_HOST_DIMS(%(o)s)[1];
-  if (CudaNdarray_prep_output(&%(out)s, 3, dims)) {
-    PyErr_SetString(PyExc_RuntimeError, "Cannot allocate output");
-    %(fail)s
-  }
-}
-// This is batch-ready
 if (SparseBlockGemv_copy(%(inputIdx)s, %(name)s_iIdx) == -1)
 { %(fail)s }
 if (SparseBlockGemv_copy(%(outputIdx)s, %(name)s_oIdx) == -1)
@@ -258,21 +291,23 @@ const npy_intp *oIdx
 { %(fail)s }

 { /* Prepare lists for the batch */
-  // NOT batch-ready
   dim3 block;
-  block.x = CudaNdarray_HOST_DIMS(%(h)s)[0];
-  block.y = CudaNdarray_HOST_DIMS(%(o)s)[0];
+  block.x = CudaNdarray_HOST_DIMS(%(h)s)[1];
+  block.y = CudaNdarray_HOST_DIMS(%(o)s)[1];
+  block.z = CudaNdarray_HOST_DIMS(%(o)s)[0]; // batch size
   SparseBlockGemv_fill_lists<<<block, 1>>>(
-    block.x*block.y,
+    block.x*block.y*block.z,
     %(name)s_inp_list,
     %(name)s_out_list,
     %(name)s_W_list,
     CudaNdarray_DEV_DATA(%(W)s),
     CudaNdarray_HOST_STRIDES(%(W)s)[0], CudaNdarray_HOST_STRIDES(%(W)s)[1],
-    CudaNdarray_DEV_DATA(%(h)s), CudaNdarray_HOST_STRIDES(%(h)s)[0],
+    CudaNdarray_DEV_DATA(%(h)s), CudaNdarray_HOST_STRIDES(%(h)s)[0], CudaNdarray_HOST_STRIDES(%(h)s)[1],
     %(name)s_outB,
-    CudaNdarray_HOST_DIMS(%(o)s)[0] * CudaNdarray_HOST_DIMS(%(o)s)[1],
-    CudaNdarray_HOST_DIMS(%(o)s)[1],
-    %(name)s_iIdx,
-    %(name)s_oIdx);
+    CudaNdarray_HOST_DIMS(%(h)s)[1] * CudaNdarray_HOST_DIMS(%(o)s)[1] * CudaNdarray_HOST_DIMS(%(o)s)[2],
+    CudaNdarray_HOST_DIMS(%(o)s)[1] * CudaNdarray_HOST_DIMS(%(o)s)[2],
+    CudaNdarray_HOST_DIMS(%(o)s)[2],
+    %(name)s_iIdx, PyArray_DIM(%(inputIdx)s, 1),
+    %(name)s_oIdx, PyArray_DIM(%(outputIdx)s, 1));
 }

 { /* Run SgemmBatched */
   float alpha = 1.0;
@@ -285,50 +320,46 @@ CudaNdarray_HOST_DIMS(%(o)s)[1],
     lda = CudaNdarray_HOST_STRIDES(%(W)s)[3];
   }
   err = cublasSgemmBatched(handle, transA, CUBLAS_OP_N,
-                           CudaNdarray_HOST_DIMS(%(o)s)[1], 1,
-                           CudaNdarray_HOST_DIMS(%(h)s)[1], &alpha,
+                           CudaNdarray_HOST_DIMS(%(o)s)[2], 1,
+                           CudaNdarray_HOST_DIMS(%(h)s)[2], &alpha,
                            %(name)s_W_list, lda, %(name)s_inp_list,
-                           CudaNdarray_HOST_STRIDES(%(h)s)[0],
+                           CudaNdarray_HOST_STRIDES(%(h)s)[1],
                            &beta, %(name)s_out_list,
-                           CudaNdarray_HOST_STRIDES(%(o)s)[0],
-                           CudaNdarray_HOST_DIMS(%(o)s)[0] *
-                           CudaNdarray_HOST_DIMS(%(h)s)[0]);
+                           CudaNdarray_HOST_STRIDES(%(o)s)[1],
+                           CudaNdarray_HOST_DIMS(%(o)s)[1] *
+                           CudaNdarray_HOST_DIMS(%(h)s)[1] *
+                           CudaNdarray_HOST_DIMS(%(o)s)[0]);
   if (err != CUBLAS_STATUS_SUCCESS) {
     PyErr_SetString(PyExc_RuntimeError, "SgemmBatched failed");
     %(fail)s
   }
 }
 { /* Perform final reduction and add biases */
-  CudaNdarray *tmp;
-  int p[2];
-  p[0] = 1;
-  p[1] = 2;
-  tmp = (CudaNdarray *)CudaNdarray_new_nd(3);
-  if (tmp == NULL) { %(fail)s }
-  CudaNdarray_set_dim(tmp, 0, CudaNdarray_HOST_DIMS(%(h)s)[0]);
-  CudaNdarray_set_stride(tmp, 0, CudaNdarray_HOST_DIMS(%(o)s)[0] *
-                                 CudaNdarray_HOST_DIMS(%(o)s)[1]);
-  CudaNdarray_set_dim(tmp, 1, CudaNdarray_HOST_DIMS(%(o)s)[0]);
-  CudaNdarray_set_stride(tmp, 1, CudaNdarray_HOST_DIMS(%(o)s)[1]);
-  CudaNdarray_set_dim(tmp, 2, CudaNdarray_HOST_DIMS(%(o)s)[1]);
-  CudaNdarray_set_stride(tmp, 2, 1);
-  CudaNdarray_set_device_data(tmp, %(name)s_outB, (PyObject *)NULL);
-  if (CudaNdarray_reduce_sum(%(out)s, tmp) ||
-      CudaNdarray_dimshuffle(%(out)s, 2, p)) {
-    Py_DECREF(tmp);
-    %(fail)s;
-  }
-  Py_DECREF(tmp);
-  if (CudaNdarray_inplace_add((PyObject *)%(out)s, (PyObject *)%(o)s) == NULL) {
-    %(fail)s;
-  }
+  dim3 block;
+  block.x = CudaNdarray_HOST_DIMS(%(o)s)[1];
+  block.y = CudaNdarray_HOST_DIMS(%(o)s)[2];
+  block.z = CudaNdarray_HOST_DIMS(%(o)s)[0];
+  SparseBlockGemv_reduce<<<block, 1>>>(
+    CudaNdarray_HOST_DIMS(%(h)s)[1],
+    %(name)s_outB,
+    CudaNdarray_HOST_DIMS(%(h)s)[1] *
+    CudaNdarray_HOST_DIMS(%(o)s)[1] *
+    CudaNdarray_HOST_DIMS(%(o)s)[2],
+    CudaNdarray_HOST_DIMS(%(o)s)[1] *
+    CudaNdarray_HOST_DIMS(%(o)s)[2],
+    CudaNdarray_HOST_DIMS(%(o)s)[2],
+    1,
+    CudaNdarray_DEV_DATA(%(out)s),
+    CudaNdarray_HOST_STRIDES(%(out)s)[0],
+    CudaNdarray_HOST_STRIDES(%(out)s)[1],
+    CudaNdarray_HOST_STRIDES(%(out)s)[2]);
 }
 // And we're done!
 """ % dict(out=out, h=h, o=o, inputIdx=inputIdx, outputIdx=outputIdx,
            W=W, fail=sub['fail'], name=nodename)

     def c_code_cache_version(self):
-        return (3,)
+        return (5,)

     def grad(self, inputs, grads):
         o, W, h, inputIdx, outputIdx = inputs
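The new SparseBlockGemv_reduce kernel replaces the old reduce_sum/dimshuffle round trip: outB is laid out C-contiguously as (batchSize, inputWindowSize, outputWindowSize, outputSize), and the kernel accumulates its sum over the input-window axis directly into out, which already holds a copy of o. Equivalent numpy, as a sanity sketch under that layout assumption (sizes illustrative):

    outB = numpy.random.rand(4, 7, 9, 30).astype('float32')  # (batch, iWin, oWin, oSize)
    out = numpy.zeros((4, 9, 30), dtype='float32')           # starts as a copy of o
    out += outB.sum(axis=1)  # the k-loop over red_dim == iWin in the kernel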
@@ -348,7 +379,8 @@ CudaNdarray_HOST_DIMS(%(o)s)[1],
                 "grad of outputIdx makes no sense")]


-sparse_block_gemv_ss = SparseBlockGemvSS()
+sparse_block_gemv_ss = SparseBlockGemvSS(False)
+sparse_block_gemv_ss_inplace = SparseBlockGemvSS(True)


 class SparseBlockOuterSS(GpuOp):
@@ -385,27 +417,28 @@ class SparseBlockOuterSS(GpuOp):
         if not self.inplace:
             o = o.copy()

-        dd = (x.shape[0] * y.shape[0],)
+        dd = (x.shape[0] * x.shape[1] * y.shape[1],)
         xHostB = numpy.empty(dd, dtype='intp')
         yHostB = numpy.empty(dd, dtype='intp')
         outHostB = numpy.empty(dd, dtype='intp')
         k = 0
-        for j in range(y.shape[0]):
-            out_id = yIdx[j]
-            for i in range(x.shape[0]):
-                inp_id = xIdx[i]
-                outHostB[k] = o[inp_id, out_id].gpudata
-                xHostB[k] = x[i].gpudata
-                yHostB[k] = y[j].gpudata
-                k += 1
+        for b in range(x.shape[0]):
+            for j in range(y.shape[1]):
+                out_id = yIdx[b, j]
+                for i in range(x.shape[1]):
+                    inp_id = xIdx[b, i]
+                    outHostB[k] = o[inp_id, out_id].gpudata
+                    xHostB[k] = x[b, i].gpudata
+                    yHostB[k] = y[b, j].gpudata
+                    k += 1
         xB = pycuda.gpuarray.to_gpu(xHostB)
         yB = pycuda.gpuarray.to_gpu(yHostB)
         outB = pycuda.gpuarray.to_gpu(outHostB)

-        gemm_batched('n', 't', y.shape[1], x.shape[1], 1,
-                     yB, y.strides[0], xB, x.strides[0],
+        gemm_batched('n', 't', y.shape[2], x.shape[2], 1,
+                     yB, y.strides[1], xB, x.strides[1],
                      outB, o.strides[2],
                      alpha=alpha, beta=beta)
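The batched outer product updates the weight block picked by (xIdx[b, i], yIdx[b, j]) with a rank-1 term for every sample and index pair. A numpy sketch of the accumulation (beta handling is left out for clarity; the Op folds alpha and beta into gemm_batched):

    def sparse_block_outer_ref(o, x, y, xIdx, yIdx, alpha=1.0):
        out = o.copy()
        for b in range(x.shape[0]):
            for j in range(y.shape[1]):
                for i in range(x.shape[1]):
                    out[xIdx[b, i], yIdx[b, j]] += alpha * numpy.outer(x[b, i], y[b, j])
        return out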
@@ -422,19 +455,22 @@ int n,
 const float **x_list,
 const float **y_list,
 float **out_list,
-const float *x, int x_str_0,
-const float *y, int y_str_0,
+const float *x, int x_str_0, int x_str_1,
+const float *y, int y_str_0, int y_str_1,
 float *out, int o_str_0, int o_str_1,
-const npy_intp *xIdx,
-const npy_intp *yIdx
+const npy_intp *xIdx, int xI_str_0,
+const npy_intp *yIdx, int yI_str_0
 ) {
   int i = threadIdx.x + blockDim.x * blockIdx.x;
   int j = threadIdx.y + blockDim.y * blockIdx.y;
-  int p = i + j * blockDim.x * gridDim.x;
+  int b = threadIdx.z + blockDim.z * blockIdx.z;
+  int p = i + j * blockDim.x * gridDim.x +
+          b * blockDim.y * gridDim.y * blockDim.x * gridDim.x;
   if (p >= n) return;
-  x_list[p] = &x[i * x_str_0];
-  y_list[p] = &y[j * y_str_0];
-  out_list[p] = &out[xIdx[i] * o_str_0 + yIdx[j] * o_str_1];
+  x_list[p] = &x[b * x_str_0 + i * x_str_1];
+  y_list[p] = &y[b * y_str_0 + j * y_str_1];
+  out_list[p] = &out[xIdx[b * xI_str_0 + i] * o_str_0 +
+                     yIdx[b * yI_str_0 + j] * o_str_1];
 }

 static int SparseBlockOuter_copy(PyArrayObject *a, npy_intp *b) {
@@ -464,7 +500,6 @@ static size_t %(n)s_xIdx_len;
 static npy_intp *%(n)s_yIdx;
 static size_t %(n)s_yIdx_len;

-// This is batch-ready
 static int %(n)s_prep(int b, int i, int j) {
   int s = b*i*j;
   if (%(n)s_list_len < s) {
@@ -515,8 +550,9 @@ if (CudaNdarray_CopyFromCudaNdarray(%(out)s, %(o)s)) {
 """ % dict(out=out, o=o, fail=sub['fail'])
         return res + """
-if (%(name)s_prep(1, CudaNdarray_HOST_DIMS(%(x)s)[0],
-                  CudaNdarray_HOST_DIMS(%(y)s)[0]) == -1) {
+if (%(name)s_prep(CudaNdarray_HOST_DIMS(%(x)s)[0],
+                  CudaNdarray_HOST_DIMS(%(x)s)[1],
+                  CudaNdarray_HOST_DIMS(%(y)s)[1]) == -1) {
   PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory.");
   %(fail)s
 }
@@ -526,29 +562,32 @@ if (SparseBlockOuter_copy(%(yIdx)s, %(name)s_yIdx) == -1)
 { %(fail)s }

 {
   dim3 block;
-  block.x = CudaNdarray_HOST_DIMS(%(x)s)[0];
-  block.y = CudaNdarray_HOST_DIMS(%(y)s)[0];
+  block.x = CudaNdarray_HOST_DIMS(%(x)s)[1];
+  block.y = CudaNdarray_HOST_DIMS(%(y)s)[1];
+  block.z = CudaNdarray_HOST_DIMS(%(x)s)[0];
   SparseBlockOuter_fill_lists<<<block, 1>>>(
-    block.x * block.y,
+    block.x * block.y * block.z,
     %(name)s_x_list,
     %(name)s_y_list,
     %(name)s_out_list,
-    CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_HOST_STRIDES(%(x)s)[0],
-    CudaNdarray_DEV_DATA(%(y)s), CudaNdarray_HOST_STRIDES(%(y)s)[0],
+    CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_HOST_STRIDES(%(x)s)[0], CudaNdarray_HOST_STRIDES(%(x)s)[1],
+    CudaNdarray_DEV_DATA(%(y)s), CudaNdarray_HOST_STRIDES(%(y)s)[0], CudaNdarray_HOST_STRIDES(%(y)s)[1],
     CudaNdarray_DEV_DATA(%(out)s),
     CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
-    %(name)s_xIdx,
-    %(name)s_yIdx);
+    %(name)s_xIdx, PyArray_DIM(%(xIdx)s, 1),
+    %(name)s_yIdx, PyArray_DIM(%(yIdx)s, 1));
 }
 {
   cublasStatus_t err;
   err = cublasSgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_T,
-    CudaNdarray_HOST_DIMS(%(y)s)[1], CudaNdarray_HOST_DIMS(%(x)s)[1], 1,
+    CudaNdarray_HOST_DIMS(%(y)s)[2], CudaNdarray_HOST_DIMS(%(x)s)[2], 1,
     (float *)PyArray_GETPTR1(%(alpha)s, 0), %(name)s_y_list,
-    CudaNdarray_HOST_STRIDES(%(y)s)[0], %(name)s_x_list,
-    CudaNdarray_HOST_STRIDES(%(x)s)[0], (float *)PyArray_GETPTR1(%(beta)s, 0),
+    CudaNdarray_HOST_STRIDES(%(y)s)[1], %(name)s_x_list,
+    CudaNdarray_HOST_STRIDES(%(x)s)[1], (float *)PyArray_GETPTR1(%(beta)s, 0),
     %(name)s_out_list, CudaNdarray_HOST_STRIDES(%(out)s)[2],
-    CudaNdarray_HOST_DIMS(%(x)s)[0] * CudaNdarray_HOST_DIMS(%(y)s)[0]);
+    CudaNdarray_HOST_DIMS(%(x)s)[0] *
+    CudaNdarray_HOST_DIMS(%(x)s)[1] *
+    CudaNdarray_HOST_DIMS(%(y)s)[1]);
   if (err != CUBLAS_STATUS_SUCCESS) {
     PyErr_SetString(PyExc_RuntimeError, "SgemmBatched failed");
     %(fail)s
@@ -557,7 +596,7 @@ CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
            alpha=alpha, beta=beta, fail=sub['fail'])

     def c_code_cache_version(self):
-        return (1,)
+        return (2,)


 sparse_block_outer_ss = SparseBlockOuterSS(False)
@@ -565,6 +604,12 @@ sparse_block_outer_ss_inplace = SparseBlockOuterSS(True)

 if cuda_available:
+    @opt.register_opt()
+    @opt.local_optimizer([sparse_block_gemv_ss], inplace=True)
+    def local_inplace_blocksparse_gemv(node):
+        if node.op == sparse_block_gemv_ss:
+            return [sparse_block_gemv_ss_inplace(*node.inputs)]
+
     @opt.register_opt()
     @opt.local_optimizer([sparse_block_outer_ss], inplace=True)
     def local_inplace_blocksparse_outer(node):
......
@@ -28,6 +28,9 @@ else:
     mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')


+def setup():
+    utt.seed_rng()
+

 def blocksparse_data():
     nInputBlock = 128
     nOutputBlock = 64
@@ -35,10 +38,11 @@ def blocksparse_data():
     outputSize = 30
     inputWindowSize = 7
     outputWindowSize = 9
+    batchSize = 4

-    input = randn(inputWindowSize, inputSize).astype('float32')
-    inputIndice = numpy.random.permutation(nInputBlock)[:inputWindowSize]
-    outputIndice = numpy.random.permutation(nOutputBlock)[:outputWindowSize]
+    input = randn(batchSize, inputWindowSize, inputSize).astype('float32')
+    inputIndice = numpy.vstack([numpy.random.permutation(nInputBlock)[:inputWindowSize]
+                                for _ in range(batchSize)])
+    outputIndice = numpy.vstack([numpy.random.permutation(nOutputBlock)[:outputWindowSize]
+                                 for _ in range(batchSize)])
     weight = randn(nInputBlock, nOutputBlock, inputSize, outputSize).astype('float32')
     bias = randn(nOutputBlock, outputSize).astype('float32')
@@ -47,24 +51,24 @@ def blocksparse_data():
 def blocksparse(W, h, iIdx, b, oIdx):
     o = b.take(oIdx, axis=0)

-    for j in range(o.shape[0]):
-        outputIdx = oIdx[j]
-        for i in range(h.shape[0]):
-            inputIdx = iIdx[i]
-            w = W[inputIdx, outputIdx]
-            # this below is a gemv I think
-            o[j, :] += numpy.dot(h[i], w)
+    for b in range(o.shape[0]):
+        for j in range(o.shape[1]):
+            outputIdx = oIdx[b, j]
+            for i in range(h.shape[1]):
+                inputIdx = iIdx[b, i]
+                w = W[inputIdx, outputIdx]
+                # this below is a gemv I think
+                o[b, j, :] += numpy.dot(h[b, i], w)
     return o
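The same reference can be written without the Python loops; a vectorized sketch using advanced indexing and einsum (the function name is illustrative, the result matches the loop version above):

    def blocksparse_vec(W, h, iIdx, b, oIdx):
        o = b.take(oIdx, axis=0)                      # (batch, oWin, oSize)
        # Wsel[b, i, j] = W[iIdx[b, i], oIdx[b, j]]   -> (batch, iWin, oWin, iSize, oSize)
        Wsel = W[iIdx[:, :, None], oIdx[:, None, :]]
        return o + numpy.einsum('bik,bijkl->bjl', h, Wsel)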

 def test_blocksparse():
     b = tensor.fmatrix()
     W = tensor.ftensor4()
-    h = tensor.fmatrix()
-    iIdx = tensor.lvector()
-    oIdx = tensor.lvector()
+    h = tensor.ftensor3()
+    iIdx = tensor.lmatrix()
+    oIdx = tensor.lmatrix()

     o = sparse_block_dot_SS(W, h, iIdx, b, oIdx)
@@ -77,14 +81,16 @@ def test_blocksparse():
     utt.assert_allclose(ref_out, th_out)

+test_blocksparse.setup = setup


 # Test the fortran order for W (which can happen in the grad for some graphs).
 def test_blocksparseF():
     b = tensor.fmatrix()
     W = tensor.ftensor4()
-    h = tensor.fmatrix()
-    iIdx = tensor.lvector()
-    oIdx = tensor.lvector()
+    h = tensor.ftensor3()
+    iIdx = tensor.lmatrix()
+    oIdx = tensor.lmatrix()

     o = sparse_block_dot_SS(GpuDimShuffle((False, False, False, False),
                             (0, 1, 3, 2))(
@@ -102,9 +108,9 @@ def test_blocksparseF():

 def test_blocksparse_grad():
-    h_val = randn(2, 3).astype('float32')
-    iIdx_val = numpy.random.permutation(3)[:2]
-    oIdx_val = numpy.random.permutation(3)[:2]
+    h_val = randn(1, 2, 3).astype('float32')
+    iIdx_val = numpy.random.permutation(3)[:2][None, :]
+    oIdx_val = numpy.random.permutation(3)[:2][None, :]
     W_val = randn(3, 3, 3, 4).astype('float32')
     b_val = randn(3, 4).astype('float32')
@@ -120,9 +126,9 @@ def test_blocksparse_grad():

 def test_blocksparse_grad_shape():
     b = tensor.fmatrix()
     W = tensor.ftensor4()
-    h = tensor.fmatrix()
-    iIdx = tensor.lvector()
-    oIdx = tensor.lvector()
+    h = tensor.ftensor3()
+    iIdx = tensor.lmatrix()
+    oIdx = tensor.lmatrix()

     o = sparse_block_gemv_ss(b.take(oIdx, axis=0), W, h, iIdx, oIdx)
     go = theano.grad(o.sum(), [b, W, h])
@@ -141,9 +147,9 @@ def test_blocksparse_grad_shape():

 def test_blocksparse_grad_merge():
     b = tensor.fmatrix()
-    h = tensor.fmatrix()
-    iIdx = tensor.lvector()
-    oIdx = tensor.lvector()
+    h = tensor.ftensor3()
+    iIdx = tensor.lmatrix()
+    oIdx = tensor.lmatrix()

     W_val, h_val, iIdx_val, b_val, oIdx_val = blocksparse_data()
     W = float32_shared_constructor(W_val)
......