提交 eb4d52aa authored 作者: Arnaud Bergeron's avatar Arnaud Bergeron

Fix most of the problems in blocksparse.

上级 7479d045
#section support_code_apply
int APPLY_SPECIFIC(blockgemv)(PyGpuArrayObject *o, PyGpuArrayObject *W,
                              PyGpuArrayObject *h, PyArrayObject *inputIdx,
                              PyArrayObject *outputIdx,
                              PyGpuArrayObject **_out,
                              PyGpuContextObject *ctx) {
  /* Batched sparse-block gemv: for every batch b, input block i and output
   * block j, accumulate W[inputIdx[b,i], outputIdx[b,j]] . h[b,i] into
   * out[b,j] with a single XgemvBatch call.
   *
   * o         - 3d output template (copied unless INPLACE is defined)
   * W         - 4d weight blocks
   * h         - 3d input blocks
   * inputIdx  - 2d index array selecting W's first axis (DTYPE_INPUT_3)
   * outputIdx - 2d index array selecting W's second axis (DTYPE_INPUT_4)
   * _out      - in/out: previous output (reused if possible), receives result
   *
   * Returns 0 on success, -1 with a Python exception set on failure. */
  PyGpuArrayObject *out = *_out;
  gpudata **W_list = NULL;
  gpudata **inp_list = NULL;
  gpudata **out_list = NULL;
  size_t *offW = NULL;
  size_t *offInp = NULL;
  size_t *offOut = NULL;
  gpuarray_blas_ops *blas_ops;
  int err;

#ifdef INPLACE
  /* Accumulate directly into o; take a new reference for the output. */
  Py_XDECREF(out);
  out = o;
  Py_INCREF(out);
#else
  out = theano_try_copy(out, o);
  if (out == NULL) {
    // Error already set
    return -1;
  }
#endif
  /* Publish the output right away: the old *_out reference was consumed
   * above, so leaving *_out untouched on an error path below would hand
   * the caller a dangling pointer. */
  *_out = out;

  err = ctx->ops->property(ctx->ctx, NULL, NULL,
                           GA_CTX_PROP_BLAS_OPS, &blas_ops);
  if (err != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "Can't get blas ops");
    return -1;
  }
  err = blas_ops->setup(ctx->ctx);
  if (err != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "Can't setup blas");
    return -1;
  }

  /* Prepare the pointer/offset lists for the batch. */
  size_t maxi = PyGpuArray_DIMS(h)[1];
  size_t maxj = PyGpuArray_DIMS(out)[1];
  size_t maxb = PyGpuArray_DIMS(out)[0];
  size_t n = maxi * maxj * maxb;  /* total number of gemv calls */
  ssize_t h_str_0 = PyGpuArray_STRIDES(h)[0];
  ssize_t h_str_1 = PyGpuArray_STRIDES(h)[1];
  /* BUGFIX: take the output data/strides from `out`, not from the input
   * `o`.  In the non-inplace path `out` is a fresh copy whose layout may
   * differ from `o`, and addressing through `o` wrote the results into
   * the input while leaving the returned copy untouched. */
  ssize_t o_str_0 = PyGpuArray_STRIDES(out)[0];
  ssize_t o_str_1 = PyGpuArray_STRIDES(out)[1];
  ssize_t W_str_0 = PyGpuArray_STRIDES(W)[0];
  ssize_t W_str_1 = PyGpuArray_STRIDES(W)[1];
  W_list = (gpudata **)calloc(n, sizeof(gpudata *));
  offW = (size_t *)calloc(n, sizeof(size_t));
  inp_list = (gpudata **)calloc(n, sizeof(gpudata *));
  offInp = (size_t *)calloc(n, sizeof(size_t));
  out_list = (gpudata **)calloc(n, sizeof(gpudata *));
  offOut = (size_t *)calloc(n, sizeof(size_t));
  if (W_list == NULL || offW == NULL ||
      inp_list == NULL || offInp == NULL ||
      out_list == NULL || offOut == NULL) {
    /* free(NULL) is a no-op, so partial allocation is safe to unwind. */
    free(W_list);
    free(offW);
    free(inp_list);
    free(offInp);
    free(out_list);
    free(offOut);
    PyErr_NoMemory();
    return -1;
  }
  for (size_t i = 0; i < maxi; i++) {
    for (size_t j = 0; j < maxj; j++) {
      for (size_t b = 0; b < maxb; b++) {
        size_t p = i + j * maxi + b * maxi * maxj;
        inp_list[p] = h->ga.data;
        offInp[p] = b * h_str_0 + i * h_str_1 + h->ga.offset;
        out_list[p] = out->ga.data;  /* BUGFIX: was o->ga.data */
        offOut[p] = b * o_str_0 + j * o_str_1 + out->ga.offset;
        W_list[p] = W->ga.data;
        /* inputIdx selects the block row of W, outputIdx the block column. */
        offW[p] = *(DTYPE_INPUT_3 *)PyArray_GETPTR2(inputIdx, b, i) * W_str_0 +
                  *(DTYPE_INPUT_4 *)PyArray_GETPTR2(outputIdx, b, j) * W_str_1 +
                  W->ga.offset;
      }
    }
  }

  /* If a W block's inner stride is 1 element, the block is laid out
   * row-major; request the transposed fortran-order product instead. */
  cb_transpose transA = cb_no_trans;
  size_t lda = PyGpuArray_STRIDES(W)[2] / gpuarray_get_elsize(W->ga.typecode);
  if (lda == 1) {
    transA = cb_trans;
    lda = PyGpuArray_STRIDES(W)[3] / gpuarray_get_elsize(W->ga.typecode);
  }
  /* Element (not byte) increments of the batched vectors. */
  size_t incH = PyGpuArray_STRIDES(h)[2] / gpuarray_get_elsize(h->ga.typecode);
  size_t incOut = PyGpuArray_STRIDES(out)[2] / gpuarray_get_elsize(out->ga.typecode);
  if (out->ga.typecode == GA_FLOAT) {
    err = blas_ops->sgemvBatch(cb_fortran, transA,
                               PyGpuArray_DIMS(out)[2],
                               PyGpuArray_DIMS(h)[2], 1,
                               W_list, offW, lda,
                               inp_list, offInp, incH,
                               1, out_list, offOut, incOut,
                               n, 0);
  } else if (out->ga.typecode == GA_DOUBLE) {
    err = blas_ops->dgemvBatch(cb_fortran, transA,
                               PyGpuArray_DIMS(out)[2],
                               PyGpuArray_DIMS(h)[2], 1,
                               W_list, offW, lda,
                               inp_list, offInp, incH,
                               1, out_list, offOut, incOut,
                               n, 0);
  } else {
    err = GA_DEVSUP_ERROR;
  }
  free(W_list);
  free(offW);
  free(inp_list);
  free(offInp);
  free(out_list);
  free(offOut);
  if (err != GA_NO_ERROR) {
    if (out->ga.typecode != GA_FLOAT && out->ga.typecode != GA_DOUBLE)
      /* Unsupported dtype deserves a distinct, actionable error. */
      PyErr_SetString(PyExc_TypeError,
                      "blockgemv: only float32 and float64 are supported");
    else
      PyErr_SetString(PyExc_RuntimeError, "gemvBatch failed");
    return -1;
  }
  return 0;
}
#section support_code_apply
int APPLY_SPECIFIC(blockger)(PyGpuArrayObject *o, PyGpuArrayObject *x,
                             PyGpuArrayObject *y, PyArrayObject *xIdx,
                             PyArrayObject *yIdx, PyArrayObject *alpha,
                             PyGpuArrayObject **_out,
                             PyGpuContextObject *ctx) {
  /* Batched sparse-block outer product: for every batch b and block pair
   * (i, j), accumulate alpha * outer(y[b,j], x[b,i]) into the output block
   * out[xIdx[b,i], yIdx[b,j]] with a single XgerBatch call.
   *
   * o     - 4d output template (copied unless INPLACE is defined)
   * x, y  - 3d input blocks
   * xIdx  - 2d index array into out's first axis (DTYPE_INPUT_3)
   * yIdx  - 2d index array into out's second axis (DTYPE_INPUT_4)
   * alpha - 0d/1-element scalar; assumed to match out's dtype —
   *         TODO(review): confirm make_node enforces this.
   * _out  - in/out: previous output (reused if possible), receives result
   *
   * Returns 0 on success, -1 with a Python exception set on failure. */
  PyGpuArrayObject *out = *_out;
  gpudata **o_list = NULL;
  gpudata **x_list = NULL;
  gpudata **y_list = NULL;
  size_t *offOut = NULL;
  size_t *offX = NULL;
  size_t *offY = NULL;
  gpuarray_blas_ops *blas_ops;
  int err;

  err = ctx->ops->property(ctx->ctx, NULL, NULL,
                           GA_CTX_PROP_BLAS_OPS, &blas_ops);
  if (err != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "Can't get blas ops");
    return -1;
  }
  err = blas_ops->setup(ctx->ctx);
  if (err != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "Can't setup blas");
    return -1;
  }

#ifdef INPLACE
  /* Accumulate directly into o; take a new reference for the output. */
  Py_XDECREF(out);
  out = o;
  Py_INCREF(out);
#else
  out = theano_try_copy(out, o);
  if (out == NULL)
    return -1;
#endif
  /* Publish the output right away: the old *_out reference was consumed
   * above, so leaving *_out untouched on an error path below would hand
   * the caller a dangling pointer. */
  *_out = out;

  /* Prepare the pointer/offset lists for the batch. */
  size_t maxi = PyGpuArray_DIMS(x)[1];
  size_t maxj = PyGpuArray_DIMS(y)[1];
  size_t maxb = PyGpuArray_DIMS(x)[0];
  size_t n = maxi * maxj * maxb;  /* total number of ger calls */
  ssize_t x_str_0 = PyGpuArray_STRIDES(x)[0];
  ssize_t x_str_1 = PyGpuArray_STRIDES(x)[1];
  ssize_t y_str_0 = PyGpuArray_STRIDES(y)[0];
  ssize_t y_str_1 = PyGpuArray_STRIDES(y)[1];
  ssize_t o_str_0 = PyGpuArray_STRIDES(out)[0];
  ssize_t o_str_1 = PyGpuArray_STRIDES(out)[1];
  o_list = (gpudata **)calloc(n, sizeof(gpudata *));
  offOut = (size_t *)calloc(n, sizeof(size_t));
  x_list = (gpudata **)calloc(n, sizeof(gpudata *));
  offX = (size_t *)calloc(n, sizeof(size_t));
  y_list = (gpudata **)calloc(n, sizeof(gpudata *));
  offY = (size_t *)calloc(n, sizeof(size_t));
  if (o_list == NULL || offOut == NULL ||
      x_list == NULL || offX == NULL ||
      y_list == NULL || offY == NULL) {
    /* free(NULL) is a no-op, so partial allocation is safe to unwind. */
    free(o_list);
    free(offOut);
    free(x_list);
    free(offX);
    free(y_list);
    free(offY);
    PyErr_NoMemory();
    return -1;
  }
  for (size_t i = 0; i < maxi; i++) {
    for (size_t j = 0; j < maxj; j++) {
      for (size_t b = 0; b < maxb; b++) {
        size_t p = i + j * maxi + b * maxi * maxj;
        x_list[p] = x->ga.data;
        offX[p] = b * x_str_0 + i * x_str_1 + x->ga.offset;
        y_list[p] = y->ga.data;
        offY[p] = b * y_str_0 + j * y_str_1 + y->ga.offset;
        o_list[p] = out->ga.data;
        /* xIdx/yIdx pick which output block receives this outer product. */
        offOut[p] = *(DTYPE_INPUT_3 *)PyArray_GETPTR2(xIdx, b, i) * o_str_0 +
                    *(DTYPE_INPUT_4 *)PyArray_GETPTR2(yIdx, b, j) * o_str_1 +
                    out->ga.offset;
      }
    }
  }

  /* Element (not byte) increments of the batched vectors. */
  ssize_t str_y = PyGpuArray_STRIDES(y)[2] / gpuarray_get_elsize(y->ga.typecode);
  ssize_t str_x = PyGpuArray_STRIDES(x)[2] / gpuarray_get_elsize(x->ga.typecode);
  ssize_t str_out = PyGpuArray_STRIDES(out)[2] / gpuarray_get_elsize(out->ga.typecode);
  if (out->ga.typecode == GA_FLOAT) {
    err = blas_ops->sgerBatch(cb_fortran,
                              PyGpuArray_DIMS(y)[2], PyGpuArray_DIMS(x)[2],
                              *(float *)PyArray_GETPTR1(alpha, 0),
                              y_list, offY, str_y, x_list, offX, str_x,
                              o_list, offOut, str_out,
                              n, 0);
  } else if (out->ga.typecode == GA_DOUBLE) {
    err = blas_ops->dgerBatch(cb_fortran,
                              PyGpuArray_DIMS(y)[2], PyGpuArray_DIMS(x)[2],
                              *(double *)PyArray_GETPTR1(alpha, 0),
                              y_list, offY, str_y, x_list, offX, str_x,
                              o_list, offOut, str_out,
                              n, 0);
  } else {
    err = GA_DEVSUP_ERROR;
  }
  free(o_list);
  free(offOut);
  free(x_list);
  free(offX);
  free(y_list);
  free(offY);
  if (err != GA_NO_ERROR) {
    if (out->ga.typecode != GA_FLOAT && out->ga.typecode != GA_DOUBLE)
      /* Unsupported dtype deserves a distinct, actionable error. */
      PyErr_SetString(PyExc_TypeError,
                      "blockger: only float32 and float64 are supported");
    else
      /* BUGFIX: message used to claim "sgerBatch" even for the double path. */
      PyErr_SetString(PyExc_RuntimeError, "gerBatch failed");
    return -1;
  }
  return 0;
}
from __future__ import absolute_import, print_function, division
import logging
import os
import numpy
from theano import Op, Apply, tensor
from theano import Apply, tensor
from theano.gof import COp
from theano.tensor import discrete_dtypes
from theano.gradient import grad_undefined
from .basic_ops import as_gpuarray_variable, GpuKernelBase, Kernel
from .type import gpu_context_type
from .basic_ops import as_gpuarray_variable, infer_context_name
_logger = logging.getLogger('theano.sandbox.gpuarray.blocksparse')
try:
import pygpu
from pygpu import gpuarray
except ImportError:
pass
class GpuSparseBlockGemv(Op):
class GpuSparseBlockGemv(COp):
"""
GPU version of SparseBlockGemv. Check SparseBlockGemv's docstring for more
information.
......@@ -27,14 +25,32 @@ class GpuSparseBlockGemv(Op):
function for a stable interface.
"""
__props__ = ('inplace',)
params_type = gpu_context_type
def __init__(self, inplace=False):
COp.__init__(self, "blockgemv.c", "APPLY_SPECIFIC(blockgemv)")
self.inplace = inplace
if self.inplace:
self.destroy_map = {0: [0]}
def get_params(self, node):
return node.inputs[0].type.context
def get_op_params(self):
if self.inplace:
return [('INPLACE', '1')]
else:
return []
def c_header_dirs(self):
return [os.path.dirname(__file__)]
def c_headers(self):
return ['<gpuarray/buffer_blas.h>', '<gpuarray/buffer.h>',
'<gpuarray_helper.h>']
def make_node(self, o, W, h, inputIdx, outputIdx):
ctx = infer_context(o, W, h)
ctx = infer_context_name(o, W, h)
o = as_gpuarray_variable(o, ctx)
W = as_gpuarray_variable(W, ctx)
h = as_gpuarray_variable(h, ctx)
......@@ -53,123 +69,6 @@ class GpuSparseBlockGemv(Op):
def infer_shape(self, node, input_shapes):
return [input_shapes[0]]
def c_code(self, node, nodename, inputs, outputs, sub):
o, W, h, inputIdx, outputIdx = inputs
typecode = o.type.typecode
out = outputs[0]
if self.inplace:
res = """
Py_XDECREF(%(out)s);
%(out)s = %(o)s;
Py_INCREF(%(out)s);
""" % dict(out=out, o=o)
else:
res = """
%(out)s = theano_try_copy(%(out)s, %(o)s);
if (%(out)s == NULL) {
// Error already set
%(fail)s
}
""" % dict(out=out, o=o, typecode=typecode, fail=sub['fail'], ctx=sub['params'])
return res + """{
gpudata **W_list = NULL;
gpudata **inp_list = NULL;
gpudata **out_list = NULL;
size_t *offW = NULL;
size_t *offInp = NULL;
size_t *offOut = NULL;
{ /* Prepare lists for the batch */
size_t maxi = PyGpuArray_DIMS(%(h)s)[1];
size_t maxj = PyGpuArray_DIMS(%(o)s)[1];
size_t maxb = PyGpuArray_DIMS(%(o)s)[0];
ssize_t h_str_0 = PyGpuArray_STRIDES(%(h)s)[0];
ssize_t h_str_1 = PyGpuArray_STRIDES(%(h)s)[1];
ssize_t o_str_0 = PyGpuArray_STRIDES(%(o)s)[0];
ssize_t o_str_1 = PyGpuArray_STRIDES(%(o)s)[1];
ssize_t W_str_0 = PyGpuArray_STRIDES(%(W)s)[0];
ssize_t W_str_1 = PyGpuArray_STRIDES(%(W)s)[1];
W_list = calloc(sizof(gpudata *), maxi * maxj * maxb);
offW = calloc(sizof(size_t), maxi * maxj * maxb);
inp_list = calloc(sizof(gpudata *), maxi * maxj * maxb);
offInp = calloc(sizof(size_t), maxi * maxj * maxb);
out_list = calloc(sizof(gpudata *), maxi * maxj * maxb);
offOut = calloc(sizof(size_t), maxi * maxj * maxb);
if (W_list == NULL || offW == NULL ||
inp_list == NULL || offInp == NULL ||
out_list == NULL || offOut == NULL) {
free(W_list);
free(offW);
free(inp_list);
free(offInp);
free(out_list);
free(offOut);
PyErr_NoMemory();
%(fail)s
}
for (size_t i = 0; i < maxi; i++) {
for (size_t j = 0; j < maxj; j++) {
for (size_t b = 0; b < maxb; b++) {
size_t p = i + j * maxi + b * maxi * maxj;
inp_list[p] = %(h)s->ga.data;
offInp[p] = b * h_str_0 + i * h_str_1 + %(h)s->ga.offset;
out_list[p] = %(o)s->ga.data;
outInp[p] = b * o_str_0 + j * o_str_1 + %(o)s->ga.offset;
W_list[p] = %(W)s->ga.data;
offW[p] = *(%(inputIdx)s_DTYPE *)PyArray_GETPTR2(%(inputIdx)s, b, i) * W_str_0 + *(%(outputIdx)s_DTYPE *)PyArray_GETPTR2(%(outputIdx)s, b, j) * W_str_1 + %(W)s->ga.offset;
}
}
}
}
{ /* Run XgemvBatched */
int err;
cb_transpose transA = cb_no_trans;
size_t lda = PyGpuArray_STRIDES(%(W)s)[2];
if (lda == sizeof(float)) {
transA = cb_trans;
lda = PyGpuArray_STRIDES(%(W)s)[3];
}
if (%(typecode)s == GA_FLOAT) {
err = blas_ops->sgemvBatch(cb_c, transA,
PyGpuArray_DIMS(%(o)s)[2],
PyGpuArray_DIMS(%(h)s)[2], 1,
W_list, offW, lda,
inp_list, offInp, PyGpuArray_STRIDES(%(h)s)[2],
1, out_list, offOut, PyGpuArray_STRIDES(%(o)s)[2],
PyGpuArray_DIMS(%(o)s)[1] * PyGpuArray_DIMS(%(h)s)[1] * PyGpuArray_DIMS(%(o)s)[0], 0);
} else if (%(typecode)s == GA_DOUBLE) {
err = blas_ops->dgemvBatch(cb_c, transA,
PyGpuArray_DIMS(%(o)s)[2],
PyGpuArray_DIMS(%(h)s)[2], 1,
W_list, offW, lda,
inp_list, offInp, PyGpuArray_STRIDES(%(h)s)[2],
1, out_list, offOut, PyGpuArray_STRIDES(%(o)s)[2],
PyGpuArray_DIMS(%(o)s)[1] * PyGpuArray_DIMS(%(h)s)[1] * PyGpuArray_DIMS(%(o)s)[0], 0);
}
free(W_list);
free(offW);
free(inp_list);
free(offInp);
free(out_list);
free(offOut);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "SgemvBatched failed(%%s)",
cublasGetErrorString(err));
%(fail)s
}
}
// And we're done!
}""" % dict(out=out, h=h, o=o, inputIdx=inputIdx, outputIdx=outputIdx,
W=W, fail=sub['fail'], name=nodename)
def c_code_cache_version(self):
return ()
def grad(self, inputs, grads):
o, W, h, inputIdx, outputIdx = inputs
go = grads[0]
......@@ -191,7 +90,7 @@ gpu_sparse_block_gemv = GpuSparseBlockGemv(False)
gpu_sparse_block_gemv_inplace = GpuSparseBlockGemv(True)
class GpuSparseBlockOuter(GpuOp):
class GpuSparseBlockOuter(COp):
"""
GPU version of SparseBlockOuter. See SparseBlockOuter's docstring for more
information.
......@@ -201,17 +100,29 @@ class GpuSparseBlockOuter(GpuOp):
of GpuSparseBlockGemv. The gradient is not implemented.
"""
__props__ = ('inplace',)
params_type = gpu_context_type
def __init__(self, inplace=False):
COp.__init__(self, ["blockger.c"], "APPLY_SPECIFIC(blockger)")
self.inplace = inplace
if self.inplace:
self.destroy_map = {0: [0]}
def get_params(self, node):
return node.inputs[0].type.context
def get_op_params(self):
if self.inplace:
return [('INPLACE', '1')]
else:
return []
def make_node(self, o, x, y, xIdx, yIdx, alpha=None):
ctx = infer_context_name(o, x, y)
one = tensor.constant(numpy.asarray(1.0, dtype='float32'))
o = basic_ops.as_cuda_ndarray_variable(o)
x = basic_ops.as_cuda_ndarray_variable(x)
y = basic_ops.as_cuda_ndarray_variable(y)
o = as_gpuarray_variable(o, ctx)
x = as_gpuarray_variable(x, ctx)
y = as_gpuarray_variable(y, ctx)
if alpha is None:
alpha = one
return Apply(self, [o, x, y, xIdx, yIdx, alpha],
......@@ -220,121 +131,12 @@ class GpuSparseBlockOuter(GpuOp):
def infer_shape(self, node, input_shapes):
return [input_shapes[0]]
def c_support_code(self):
return """
__global__ void
SparseBlockOuter_fill_lists(
int maxi, int maxj,
const float **x_list,
const float **y_list,
float **out_list,
const float *x, int x_str_0, int x_str_1,
const float *y, int y_str_0, int y_str_1,
float *out, int o_str_0, int o_str_1,
const npy_intp *xIdx, int xI_str_0,
const npy_intp *yIdx, int yI_str_0
) {
int i = threadIdx.x + blockDim.x * blockIdx.x;
int j = threadIdx.y + blockDim.y * blockIdx.y;
int b = blockIdx.z;
if (i >= maxi || j >= maxj) return;
int p = i + j * maxi + b * maxi * maxj;
x_list[p] = &x[b * x_str_0 + i * x_str_1];
y_list[p] = &y[b * y_str_0 + j * y_str_1];
out_list[p] = &out[xIdx[b * xI_str_0 + i] * o_str_0 +
yIdx[b * yI_str_0 + j] * o_str_1];
}
"""
def c_code(self, node, name, inputs, outputs, sub):
o, x, y, xIdx, yIdx, alpha = inputs
out = outputs[0]
if self.inplace:
res = """
Py_XDECREF(%(out)s);
%(out)s = %(o)s;
Py_INCREF(%(out)s);
""" % dict(out=out, o=o)
else:
res = """
%(out)s = theano_try_copy(%(out)s, %(o)s);
if (%(out)s == NULL) {
// Error already set
%(fail)s
}
""" % dict(out=out, o=o, fail=sub['fail'])
return res + """
{
size_t maxi = PyGpuArray_DIMS(%(x)s)[1];
size_t maxj = PyGpuArray_DIMS(%(y)s)[1];
size_t maxb = PyGpuArray_DIMS(%(x)s)[0];
ssize_t x_str_0 = PyGpuArray_STRIDES(%(x)s)[0];
ssize_t x_str_1 = PyGpuArray_STRIDES(%(x)s)[1];
ssize_t y_str_0 = PyGpuArray_STRIDES(%(y)s)[0];
ssize_t y_str_1 = PyGpuArray_STRIDES(%(y)s)[1];
ssize_t o_str_0 = PyGpuArray_STRIDES(%(out)s)[0];
ssize_t o_str_1 = PyGpuArray_STRIDES(%(out)s)[1];
o_list = calloc(sizof(gpudata *), maxi * maxj * maxb);
offOut = calloc(sizof(size_t), maxi * maxj * maxb);
x_list = calloc(sizof(gpudata *), maxi * maxj * maxb);
offX = calloc(sizof(size_t), maxi * maxj * maxb);
y_list = calloc(sizof(gpudata *), maxi * maxj * maxb);
offY = calloc(sizof(size_t), maxi * maxj * maxb);
if (W_list == NULL || offW == NULL ||
inp_list == NULL || offInp == NULL ||
out_list == NULL || offOut == NULL) {
free(o_list);
free(offOut);
free(x_list);
free(offX);
free(y_list);
free(offY);
PyErr_NoMemory();
%(fail)s
}
for (size_t i = 0; i < maxi; i++) {
for (size_t j = 0; j < maxj; j++) {
for (size_t b = 0; b < maxb; b++) {
size_t p = i + j * maxi + b * maxi * maxj;
x_list[p] = %(x)s->ga.data;
offX[p] = b * x_str_0 + i * x_str_1 + %(x)s->ga.offset;
y_list[p] = %(y)s->ga.data;
offY[p] = b * y_str_0 + j * y_str_1 + %(y)s->ga.offset;
out_list[p] = %(out)s->ga.data;
offOut[p] = *(%(xIdx)s_DTYPE *)PyArray_GETPTR2(%(xIdx)s, b, i) * o_str_0 + *(%(yIdx)s_DTYPE *)PyArray_GETPTR2(%(yIdx)s, b, j) * o_str_1 + %(out)s->ga.offset;
}
}
}
{
ga_ssize str_y = CudaNdarray_HOST_STRIDES(%(y)s)[2];
ga_ssize str_x = CudaNdarray_HOST_STRIDES(%(x)s)[2];
ga_ssize str_out = CudaNdarray_HOST_STRIDES(%(out)s)[2];
int err;
err = blas_ops->sgerBatch(cb_fortran,
PyGpuArray_DIMS(%(y)s)[2], PyGpuArray_DIMS(%(x)s)[2],
*(float *)PyArray_GETPTR1(%(alpha)s, 0),
y_list, offY, str_y, x_list, offX, str_x, out_list, offOut, str_out,
PyGpuArray_DIMS(%(x)s)[0] * PyGpuArray_DIMS(%(x)s)[1] * PyGpuArray_DIMS(%(y)s)[1], 0);
free(o_list);
free(offOut);
free(x_list);
free(offX);
free(y_list);
free(offY);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "sgerBatch failed");
%(fail)s
}
}""" % dict(x=x, y=y, out=out, xIdx=xIdx, yIdx=yIdx, name=name,
alpha=alpha, fail=sub['fail'])
def c_code_cache_version(self):
return (11,)
def c_header_dirs(self):
return [os.path.dirname(__file__)]
def c_headers(self):
return ['<gpuarray/buffer_blas.h>', '<gpuarray/buffer.h>',
'<gpuarray_helper.h>']
gpu_sparse_block_outer = GpuSparseBlockOuter(False)
gpu_sparse_block_outer_inplace = GpuSparseBlockOuter(True)
......@@ -8,7 +8,7 @@ import theano
from theano import tensor, scalar, gof
from theano.compile import optdb
from theano.compile.ops import shape_i
from theano.gof import (local_optimizer, EquilibriumDB,
from theano.gof import (local_optimizer, EquilibriumDB, TopoOptimizer,
SequenceDB, Optimizer, toolbox)
from theano.gof.optdb import LocalGroupDB
from theano.ifelse import IfElse
......@@ -17,6 +17,7 @@ from theano.scalar.basic import Scalar, Pow, Cast
from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.tensor.nnet.conv import ConvOp
from theano.tensor.nnet.blocksparse import SparseBlockGemv, SparseBlockOuter
from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs)
......@@ -33,6 +34,7 @@ from .basic_ops import (as_gpuarray_variable, infer_context_name,
GpuEye, gpu_join, GpuJoin)
from .blas import (gpu_dot22, GpuGemv, GpuGemm, GpuGer, GpuGemmBatch,
gpugemm_no_inplace, gpugemmbatch_no_inplace)
from .blocksparse import GpuSparseBlockGemv, GpuSparseBlockOuter
from .nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuCrossentropySoftmax1HotWithBiasDx,
GpuSoftmaxWithBias, GpuSoftmax)
......@@ -73,6 +75,17 @@ def register_opt(*tags, **kwargs):
return local_opt
return f
def register_inplace(*tags, **kwargs):
def f(local_opt):
name = (kwargs and kwargs.pop('name')) or local_opt.__name__
optdb.register(
name, TopoOptimizer(
local_opt, failure_callback=TopoOptimizer.warn_inplace),
60, 'fast_run', 'inplace', 'gpuarray', *tags)
return local_opt
return f
register_opt('fast_compile')(theano.tensor.opt.local_track_shape_i)
register_opt(final_opt=True, name='gpua_constant_folding')(
tensor.opt.constant_folding)
......@@ -619,9 +632,9 @@ def local_gpua_advanced_subtensor(node, context_name):
@register_opt('fast_compile')
@op_lifter([tensor.AdvancedIncSubtensor1])
def local_gpua_advanced_incsubtensor(node, context_name):
context = get_context(context_name)
# This is disabled on non-cuda contexts
if get_context(context_name).kind != 'cuda':
if context.kind != 'cuda':
return None
x, y, ilist = node.inputs
......@@ -635,10 +648,8 @@ def local_gpua_advanced_incsubtensor(node, context_name):
y = tensor.cast(y, dtype)
set_instead_of_inc = node.op.set_instead_of_inc
active_device_no = theano.sandbox.cuda.active_device_number()
device_properties = theano.sandbox.cuda.device_properties
compute_capability = device_properties(active_device_no)['major']
compute_capability = int(context.bin_id[-2])
if (compute_capability < 2 or x.ndim != 2 or y.ndim != 2):
return GpuAdvancedIncSubtensor1(
......@@ -865,6 +876,32 @@ theano.tensor.nnet.conv2d()
"""
@register_opt('fast_compile')
@op_lifter([SparseBlockGemv])
def local_lift_sparseblockgemv(node, context_name):
return GpuSparseBlockGemv(node.op.inplace)
@register_opt('fast_compile')
@op_lifter([SparseBlockOuter])
def local_lift_sparseblockouter(node, context_name):
return GpuSparseBlockOuter(node.op.inplace)
@register_inplace()
@local_optimizer([GpuSparseBlockGemv], inplace=True)
def local_inplace_sparseblockgemv(node):
if isinstance(node.op, GpuSparseBlockGemv) and not node.op.inplace:
return [GpuSparseBlockGemv(inplace=True)(*node.inputs)]
@register_inplace()
@local_optimizer([GpuSparseBlockOuter], inplace=True)
def local_inplace_sparseblockouter(node):
if isinstance(node.op, GpuSparseBlockOuter) and not node.op.inplace:
return [GpuSparseBlockOuter(inplace=True)(*node.inputs)]
# This deals with any abstract convs that have a transfer somewhere
@register_opt('fast_compile')
@op_lifter([AbstractConv2d,
......
......@@ -216,9 +216,7 @@ class BlockSparse_Gemv_and_Outer(utt.InferShapeTester):
utt.verify_grad(op, [b_val, h_val, W_val], mode=self.mode, eps=eps)
def test_sparseblockgemv_grad_1(self):
"""
Test that we correctly handle cases where dimensions are 1.
"""
# Test that we correctly handle cases where dimensions are 1.
h_val = randn(1, 1, 1).astype('float32')
iIdx_val = numpy.random.permutation(1)[:1][None, :]
oIdx_val = numpy.random.permutation(1)[:1][None, :]
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论