Fix most of the problems in blocksparse.

eb4d52aa · Arnaud Bergeron · 7479d045 · eb4d52aa · eb4d52aa · eb4d52aa
--- a/theano/sandbox/gpuarray/blockgemv.c
+++ b/theano/sandbox/gpuarray/blockgemv.c
+#section support_code_apply
+/*
+ * Batched block-sparse matrix-vector product.
+ *
+ * For every batch row b, input block i and output block j this queues one
+ * GEMV that reads the block of W selected by inputIdx[b, i] / outputIdx[b, j]
+ * together with the vector h[b, i], and accumulates into block [b, j] of the
+ * result.  The result starts out as `o` itself (INPLACE) or as a copy of `o`
+ * (otherwise).  Only GA_FLOAT and GA_DOUBLE results are supported.
+ *
+ * Returns 0 on success and -1 with a Python exception set on failure.
+ * *_out is updated as soon as the result reference is acquired, so on every
+ * return path it holds either the current result reference or NULL; it never
+ * keeps pointing at an object that was already released.
+ */
+int APPLY_SPECIFIC(blockgemv)(PyGpuArrayObject *o, PyGpuArrayObject *W,
+                              PyGpuArrayObject *h, PyArrayObject *inputIdx,
+                              PyArrayObject *outputIdx,
+                              PyGpuArrayObject **_out,
+                              PyGpuContextObject *ctx) {
+  PyGpuArrayObject *out = *_out;
+#ifdef INPLACE
+  /* Take the new reference before dropping the old one in case *_out == o. */
+  Py_INCREF(o);
+  Py_XDECREF(out);
+  out = o;
+  *_out = out;
+#else
+  out = theano_try_copy(out, o);
+  /* NOTE(review): assumes theano_try_copy consumes the reference it is given
+   * (also when it fails), so the old pointer must not stay in *_out --
+   * confirm against its definition. */
+  *_out = out;
+  if (out == NULL) {
+    // Error already set
+    return -1;
+  }
+#endif
+  gpudata **W_list = NULL;
+  gpudata **inp_list = NULL;
+  gpudata **out_list = NULL;
+  size_t *offW = NULL;
+  size_t *offInp = NULL;
+  size_t *offOut = NULL;
+  gpuarray_blas_ops *blas_ops;
+  int err;
+  err = ctx->ops->property(ctx->ctx, NULL, NULL,
+                           GA_CTX_PROP_BLAS_OPS, &blas_ops);
+  if (err != GA_NO_ERROR) {
+    PyErr_SetString(PyExc_RuntimeError, "Can't get blas ops");
+    return -1;
+  }
+  err = blas_ops->setup(ctx->ctx);
+  if (err != GA_NO_ERROR) {
+    PyErr_SetString(PyExc_RuntimeError, "Can't setup blas");
+    return -1;
+  }
+  /* Prepare lists for the batch */
+  size_t maxi = PyGpuArray_DIMS(h)[1];
+  size_t maxj = PyGpuArray_DIMS(out)[1];
+  size_t maxb = PyGpuArray_DIMS(out)[0];
+  size_t batch = maxi * maxj * maxb;
+  if (batch == 0) {
+    /* Nothing to multiply.  Returning here also avoids calloc() with a zero
+     * count, which may legally return NULL and would be misreported as an
+     * out-of-memory condition. */
+    return 0;
+  }
+  ssize_t h_str_0 = PyGpuArray_STRIDES(h)[0];
+  ssize_t h_str_1 = PyGpuArray_STRIDES(h)[1];
+  /* Strides and data of the result come from `out`, not `o`: when not
+   * INPLACE, `out` is a separate copy and is the buffer that must be
+   * written. */
+  ssize_t o_str_0 = PyGpuArray_STRIDES(out)[0];
+  ssize_t o_str_1 = PyGpuArray_STRIDES(out)[1];
+  ssize_t W_str_0 = PyGpuArray_STRIDES(W)[0];
+  ssize_t W_str_1 = PyGpuArray_STRIDES(W)[1];
+  W_list = (gpudata **)calloc(batch, sizeof(gpudata *));
+  offW = (size_t *)calloc(batch, sizeof(size_t));
+  inp_list = (gpudata **)calloc(batch, sizeof(gpudata *));
+  offInp = (size_t *)calloc(batch, sizeof(size_t));
+  out_list = (gpudata **)calloc(batch, sizeof(gpudata *));
+  offOut = (size_t *)calloc(batch, sizeof(size_t));
+  if (W_list == NULL || offW == NULL ||
+      inp_list == NULL || offInp == NULL ||
+      out_list == NULL || offOut == NULL) {
+    free(W_list);
+    free(offW);
+    free(inp_list);
+    free(offInp);
+    free(out_list);
+    free(offOut);
+    PyErr_NoMemory();
+    return -1;
+  }
+  for (size_t i = 0; i < maxi; i++) {
+    for (size_t j = 0; j < maxj; j++) {
+      for (size_t b = 0; b < maxb; b++) {
+        /* Flat position of the (b, j, i) problem inside the batch lists. */
+        size_t p = i + j * maxi + b * maxi * maxj;
+        inp_list[p] = h->ga.data;
+        offInp[p] = b * h_str_0 + i * h_str_1 + h->ga.offset;
+        out_list[p] = out->ga.data;
+        offOut[p] = b * o_str_0 + j * o_str_1 + out->ga.offset;
+        W_list[p] = W->ga.data;
+        /* The index arrays live on the host and pick the W block to use. */
+        offW[p] = *(DTYPE_INPUT_3 *)PyArray_GETPTR2(inputIdx, b, i) * W_str_0 +
+          *(DTYPE_INPUT_4 *)PyArray_GETPTR2(outputIdx, b, j) * W_str_1 +
+          W->ga.offset;
+      }
+    }
+  }
+  /* When axis 2 of W has a stride of one element, hand the block over as
+   * transposed and use the stride of axis 3 as the leading dimension. */
+  cb_transpose transA = cb_no_trans;
+  size_t lda = PyGpuArray_STRIDES(W)[2] / gpuarray_get_elsize(W->ga.typecode);
+  if (lda == 1) {
+    transA = cb_trans;
+    lda = PyGpuArray_STRIDES(W)[3] / gpuarray_get_elsize(W->ga.typecode);
+  }
+  /* Element strides of the input and result vectors. */
+  size_t inc_h = PyGpuArray_STRIDES(h)[2] / gpuarray_get_elsize(h->ga.typecode);
+  size_t inc_o = PyGpuArray_STRIDES(out)[2] / gpuarray_get_elsize(out->ga.typecode);
+  if (out->ga.typecode == GA_FLOAT) {
+    err = blas_ops->sgemvBatch(cb_fortran, transA,
+                               PyGpuArray_DIMS(out)[2],
+                               PyGpuArray_DIMS(h)[2], 1,
+                               W_list, offW, lda,
+                               inp_list, offInp, inc_h,
+                               1, out_list, offOut, inc_o,
+                               batch, 0);
+  } else if (out->ga.typecode == GA_DOUBLE) {
+    err = blas_ops->dgemvBatch(cb_fortran, transA,
+                               PyGpuArray_DIMS(out)[2],
+                               PyGpuArray_DIMS(h)[2], 1,
+                               W_list, offW, lda,
+                               inp_list, offInp, inc_h,
+                               1, out_list, offOut, inc_o,
+                               batch, 0);
+  } else {
+    err = GA_DEVSUP_ERROR;
+  }
+  free(W_list);
+  free(offW);
+  free(inp_list);
+  free(offInp);
+  free(out_list);
+  free(offOut);
+  if (err != GA_NO_ERROR) {
+    PyErr_SetString(PyExc_RuntimeError, "gemvBatch failed");
+    return -1;
+  }
+  return 0;
+}
--- a/theano/sandbox/gpuarray/blockger.c
+++ b/theano/sandbox/gpuarray/blockger.c
+#section support_code_apply
+/*
+ * Batched block-sparse outer product (rank-1 update).
+ *
+ * For every batch row b, x block i and y block j this queues one GER that
+ * combines y[b, j] and x[b, i], scaled by the first element of `alpha`, and
+ * accumulates into the block of the result addressed by xIdx[b, i] /
+ * yIdx[b, j].  The result starts out as `o` itself (INPLACE) or as a copy of
+ * `o` (otherwise).  Only GA_FLOAT and GA_DOUBLE results are supported.
+ *
+ * Returns 0 on success and -1 with a Python exception set on failure.
+ * *_out is updated as soon as the result reference is acquired, so on every
+ * return path after that point it holds either the current result reference
+ * or NULL; it never keeps pointing at an object that was already released.
+ */
+int APPLY_SPECIFIC(blockger)(PyGpuArrayObject *o, PyGpuArrayObject *x,
+                             PyGpuArrayObject *y, PyArrayObject *xIdx,
+                             PyArrayObject *yIdx, PyArrayObject *alpha,
+                             PyGpuArrayObject **_out,
+                             PyGpuContextObject *ctx) {
+  PyGpuArrayObject *out = *_out;
+  gpudata **o_list = NULL;
+  gpudata **x_list = NULL;
+  gpudata **y_list = NULL;
+  size_t *offOut = NULL;
+  size_t *offX = NULL;
+  size_t *offY = NULL;
+  gpuarray_blas_ops *blas_ops;
+  int err;
+  err = ctx->ops->property(ctx->ctx, NULL, NULL,
+                           GA_CTX_PROP_BLAS_OPS, &blas_ops);
+  if (err != GA_NO_ERROR) {
+    PyErr_SetString(PyExc_RuntimeError, "Can't get blas ops");
+    return -1;
+  }
+  err = blas_ops->setup(ctx->ctx);
+  if (err != GA_NO_ERROR) {
+    PyErr_SetString(PyExc_RuntimeError, "Can't setup blas");
+    return -1;
+  }
+#ifdef INPLACE
+  /* Take the new reference before dropping the old one in case *_out == o. */
+  Py_INCREF(o);
+  Py_XDECREF(out);
+  out = o;
+  *_out = out;
+#else
+  out = theano_try_copy(out, o);
+  /* NOTE(review): assumes theano_try_copy consumes the reference it is given
+   * (also when it fails), so the old pointer must not stay in *_out --
+   * confirm against its definition. */
+  *_out = out;
+  if (out == NULL)
+    return -1;
+#endif
+  size_t maxi = PyGpuArray_DIMS(x)[1];
+  size_t maxj = PyGpuArray_DIMS(y)[1];
+  size_t maxb = PyGpuArray_DIMS(x)[0];
+  size_t batch = maxi * maxj * maxb;
+  if (batch == 0) {
+    /* No update to apply.  Returning here also avoids calloc() with a zero
+     * count, which may legally return NULL and would be misreported as an
+     * out-of-memory condition. */
+    return 0;
+  }
+  ssize_t x_str_0 = PyGpuArray_STRIDES(x)[0];
+  ssize_t x_str_1 = PyGpuArray_STRIDES(x)[1];
+  ssize_t y_str_0 = PyGpuArray_STRIDES(y)[0];
+  ssize_t y_str_1 = PyGpuArray_STRIDES(y)[1];
+  ssize_t o_str_0 = PyGpuArray_STRIDES(out)[0];
+  ssize_t o_str_1 = PyGpuArray_STRIDES(out)[1];
+  o_list = (gpudata **)calloc(batch, sizeof(gpudata *));
+  offOut = (size_t *)calloc(batch, sizeof(size_t));
+  x_list = (gpudata **)calloc(batch, sizeof(gpudata *));
+  offX = (size_t *)calloc(batch, sizeof(size_t));
+  y_list = (gpudata **)calloc(batch, sizeof(gpudata *));
+  offY = (size_t *)calloc(batch, sizeof(size_t));
+  if (o_list == NULL || offOut == NULL ||
+      x_list == NULL || offX == NULL ||
+      y_list == NULL || offY == NULL) {
+    free(o_list);
+    free(offOut);
+    free(x_list);
+    free(offX);
+    free(y_list);
+    free(offY);
+    PyErr_NoMemory();
+    return -1;
+  }
+  for (size_t i = 0; i < maxi; i++) {
+    for (size_t j = 0; j < maxj; j++) {
+      for (size_t b = 0; b < maxb; b++) {
+        /* Flat position of the (b, j, i) problem inside the batch lists. */
+        size_t p = i + j * maxi + b * maxi * maxj;
+        x_list[p] = x->ga.data;
+        offX[p] = b * x_str_0 + i * x_str_1 + x->ga.offset;
+        y_list[p] = y->ga.data;
+        offY[p] = b * y_str_0 + j * y_str_1 + y->ga.offset;
+        o_list[p] = out->ga.data;
+        /* The index arrays live on the host and pick the result block. */
+        offOut[p] = *(DTYPE_INPUT_3 *)PyArray_GETPTR2(xIdx, b, i) * o_str_0 +
+          *(DTYPE_INPUT_4 *)PyArray_GETPTR2(yIdx, b, j) * o_str_1 +
+          out->ga.offset;
+      }
+    }
+  }
+  /* Strides of the innermost used axis, converted from bytes to elements. */
+  ssize_t str_y = PyGpuArray_STRIDES(y)[2] / gpuarray_get_elsize(y->ga.typecode);
+  ssize_t str_x = PyGpuArray_STRIDES(x)[2] / gpuarray_get_elsize(x->ga.typecode);
+  ssize_t str_out = PyGpuArray_STRIDES(out)[2] / gpuarray_get_elsize(out->ga.typecode);
+  if (out->ga.typecode == GA_FLOAT) {
+    err = blas_ops->sgerBatch(cb_fortran,
+                              PyGpuArray_DIMS(y)[2], PyGpuArray_DIMS(x)[2],
+                              *(float *)PyArray_GETPTR1(alpha, 0),
+                              y_list, offY, str_y, x_list, offX, str_x,
+                              o_list, offOut, str_out,
+                              batch, 0);
+  } else if (out->ga.typecode == GA_DOUBLE) {
+    err = blas_ops->dgerBatch(cb_fortran,
+                              PyGpuArray_DIMS(y)[2], PyGpuArray_DIMS(x)[2],
+                              *(double *)PyArray_GETPTR1(alpha, 0),
+                              y_list, offY, str_y, x_list, offX, str_x,
+                              o_list, offOut, str_out,
+                              batch, 0);
+  } else {
+    err = GA_DEVSUP_ERROR;
+  }
+  free(o_list);
+  free(offOut);
+  free(x_list);
+  free(offX);
+  free(y_list);
+  free(offY);
+  if (err != GA_NO_ERROR) {
+    /* Covers the float path, the double path and the unsupported-dtype
+     * case, so the message is not tied to sgerBatch. */
+    PyErr_SetString(PyExc_RuntimeError, "gerBatch failed");
+    return -1;
+  }
+  return 0;
+}
--- a/theano/sandbox/gpuarray/blocksparse.py
+++ b/theano/sandbox/gpuarray/blocksparse.py
--- a/theano/sandbox/gpuarray/opt.py
+++ b/theano/sandbox/gpuarray/opt.py
@@ -8,7 +8,7 @@ import theano
 from theano import tensor, scalar, gof
 from theano.compile import optdb
 from theano.compile.ops import shape_i
-from theano.gof import (local_optimizer, EquilibriumDB,
+from theano.gof import (local_optimizer, EquilibriumDB, TopoOptimizer,
                        SequenceDB, Optimizer, toolbox)
 from theano.gof.optdb import LocalGroupDB
 from theano.ifelse import IfElse
@@ -17,6 +17,7 @@ from theano.scalar.basic import Scalar, Pow, Cast
 from theano.scan_module import scan_utils, scan_op, scan_opt
 from theano.tensor.nnet.conv import ConvOp
+from theano.tensor.nnet.blocksparse import SparseBlockGemv, SparseBlockOuter
 from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
                                              AbstractConv2d_gradWeights,
                                              AbstractConv2d_gradInputs)
@@ -33,6 +34,7 @@ from .basic_ops import (as_gpuarray_variable, infer_context_name,
                        GpuEye, gpu_join, GpuJoin)
 from .blas import (gpu_dot22, GpuGemv, GpuGemm, GpuGer, GpuGemmBatch,
                   gpugemm_no_inplace, gpugemmbatch_no_inplace)
+from .blocksparse import GpuSparseBlockGemv, GpuSparseBlockOuter
 from .nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias,
                   GpuCrossentropySoftmax1HotWithBiasDx,
                   GpuSoftmaxWithBias, GpuSoftmax)
@@ -73,6 +75,17 @@ def register_opt(*tags, **kwargs):
        return local_opt
    return f
+def register_inplace(*tags, **kwargs):
+    def f(local_opt):
+        name = (kwargs and kwargs.pop('name')) or local_opt.__name__
+        optdb.register(
+            name, TopoOptimizer(
+                local_opt, failure_callback=TopoOptimizer.warn_inplace),
+            60, 'fast_run', 'inplace', 'gpuarray', *tags)
+        return local_opt
+    return f
 register_opt('fast_compile')(theano.tensor.opt.local_track_shape_i)
 register_opt(final_opt=True, name='gpua_constant_folding')(
    tensor.opt.constant_folding)
@@ -619,9 +632,9 @@ def local_gpua_advanced_subtensor(node, context_name):
 @register_opt('fast_compile')
 @op_lifter([tensor.AdvancedIncSubtensor1])
 def local_gpua_advanced_incsubtensor(node, context_name):
+    context = get_context(context_name)
    # This is disabled on non-cuda contexts
-    if get_context(context_name).kind != 'cuda':
+    if context.kind != 'cuda':
        return None
    x, y, ilist = node.inputs
@@ -635,10 +648,8 @@ def local_gpua_advanced_incsubtensor(node, context_name):
            y = tensor.cast(y, dtype)
    set_instead_of_inc = node.op.set_instead_of_inc
-    active_device_no = theano.sandbox.cuda.active_device_number()
-    device_properties = theano.sandbox.cuda.device_properties
-    compute_capability = device_properties(active_device_no)['major']
+    compute_capability = int(context.bin_id[-2])
    if (compute_capability < 2 or x.ndim != 2 or y.ndim != 2):
        return GpuAdvancedIncSubtensor1(
@@ -865,6 +876,32 @@ theano.tensor.nnet.conv2d()
 """
+@register_opt('fast_compile')
+@op_lifter([SparseBlockGemv])
+def local_lift_sparseblockgemv(node, context_name):
+    return GpuSparseBlockGemv(node.op.inplace)
+@register_opt('fast_compile')
+@op_lifter([SparseBlockOuter])
+def local_lift_sparseblockouter(node, context_name):
+    return GpuSparseBlockOuter(node.op.inplace)
+@register_inplace()
+@local_optimizer([GpuSparseBlockGemv], inplace=True)
+def local_inplace_sparseblockgemv(node):
+    if isinstance(node.op, GpuSparseBlockGemv) and not node.op.inplace:
+        return [GpuSparseBlockGemv(inplace=True)(*node.inputs)]
+@register_inplace()
+@local_optimizer([GpuSparseBlockOuter], inplace=True)
+def local_inplace_sparseblockouter(node):
+    if isinstance(node.op, GpuSparseBlockOuter) and not node.op.inplace:
+        return [GpuSparseBlockOuter(inplace=True)(*node.inputs)]
 # This deals with any abstract convs that have a transfer somewhere
 @register_opt('fast_compile')
 @op_lifter([AbstractConv2d,

--- a/theano/tensor/nnet/tests/test_blocksparse.py
+++ b/theano/tensor/nnet/tests/test_blocksparse.py
@@ -216,9 +216,7 @@ class BlockSparse_Gemv_and_Outer(utt.InferShapeTester):
        utt.verify_grad(op, [b_val, h_val, W_val], mode=self.mode, eps=eps)
    def test_sparseblockgemv_grad_1(self):
-        """
+        # Test that we correctly handle cases where dimensions are 1.
-            Test that we correctly handle cases where dimensions are 1.
-        """
        h_val = randn(1, 1, 1).astype('float32')
        iIdx_val = numpy.random.permutation(1)[:1][None, :]
        oIdx_val = numpy.random.permutation(1)[:1][None, :]