First pass at C code for Gemm16 (does not work).

0b7f31fe · Arnaud Bergeron · 70a7eca3 · 0b7f31fe · 0b7f31fe
--- a/theano/sandbox/gpuarray/gemm16.c
+++ b/theano/sandbox/gpuarray/gemm16.c
+#section init_code_struct
+/* Why do we need this?
+ *
+ * Allocates rand_buf, a 1-d GA_UINT array of 2048 * 32 elements in the
+ * default context.  gemm16() hands it to every kernel launch as the
+ * first kernel argument (params[0]).
+ * NOTE(review): judging by the name this is presumably scratch space for
+ * a random-number state inside the kernels; the kernel source is not
+ * visible here, so confirm.
+ * Initialisation is aborted through FAIL when the allocation fails.
+ */
+size_t dim = 2048 * 32;
+rand_buf = pygpu_empty(1, &dim, GA_UINT, GA_C_ORDER, pygpu_default_context(),
+                       Py_None);
+if (rand_buf == NULL) {
+  FAIL;
+}
+#section support_code_struct
+PyGpuArrayObject *rand_buf;
+/*
+ * Select and launch one of the precompiled hgemm kernels
+ * (k_<opA><opB>[_vec]_128x<size>) on A and B, writing into *out.
+ *
+ * C            accumulator input; m and n are read from its shape.
+ * alpha, beta  scalars forwarded unchanged to the kernel (presumably
+ *              out = alpha * dot(A, B) + beta * C; the kernel source is
+ *              not visible here, so confirm).
+ * A, B         the operands; k is read from B's first dimension.  An
+ *              operand is passed as transposed ('t') when it has the
+ *              GA_FARRAY flags and as 'n' otherwise.
+ * out          result slot.  When GEMM16_INPLACE is set and C has the
+ *              GA_CARRAY flags, *out becomes a new reference to C,
+ *              otherwise it is obtained from theano_try_copy(*out, C).
+ *
+ * Returns 0 on success and 1 on failure.  The failure paths handled in
+ * this function set a Python exception; pygpu_copy() and
+ * theano_try_copy() are assumed to set their own.
+ */
+int gemm16(PyGpuArrayObject *C, float alpha,
+           PyGpuArrayObject *A, PyGpuArrayObject *B,
+           float beta, PyGpuArrayObject **out) {
+  PyGpuArrayObject *AA = NULL;
+  PyGpuArrayObject *BB = NULL;
+  /* NULL so that a missing kernel is detected instead of launched. */
+  GpuKernel *gk = NULL;
+  void *params[13];
+  size_t grid[2];
+  size_t threads[2];
+  ssize_t elsize;
+  int res = 0;
+  int flags = 0;
+  int lda, ldb, ldc, n, m, k;
+  int n128, n64;
+  int size = 0;
+  int vec = 0;
+  static unsigned int nprocs = 0;
+  char opA, opB;
+  opA = GpuArray_CHKFLAGS(&A->ga, GA_FARRAY) ? 't' : 'n';
+  opB = GpuArray_CHKFLAGS(&B->ga, GA_FARRAY) ? 't' : 'n';
+  if (opA == 't' && opB == 't') {
+    /*
+     * The nervana kernels do not cover the case where both inputs are
+     * trans so we need to copy one of them.  We choose the smallest
+     * one.  The op of the copied operand is forced to 'n' because a
+     * C-order copy can still carry the F flags (e.g. a single row).
+     */
+    if (PyGpuArray_DIM(A, 0) * PyGpuArray_DIM(A, 1) <
+        PyGpuArray_DIM(B, 0) * PyGpuArray_DIM(B, 1)) {
+      AA = pygpu_copy(A, GA_C_ORDER);
+      if (AA == NULL) {
+        res = 1;
+        goto cleanup;
+      }
+      opA = 'n';
+      BB = B;
+      Py_INCREF(BB);
+    } else {
+      BB = pygpu_copy(B, GA_C_ORDER);
+      if (BB == NULL) {
+        res = 1;
+        goto cleanup;
+      }
+      opB = 'n';
+      AA = A;
+      Py_INCREF(AA);
+    }
+  } else {
+    /* No copy needed: AA/BB are just extra references to the inputs so
+     * that the rest of the function can use them unconditionally. */
+    AA = A;
+    Py_INCREF(AA);
+    BB = B;
+    Py_INCREF(BB);
+  }
+  if (GEMM16_INPLACE && GpuArray_CHKFLAGS(&C->ga, GA_CARRAY)) {
+    Py_XDECREF(*out);
+    *out = C;
+    Py_INCREF(*out);
+  } else {
+    *out = theano_try_copy(*out, C);
+    if (*out == NULL) {
+      res = 1;
+      goto cleanup;
+    }
+  }
+  /*
+   * Leading dimensions, in elements rather than bytes.
+   * NOTE(review): assumes A, B and *out share one typecode and that the
+   * kernels expect the stride of the non-contiguous axis -- confirm
+   * against the kernel interface.
+   */
+  elsize = (ssize_t)gpuarray_get_elsize(AA->ga.typecode);
+  if (elsize <= 0) {
+    PyErr_SetString(PyExc_RuntimeError, "gemm16: unknown element size");
+    res = 1;
+    goto cleanup;
+  }
+  lda = (int)(AA->ga.strides[opA == 'n' ? 0 : 1] / elsize);
+  ldb = (int)(BB->ga.strides[opB == 'n' ? 0 : 1] / elsize);
+  ldc = (int)((*out)->ga.strides[0] / elsize);
+  m = PyGpuArray_DIM(C, 0);
+  n = PyGpuArray_DIM(C, 1);
+  k = PyGpuArray_DIM(B, 0);
+  /* Tuning code adapted from the python version */
+  grid[0] = (m + 127) / 128;
+  if (opA == 'n' && opB == 't')
+    size = 128;
+  else {
+    if (n < 384-16) {
+      n128 = n % 128;
+      if (n128 < 112) {
+        if (48 < n128 && n128 <= 64) {
+          n64 = n / 64;
+          if (nprocs == 0) {
+            /* Queried once and cached in the static for later calls. */
+            if (C->ga.ops->property(C->context->ctx, NULL, NULL,
+                                    GA_CTX_PROP_NUMPROCS, &nprocs)) {
+              nprocs = 0;
+              PyErr_SetString(PyExc_RuntimeError,
+                              "gemm16: could not query the number of "
+                              "multiprocessors");
+              res = 1;
+              goto cleanup;
+            }
+            if (nprocs == 0) {
+              /* Would divide by zero below. */
+              PyErr_SetString(PyExc_RuntimeError,
+                              "gemm16: context reports 0 multiprocessors");
+              res = 1;
+              goto cleanup;
+            }
+          }
+          n64 *= (grid[0] / nprocs);
+          if (n64 > 1 || (opA == 't' && opB == 'n'))
+            size = 64;
+          else
+            size = 32;
+        } else {
+          size = 32;
+        }
+      } else {
+        size = 128;
+      }
+    } else {
+      size = 128;
+    }
+  }
+  grid[1] = (n + (size-1)) / size;
+  if (size == 128)
+    threads[0] = 256;
+  else
+    threads[0] = 128;
+  threads[1] = 1;
+  /* The vectorised variants need suitably aligned dimensions. */
+  if ((opA == 't' && opB == 'n' && m % 8 == 0 && n % 8 == 0) ||
+      (opA == 'n' && opB == 'n' && k % 16 == 0 && n % 8 == 0) ||
+      (opA == 'n' && opB == 't' && k % 16 == 0))
+    vec = 1;
+  switch (size) {
+  case 128:
+    if (opA == 'n' && opB == 'n') {
+      if (vec)
+        gk = &k_nn_vec_128x128;
+      else
+        gk = &k_nn_128x128;
+    } else if (opA == 'n' && opB == 't') {
+      if (vec)
+        gk = &k_nt_vec_128x128;
+      else
+        gk = &k_nt_128x128;
+    } else if (opA == 't' && opB == 'n') {
+      if (vec)
+        gk = &k_tn_vec_128x128;
+      else
+        gk = &k_tn_128x128;
+    }
+    break;
+  case 64:
+    if (opA == 'n' && opB == 'n') {
+      if (vec)
+        gk = &k_nn_vec_128x64;
+      else
+        gk = &k_nn_128x64;
+    } else if (opA == 't' && opB == 'n') {
+      if (vec)
+        gk = &k_tn_vec_128x64;
+      else
+        gk = &k_tn_128x64;
+    }
+    break;
+  case 32:
+    if (opA == 'n' && opB == 'n') {
+      if (vec)
+        gk = &k_nn_vec_128x32;
+      else
+        gk = &k_nn_128x32;
+    } else if (opA == 't' && opB == 'n') {
+      if (vec)
+        gk = &k_tn_vec_128x32;
+      else
+        gk = &k_tn_128x32;
+    }
+    break;
+  default:
+    PyErr_SetString(PyExc_RuntimeError, "error selecting kernel");
+    res = 1;
+    goto cleanup;
+  }
+  if (gk == NULL) {
+    /* No kernel exists for this op/size combination. */
+    PyErr_SetString(PyExc_RuntimeError, "error selecting kernel");
+    res = 1;
+    goto cleanup;
+  }
+  /* Order must match the types[] table used to initialise the kernels. */
+  params[0] = &rand_buf->ga;
+  params[1] = &AA->ga;
+  params[2] = &BB->ga;
+  /* The result goes to *out; C must stay untouched when not inplace. */
+  params[3] = &(*out)->ga;
+  params[4] = &lda;
+  params[5] = &ldb;
+  params[6] = &ldc;
+  params[7] = &m;
+  params[8] = &n;
+  params[9] = &k;
+  params[10] = &alpha;
+  params[11] = &beta;
+  params[12] = &flags;
+#ifdef GEMM16_DEBUG
+  /* Trace output, only compiled in when GEMM16_DEBUG is defined. */
+  printf("%c%c_%s128x%d\n", opA, opB, vec ? "vec_" : "", size);
+  printf("%zu %zu %zu %zu\n", rand_buf->ga.offset, AA->ga.offset, BB->ga.offset, (*out)->ga.offset);
+  printf("%p %p %p %p\n", *((void **)rand_buf->ga.data), *((void **)AA->ga.data), *((void **)BB->ga.data), *((void **)(*out)->ga.data));
+#endif
+  if (GpuKernel_call2(gk, NULL, threads, grid, params) != GA_NO_ERROR) {
+    PyErr_SetString(PyExc_RuntimeError, "error in gemm16 kernel call");
+    res = 1;
+  }
+cleanup:
+  Py_XDECREF(AA);
+  Py_XDECREF(BB);
+  return res;
+}
--- a/theano/sandbox/gpuarray/nerv.py
+++ b/theano/sandbox/gpuarray/nerv.py
+import os.path
 import numpy
 import theano
 from theano import Op, Apply, Variable, tensor
 from theano.compile import optdb
 from theano.compile.ops import shape_i
-from theano.gof import local_optimizer
+from theano.gof import local_optimizer, COp
 from theano.scalar import as_scalar, constant
 from . import opt
@@ -40,17 +41,24 @@ def ensure_float(val, name):
    return val
-class Gemm16(Op):
+class Gemm16(COp):
    __props__ = ('relu', 'inplace')
    _f16_ok = True
+    KERN_NAMES = ('nn_128x128', 'nn_128x64', 'nn_128x32',
+                  'nn_vec_128x128', 'nn_vec_128x64', 'nn_vec_128x32',
+                  'tn_128x128', 'tn_128x64', 'tn_128x32',
+                  'tn_vec_128x128', 'tn_vec_128x64', 'tn_vec_128x32',
+                  'tn_vec_128x16', 'nt_128x128', 'nt_vec_128x128')
    def __init__(self, relu=False, inplace=False):
+        COp.__init__(self, ["gemm16.c"], "gemm16")
        self.relu = relu
        # relu = True will require more work in optimizations.
        assert self.relu == False
        self.inplace = inplace
        if self.inplace:
            self.destroy_map = {0: [0]}
+        self._use_c_code = False
    def make_node(self, C, alpha, A, B, beta):
        if GPUTensor is None:
@@ -89,6 +97,70 @@ class Gemm16(Op):
        nerv.dot(At, Bt, Ct, alpha=alpha, beta=beta, relu=False)
        outputs[0][0] = C
+    def c_headers(self):
+        """Return the header files the generated C code includes."""
+        headers = ['gpuarray/types.h', 'numpy_compat.h']
+        headers += ['gpuarray_helper.h', 'string.h']
+        return headers
+    def c_header_dirs(self):
+        """Return the include directories: the directory of this module."""
+        module_dir = os.path.dirname(__file__)
+        return [module_dir]
+    def get_op_params(self):
+        """Return the (name, value) macro pairs for the C code.
+
+        GEMM16_INPLACE is '1' when the op is inplace and '0' otherwise.
+        """
+        if self.inplace:
+            inplace_flag = '1'
+        else:
+            inplace_flag = '0'
+        return [('GEMM16_INPLACE', inplace_flag)]
+    @staticmethod
+    def cubin_to_code(name):
+        """Return a C array definition embedding a kernel binary.
+
+        Reads ``hgemm_<name>.cubin`` from ``nerv.cubin_path`` and returns
+        a ``static const char bin_<name>[]`` definition listing every
+        byte of the file as a hex literal.
+        """
+        fname = 'hgemm_{0}.cubin'.format(name)
+        # The cubin is binary data: text mode would translate newlines on
+        # some platforms and attempt to decode the bytes on Python 3.
+        with open(os.path.join(nerv.cubin_path, fname), 'rb') as f:
+            cubin = f.read()
+        # bytearray yields ints on both Python 2 and Python 3.
+        bcode = ','.join(hex(b) for b in bytearray(cubin))
+        return "static const char bin_%s[] = { %s };" % (name, bcode)
+    @staticmethod
+    def init_gpukernel(name, fail):
+        # Return the C snippet that initialises the GpuKernel k_<name>
+        # from the embedded binary bin_<name>.
+        #
+        # name: kernel suffix, substituted into bin_<name>, k_<name> and
+        #       the "hgemm_<name>" kernel name.
+        # fail: C code emitted after the RuntimeError is set when
+        #       GpuKernel_init does not return GA_NO_ERROR.
+        #
+        # The snippet uses the C variables bcode, sz, c and types, which
+        # it does not declare itself.
+        return """
bcode = bin_%(name)s;
sz = sizeof(bin_%(name)s);
if (GpuKernel_init(&k_%(name)s, c->ops, c->ctx, 1, &bcode, &sz,
                   "hgemm_%(name)s", 13, types, GA_USE_BINARY, NULL)
    != GA_NO_ERROR) {
  PyErr_SetString(PyExc_RuntimeError, "Could not initialize kernel %(name)s");
  %(fail)s;
}
""" % dict(name=name, fail=fail)
+    def c_support_code(self):
+        """Return the embedded binary arrays for every kernel in KERN_NAMES."""
+        return '\n'.join([Gemm16.cubin_to_code(kname)
+                          for kname in self.KERN_NAMES])
+    def c_support_code_struct(self, node, nodename):
+        """Return per-struct support code.
+
+        One ``GpuKernel k_<name>;`` declaration per entry of KERN_NAMES,
+        followed by the support code produced by the parent class.
+        """
+        pieces = ["GpuKernel k_{0};".format(kname)
+                  for kname in self.KERN_NAMES]
+        parent_code = super(Gemm16, self).c_support_code_struct(node,
+                                                                nodename)
+        pieces.append(parent_code)
+        return '\n'.join(pieces)
+    def c_init_code_struct(self, node, nodename, sub):
+        """Return the per-struct initialisation code.
+
+        In order: the parent class's init code, a memset clearing every
+        ``k_<name>`` kernel, the declarations shared by the kernel
+        initialisers (bcode, sz, c, types), and one init_gpukernel()
+        snippet per entry of KERN_NAMES.
+        """
+        pieces = [super(Gemm16, self).c_init_code_struct(node, nodename, sub)]
+        pieces.extend(
+            ["memset(&k_{0}, 0, sizeof(GpuKernel));".format(kname)
+             for kname in self.KERN_NAMES])
+        pieces.extend([
+            "const char *bcode;",
+            "size_t sz;",
+            "PyGpuContextObject *c = pygpu_default_context();",
+            "int types[13] = {GA_BUFFER, GA_BUFFER, GA_BUFFER, "
+            "GA_BUFFER, GA_INT, GA_INT, GA_INT, GA_INT, GA_INT, "
+            "GA_INT, GA_FLOAT, GA_FLOAT, GA_INT};",
+        ])
+        pieces.extend([self.init_gpukernel(kname, sub['fail'])
+                       for kname in self.KERN_NAMES])
+        return '\n'.join(pieces)
+    def c_cleanup_code_struct(self, node, nodename):
+        """Return one ``GpuKernel_clear`` call per entry of KERN_NAMES."""
+        return '\n'.join(["GpuKernel_clear(&k_{0});".format(kname)
+                          for kname in self.KERN_NAMES])
 @opt.register_opt()
 @local_optimizer([tensor.Dot])
@@ -104,7 +176,6 @@ def local_dot_to_gemm16(node):
            shape_i(A, 0, fgraph), shape_i(B, 1, fgraph))
        return [host_from_gpu(Gemm16()(C, 1.0, A, B, 0.0))]
 @opt.register_opt()
 @alpha_merge(Gemm16, alpha_in=1, beta_in=4, nd=2)
 def local_gemm16_alpha_merge(node, *inputs):