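Remove module `theano.gpuarray.nerv` (the nervanagpu-based `Gemm16` op).

Delete `gemm16.c`, the `nerv` tests and the documentation entry, drop the
import from `theano/gpuarray/__init__.py`, and replace `nerv.py` with a stub
that raises `SkipTest` on import.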

808f855b · notoraptor · 13ff40a3 · 808f855b · 808f855b · 13ff40a3
--- a/doc/library/gpuarray/op.txt
+++ b/doc/library/gpuarray/op.txt
@@ -22,9 +22,6 @@ Blas Op
 .. automodule:: theano.gpuarray.blas
    :members:
-.. automodule:: theano.gpuarray.nerv
-    :members:
 Elemwise Op
 ===========

--- a/theano/gpuarray/__init__.py
+++ b/theano/gpuarray/__init__.py
@@ -29,7 +29,7 @@ from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
                   GpuArraySharedVariable, gpuarray_shared_constructor,
                   reg_context, get_context, ContextNotDefined)
 from .basic_ops import as_gpuarray_variable
-from . import fft, dnn, opt, nerv, extra_ops, multinomial, reduction, rng_mrg
+from . import fft, dnn, opt, extra_ops, multinomial, reduction, rng_mrg
 def transfer(x, target):

--- a/theano/gpuarray/gemm16.c
+++ b/theano/gpuarray/gemm16.c
-#section init_code_struct
-/* Why do we need this? */
-size_t dim = 2048 * 32;
-rand_buf = pygpu_empty(1, &dim, GA_UINT, GA_C_ORDER, PARAMS,
-                       Py_None);
-if (rand_buf == NULL) {
-  FAIL;
-}
-#section support_code_struct
-PyGpuArrayObject *rand_buf;
-int gemm16(PyGpuArrayObject *C, float alpha,
-           PyGpuArrayObject *A, PyGpuArrayObject *B,
-           float beta, PyGpuArrayObject **out,
-           PyGpuContextObject *c) {
-  PyGpuArrayObject *_A = NULL;
-  PyGpuArrayObject *_B = NULL;
-  GpuKernel *gk;
-  char *prand, *pA, *pB, *pout;
-  void *params[13];
-  size_t grid[2];
-  size_t threads[2];
-  int res = 0;
-  int flags = 0;
-  int lda, ldb, ldc, n, m, k;
-  int n128, n64;
-  int size = 0;
-  int vec = 0;
-  static unsigned int nprocs = 0;
-  char opA, opB;
-  if (GpuArray_CHKFLAGS(&A->ga, GA_FARRAY) &&
-      GpuArray_CHKFLAGS(&B->ga, GA_FARRAY)) {
-    /*
-     * The nervana kernels do not cover the case where both inputs are
-     * trans so we need to copy one of them.  We choose the smallest
-     * one.
-     */
-    if (PyGpuArray_DIM(A, 0) * PyGpuArray_DIM(A, 1) <
-        PyGpuArray_DIM(B, 0) * PyGpuArray_DIM(B, 1)) {
-      _A = pygpu_copy(A, GA_C_ORDER);
-      if (_A == NULL) {
-        res = 1;
-        goto cleanup;
-      }
-      /*
-       * This is not an extra reference on _A so don't add an INCREF.
-       * Also, we don't lose the ref on A since our caller will deal
-       * with it.
-       */
-      A = _A;
-    } else {
-      _B = pygpu_copy(B, GA_C_ORDER);
-      if (_B == NULL) {
-        res = 1;
-        goto cleanup;
-      }
-      /*
-       * This is not an extra reference on _B so don't add an INCREF
-       * Also, we don't lose the ref on B since our caller will deal
-       * with it.
-       */
-      B = _B;
-    }
-  }
-  if (GEMM16_INPLACE && GpuArray_CHKFLAGS(&C->ga, GA_CARRAY)) {
-    Py_XDECREF(*out);
-    *out = C;
-    Py_INCREF(*out);
-  } else {
-    *out = theano_try_copy(*out, C);
-    if (*out == NULL) {
-      res = 1;
-      goto cleanup;
-    }
-  }
-  if (GpuArray_CHKFLAGS(&A->ga, GA_FARRAY)) {
-    opA = 't';
-    lda = PyGpuArray_STRIDE(A, 1);
-  } else {
-    opA = 'n';
-    lda = PyGpuArray_STRIDE(A, 0);
-  }
-  if (GpuArray_CHKFLAGS(&B->ga, GA_FARRAY)) {
-    opB = 't';
-    ldb = PyGpuArray_STRIDE(B, 1);
-  } else {
-    opB = 'n';
-    ldb = PyGpuArray_STRIDE(B, 0);
-  }
-  ldc = PyGpuArray_STRIDE(*out, 0);
-  /* lda and friend are in number of elements, not bytes */
-  lda /= 2;
-  ldb /= 2;
-  ldc /= 2;
-  m = PyGpuArray_DIM(*out, 0);
-  n = PyGpuArray_DIM(*out, 1);
-  k = PyGpuArray_DIM(B, 0);
-  /* Tuning code adapted from the python version */
-  grid[0] = (m + 127) / 128;
-  if (opA == 'n' && opB == 't')
-    size = 128;
-  else {
-    if (n < 384-16) {
-      n128 = n % 128;
-      if (n128 < 112) {
-        if (48 < n128 && n128 <= 64) {
-          n64 = n / 64;
-          if (nprocs == 0)
-            if (gpucontext_property(A->context->ctx,
-                                    GA_CTX_PROP_NUMPROCS, &nprocs)) {
-              nprocs = 0;
-              res = 1;
-              goto cleanup;
-            }
-          n64 *= (grid[0] / nprocs);
-          if (n64 > 1 || (opA == 't' && opB == 'n'))
-            size = 64;
-          else
-            size = 32;
-        } else {
-          size = 32;
-        }
-      } else {
-        size = 128;
-      }
-    } else {
-      size = 128;
-    }
-  }
-  grid[1] = (n + (size-1)) / size;
-  if (size == 128)
-    threads[0] = 256;
-  else
-    threads[0] = 128;
-  threads[1] = 1;
-  if ((opA == 't' && opB == 'n' && m % 8 == 0 && n % 8 == 0) ||
-      (opA == 'n' && opB == 'n' && k % 16 == 0 && n % 8 == 0) ||
-      (opA == 'n' && opB == 't' && k % 16 == 0))
-    vec = 1;
-  switch (size) {
-  case 128:
-    if (opA == 'n' && opB == 'n') {
-      if (vec)
-        gk = &k_nn_vec_128x128;
-      else
-        gk = &k_nn_128x128;
-    } else if (opA == 'n' && opB == 't') {
-      if (vec)
-        gk = &k_nt_vec_128x128;
-      else
-        gk = &k_nt_128x128;
-    } else if (opA == 't' && opB == 'n') {
-      if (vec)
-        gk = &k_tn_vec_128x128;
-      else
-        gk = &k_tn_128x128;
-    }
-    break;
-  case 64:
-    if (opA == 'n' && opB == 'n') {
-      if (vec)
-        gk = &k_nn_vec_128x64;
-      else
-        gk = &k_nn_128x64;
-    } else if (opA == 't' && opB == 'n') {
-      if (vec)
-        gk = &k_tn_vec_128x64;
-      else
-        gk = &k_tn_128x64;
-    }
-    break;
-  case 32:
-    if (opA == 'n' && opB == 'n') {
-      if (vec)
-        gk = &k_nn_vec_128x32;
-      else
-        gk = &k_nn_128x32;
-    } else if (opA == 't' && opB == 'n') {
-      if (vec)
-        gk = &k_tn_vec_128x32;
-      else
-        gk = &k_tn_128x32;
-    }
-    break;
-  default:
-    PyErr_SetString(PyExc_RuntimeError, "error selecting kernel");
-    res = 1;
-    goto cleanup;
-  }
-  prand = *((char **)rand_buf->ga.data);
-  prand += rand_buf->ga.offset;
-  pA = *((char **)A->ga.data);
-  pA += A->ga.offset;
-  pB = *((char **)B->ga.data);
-  pB += B->ga.offset;
-  pout = *((char **)(*out)->ga.data);
-  pout += (*out)->ga.offset;
-  params[0] = &prand;
-  params[1] = &pA;
-  params[2] = &pB;
-  params[3] = &pout;
-  params[4] = &lda;
-  params[5] = &ldb;
-  params[6] = &ldc;
-  params[7] = &m;
-  params[8] = &n;
-  params[9] = &k;
-  params[10] = &alpha;
-  params[11] = &beta;
-  params[12] = &flags;
-  if (GpuKernel_call(gk, 2, grid, threads, 0, params) != GA_NO_ERROR) {
-    PyErr_SetString(PyExc_RuntimeError, "error in gemm16 kernel call");
-    res = 1;
-  }
-cleanup:
-  Py_XDECREF(_A);
-  Py_XDECREF(_B);
-  return res;
-}
--- a/theano/gpuarray/nerv.py
+++ b/theano/gpuarray/nerv.py
-from __future__ import absolute_import, print_function, division
+from nose.plugins.skip import SkipTest
-import os.path
-import theano
-from theano import Apply, Variable, tensor
-from theano.compile import optdb
+raise SkipTest("You are importing theano.gpuarray.nerv. "
-from theano.compile.ops import shape_i
+               "This module was removed as it was based on nervanagpu that is now deprecated. "
-from theano.gof import local_optimizer, COp
+               "To still get this module, use Theano 0.9. "
-from theano.scalar import as_scalar, constant
+               "More info about nervanagpu here: https://github.com/NervanaSystems/nervanagpu "
+               "(viewed on 2017/07/05).")
-from . import opt
-from .basic_ops import (as_gpuarray_variable, GpuAllocEmpty,
-                        infer_context_name)
-from .type import gpu_context_type
-from .opt_util import alpha_merge, output_merge
-try:
-    from nervanagpu.nervanagpu import GPUTensor, NervanaGPU
-    nerv = NervanaGPU()
-except ImportError:
-    GPUTensor = None
-    nerv = None
-def to_gputensor(a):
-    assert a.flags.c_contiguous or a.flags.f_contiguous
-    return GPUTensor(a.shape, dtype=a.dtype, base=a,
-                     gpudata=a.gpudata + a.offset,
-                     strides=a.strides, is_trans=a.flags.f_contiguous)
-def ensure_float(val, name):
-    if not isinstance(val, Variable):
-        val = constant(val)
-    if hasattr(val, 'ndim') and val.ndim == 0:
-        val = as_scalar(val)
-    if not isinstance(val.type, theano.scalar.Scalar):
-        raise TypeError("%s: expected a scalar value" % (name,))
-    if not val.type.dtype == 'float32':
-        raise TypeError("%s: type is not float32" % (name,))
-    return val
-class Gemm16(COp):
-    """
-    Gemm for float16 using the nervena kernels.
-    """
-    __props__ = ('relu', 'inplace')
-    _f16_ok = True
-    params_type = gpu_context_type
-    KERN_NAMES = ('nn_128x128', 'nn_128x64', 'nn_128x32',
-                  'nn_vec_128x128', 'nn_vec_128x64', 'nn_vec_128x32',
-                  'tn_128x128', 'tn_128x64', 'tn_128x32',
-                  'tn_vec_128x128', 'tn_vec_128x64', 'tn_vec_128x32',
-                  'tn_vec_128x16', 'nt_128x128', 'nt_vec_128x128')
-    def __init__(self, relu=False, inplace=False):
-        COp.__init__(self, ["gemm16.c"], "gemm16")
-        self.relu = relu
-        # relu = True will require more work in optimizations.
-        assert self.relu is False
-        self.inplace = inplace
-        if self.inplace:
-            self.destroy_map = {0: [0]}
-    def make_node(self, C, alpha, A, B, beta):
-        if GPUTensor is None:
-            raise RuntimeError("Can't use Gemm16: nervanagpu not found")
-        ctx_name = infer_context_name(C, A, B)
-        A = as_gpuarray_variable(A, ctx_name)
-        B = as_gpuarray_variable(B, ctx_name)
-        C = as_gpuarray_variable(C, ctx_name)
-        alpha = ensure_float(alpha, 'alpha')
-        beta = ensure_float(beta, 'beta')
-        assert C.dtype == A.dtype == B.dtype == 'float16'
-        return Apply(self, [C, alpha, A, B, beta], [C.type()])
-    def get_params(self, node):
-        return node.inputs[0].type.context
-    def c_headers(self):
-        return ['gpuarray/types.h', 'numpy_compat.h', 'gpuarray_helper.h',
-                'string.h']
-    def c_header_dirs(self):
-        return [os.path.dirname(__file__)]
-    def get_op_params(self):
-        return [('GEMM16_INPLACE', '1' if self.inplace else '0')]
-    @staticmethod
-    def cubin_to_code(name):
-        fname = 'hgemm_{0}.cubin'.format(name)
-        with open(os.path.join(nerv.cubin_path, fname)) as f:
-            cubin = f.read()
-        bcode = ','.join(hex(ord(c)) for c in cubin)
-        return "static const char bin_%s[] = { %s };" % (name, bcode)
-    @staticmethod
-    def init_gpukernel(name, fail):
-        return """
-bcode = bin_%(name)s;
-sz = sizeof(bin_%(name)s);
-if (GpuKernel_init(&k_%(name)s, c->ctx, 1, &bcode, &sz,
-                   "hgemm_%(name)s", 13, types, GA_USE_BINARY, NULL)
-    != GA_NO_ERROR) {
-  PyErr_SetString(PyExc_RuntimeError, "Could not initialize kernel %(name)s");
-  %(fail)s;
-}
-""" % dict(name=name, fail=fail)
-    def c_support_code(self):
-        codel = []
-        for name in self.KERN_NAMES:
-            codel.append(Gemm16.cubin_to_code(name))
-        return '\n'.join(codel)
-    def c_support_code_struct(self, node, nodename):
-        codel = []
-        for name in self.KERN_NAMES:
-            codel.append("GpuKernel k_{0};".format(name))
-        codel.append(super(Gemm16, self).c_support_code_struct(node, nodename))
-        return '\n'.join(codel)
-    def c_init_code_struct(self, node, nodename, sub):
-        codel = [super(Gemm16, self).c_init_code_struct(node, nodename, sub)]
-        for name in self.KERN_NAMES:
-            codel.append("memset(&k_{0}, 0, sizeof(GpuKernel));".format(name))
-        codel.append("const char *bcode;")
-        codel.append("size_t sz;")
-        codel.append("PyGpuContextObject *c = %s;" % (sub['params'],))
-        codel.append("int types[13] = {GA_BUFFER, GA_BUFFER, GA_BUFFER, "
-                     "GA_BUFFER, GA_INT, GA_INT, GA_INT, GA_INT, GA_INT, "
-                     "GA_INT, GA_FLOAT, GA_FLOAT, GA_INT};")
-        for name in self.KERN_NAMES:
-            codel.append(self.init_gpukernel(name, sub['fail']))
-        return '\n'.join(codel)
-    def c_cleanup_code_struct(self, node, nodename):
-        codel = []
-        for name in self.KERN_NAMES:
-            codel.append("GpuKernel_clear(&k_{0});".format(name))
-        return '\n'.join(codel)
-@opt.register_opt('fast_compile')
-@opt.op_lifter([tensor.Dot])
-@opt.register_opt2([tensor.Dot], 'fast_compile')
-def local_gpua_dot_to_gemm16(op, ctx_name, inputs, outputs):
-    if nerv is None:
-        return
-    A = inputs[0]
-    B = inputs[1]
-    if (A.ndim == 2 and B.ndim == 2 and
-            A.dtype == 'float16' and B.dtype == 'float16'):
-        fgraph = getattr(outputs[0], 'fgraph', None)
-        C = GpuAllocEmpty('float16', ctx_name)(
-            shape_i(A, 0, fgraph), shape_i(B, 1, fgraph))
-        return Gemm16()(C, 1.0, A, B, 0.0)
-@opt.register_opt()
-@alpha_merge(Gemm16, alpha_in=1, beta_in=4)
-def local_gemm16_alpha_merge(node, *inputs):
-    return [Gemm16(relu=node.op.relu)(*inputs)]
-@opt.register_opt()
-@output_merge(Gemm16, alpha_in=1, beta_in=4, out_in=0)
-def local_gemm16_output_merge(node, *inputs):
-    return [Gemm16(relu=node.op.relu)(*inputs)]
-@local_optimizer([Gemm16], inplace=True)
-def local_gemm16_inplace(node):
-    if type(node.op) != Gemm16 or node.op.inplace:
-        return
-    inputs = list(node.inputs)
-    C = inputs[0]
-    if (C.owner and
-            isinstance(C.owner.op, GpuAllocEmpty) and
-            len(C.clients) > 1):
-        inputs[0] = C.owner.op(*C.owner.inputs)
-    return [Gemm16(relu=node.op.relu, inplace=True)(*inputs)]
-optdb.register('local_gemm16_inplace',
-               tensor.opt.in2out(local_gemm16_inplace,
-                                 name='local_gemm16_inplace'),
-               70.0, 'fast_run', 'inplace', 'gpuarray')
--- a/theano/gpuarray/tests/test_nerv.py
+++ b/theano/gpuarray/tests/test_nerv.py
-from __future__ import absolute_import, print_function, division
-from nose.plugins.skip import SkipTest
-import numpy as np
-from theano import function
-from theano.tests import unittest_tools as utt
-from theano.tensor import vector, matrix, dot
-from .config import mode_with_gpu
-from ..nerv import Gemm16, nerv
-def test_gemm16_swap():
-    if nerv is None:
-        raise SkipTest("nervanagpu not available")
-    v = vector(dtype='float16')
-    m = matrix(dtype='float16')
-    m2 = matrix(dtype='float16')
-    m32 = matrix(dtype='float32')
-    # test that we don't try to replace anything but matrix x matrix in float16
-    f = function([v, m], dot(v, m), mode=mode_with_gpu)
-    assert len([node for node in f.maker.fgraph.apply_nodes
-                if isinstance(node.op, Gemm16)]) == 0
-    f = function([m32, m], dot(m32, m), mode=mode_with_gpu)
-    assert len([node for node in f.maker.fgraph.apply_nodes
-                if isinstance(node.op, Gemm16)]) == 0
-    f = function([m, m2], dot(m, m2), mode=mode_with_gpu)
-    assert len([node for node in f.maker.fgraph.apply_nodes
-                if isinstance(node.op, Gemm16)]) == 1
-def test_gemm16_value():
-    if nerv is None:
-        raise SkipTest("nervanagpu not available")
-    m = matrix(dtype='float16')
-    m2 = matrix(dtype='float16')
-    f = function([m, m2], dot(m, m2), mode=mode_with_gpu)
-    v1 = np.random.random((3, 4)).astype('float16')
-    v2 = np.random.random((4, 2)).astype('float16')
-    of = f(v1, v2)
-    on = np.dot(v1, v2)
-    utt.assert_allclose(of, on)