提交 9eeafd89 authored 作者: Arnaud Bergeron's avatar Arnaud Bergeron

Update nerv ops for type context.

This unfortunately renders pycuda unusable without a significant amount of work which I am not ready to do right now. Since we have C code for this op I just removed the python code.
上级 6c4bdd7d
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
/* Why do we need this? */ /* Why do we need this? */
size_t dim = 2048 * 32; size_t dim = 2048 * 32;
rand_buf = pygpu_empty(1, &dim, GA_UINT, GA_C_ORDER, pygpu_default_context(), rand_buf = pygpu_empty(1, &dim, GA_UINT, GA_C_ORDER, CONTEXT,
Py_None); Py_None);
if (rand_buf == NULL) { if (rand_buf == NULL) {
FAIL; FAIL;
......
...@@ -8,10 +8,10 @@ from theano.gof import local_optimizer, COp ...@@ -8,10 +8,10 @@ from theano.gof import local_optimizer, COp
from theano.scalar import as_scalar, constant from theano.scalar import as_scalar, constant
from . import opt from . import opt
from .basic_ops import (as_gpuarray_variable, GpuAllocEmpty) from .basic_ops import (as_gpuarray_variable, GpuAllocEmpty,
infer_context_name)
from .type import gpu_context_type
from .opt_util import alpha_merge, output_merge from .opt_util import alpha_merge, output_merge
from .pycuda_helper import ensure_pycuda_context
try: try:
from nervanagpu.nervanagpu import GPUTensor, NervanaGPU from nervanagpu.nervanagpu import GPUTensor, NervanaGPU
...@@ -43,6 +43,7 @@ def ensure_float(val, name): ...@@ -43,6 +43,7 @@ def ensure_float(val, name):
class Gemm16(COp): class Gemm16(COp):
__props__ = ('relu', 'inplace') __props__ = ('relu', 'inplace')
_f16_ok = True _f16_ok = True
context_type = gpu_context_type
KERN_NAMES = ('nn_128x128', 'nn_128x64', 'nn_128x32', KERN_NAMES = ('nn_128x128', 'nn_128x64', 'nn_128x32',
'nn_vec_128x128', 'nn_vec_128x64', 'nn_vec_128x32', 'nn_vec_128x128', 'nn_vec_128x64', 'nn_vec_128x32',
'tn_128x128', 'tn_128x64', 'tn_128x32', 'tn_128x128', 'tn_128x64', 'tn_128x32',
...@@ -61,10 +62,11 @@ class Gemm16(COp): ...@@ -61,10 +62,11 @@ class Gemm16(COp):
def make_node(self, C, alpha, A, B, beta): def make_node(self, C, alpha, A, B, beta):
if GPUTensor is None: if GPUTensor is None:
raise RuntimeError("Can't use Gemm16: nervanagpu not found") raise RuntimeError("Can't use Gemm16: nervanagpu not found")
ctx_name = infer_context_name(C, A, B)
A = as_gpuarray_variable(A) A = as_gpuarray_variable(A, ctx_name)
B = as_gpuarray_variable(B) B = as_gpuarray_variable(B, ctx_name)
C = as_gpuarray_variable(C) C = as_gpuarray_variable(C, ctx_name)
alpha = ensure_float(alpha, 'alpha') alpha = ensure_float(alpha, 'alpha')
beta = ensure_float(beta, 'beta') beta = ensure_float(beta, 'beta')
...@@ -73,27 +75,8 @@ class Gemm16(COp): ...@@ -73,27 +75,8 @@ class Gemm16(COp):
return Apply(self, [C, alpha, A, B, beta], [C.type()]) return Apply(self, [C, alpha, A, B, beta], [C.type()])
def perform(self, node, inputs, outputs): def get_context(self, node):
ensure_pycuda_context() return node.inputs[0].type.context
C, alpha, A, B, beta = inputs
# The nervana code does not support the case where both inputs
are trans, so we need to copy one of them if that is the
# case. We copy the smaller one.
if A.flags.f_contiguous and B.flags.f_contiguous:
if A.size < B.size:
A = A.copy()
else:
B = B.copy()
inplace = self.inplace
if inplace and not C.flags.c_contiguous:
inplace = False
if not inplace:
C = C.copy()
At = to_gputensor(A)
Bt = to_gputensor(B)
Ct = to_gputensor(C)
nerv.dot(At, Bt, Ct, alpha=alpha, beta=beta, relu=False)
outputs[0][0] = C
def c_headers(self): def c_headers(self):
return ['gpuarray/types.h', 'numpy_compat.h', 'gpuarray_helper.h', return ['gpuarray/types.h', 'numpy_compat.h', 'gpuarray_helper.h',
...@@ -145,7 +128,7 @@ if (GpuKernel_init(&k_%(name)s, c->ops, c->ctx, 1, &bcode, &sz, ...@@ -145,7 +128,7 @@ if (GpuKernel_init(&k_%(name)s, c->ops, c->ctx, 1, &bcode, &sz,
codel.append("memset(&k_{0}, 0, sizeof(GpuKernel));".format(name)) codel.append("memset(&k_{0}, 0, sizeof(GpuKernel));".format(name))
codel.append("const char *bcode;") codel.append("const char *bcode;")
codel.append("size_t sz;") codel.append("size_t sz;")
codel.append("PyGpuContextObject *c = pygpu_default_context();") codel.append("PyGpuContextObject *c = %s;" % (sub['context'],))
codel.append("int types[13] = {GA_BUFFER, GA_BUFFER, GA_BUFFER, " codel.append("int types[13] = {GA_BUFFER, GA_BUFFER, GA_BUFFER, "
"GA_BUFFER, GA_INT, GA_INT, GA_INT, GA_INT, GA_INT, " "GA_BUFFER, GA_INT, GA_INT, GA_INT, GA_INT, GA_INT, "
"GA_INT, GA_FLOAT, GA_FLOAT, GA_INT};") "GA_INT, GA_FLOAT, GA_FLOAT, GA_INT};")
...@@ -170,7 +153,8 @@ def local_dot_to_gemm16(node): ...@@ -170,7 +153,8 @@ def local_dot_to_gemm16(node):
if (A.ndim == 2 and B.ndim == 2 and if (A.ndim == 2 and B.ndim == 2 and
A.dtype == 'float16' and B.dtype == 'float16'): A.dtype == 'float16' and B.dtype == 'float16'):
fgraph = node.inputs[0].fgraph fgraph = node.inputs[0].fgraph
C = GpuAllocEmpty(dtype='float16')( ctx_name = infer_context_name(A, B)
C = GpuAllocEmpty(dtype='float16', context_name=ctx_name)(
shape_i(A, 0, fgraph), shape_i(B, 1, fgraph)) shape_i(A, 0, fgraph), shape_i(B, 1, fgraph))
return Gemm16()(C, 1.0, A, B, 0.0) return Gemm16()(C, 1.0, A, B, 0.0)
......
# Guarded import: PyCUDA is optional, and we additionally require a version
# new enough to provide Context.attach(). On any failure we record the
# absence by leaving Context as None.
try:
    from pycuda.driver import Context
    if not hasattr(Context, 'attach'):
        raise ImportError('too old')
except ImportError:
    Context = None

# Module-level lazy-init state: whether we have attached yet, and the
# attached context object (None until first successful call).
pycuda_initialized = False
pycuda_context = None


def ensure_pycuda_context():
    """Attach to the current PyCUDA context on first use and return it.

    Subsequent calls return the cached context. The matching ``detach`` is
    registered with ``atexit`` so the attachment is released at shutdown.

    Raises
    ------
    RuntimeError
        If PyCUDA is not installed or lacks ``Context.attach``.
    """
    global pycuda_context, pycuda_initialized
    if pycuda_initialized:
        return pycuda_context
    if Context is None:
        raise RuntimeError("PyCUDA not found or too old.")
    pycuda_context = Context.attach()
    import atexit
    atexit.register(pycuda_context.detach)
    pycuda_initialized = True
    return pycuda_context
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论