Merge pull request #4542 from abergeron/api_changes

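libgpuarray API changes: update theano/gpuarray for the new libgpuarray/pygpu API (expected major API version -9997), which drops the context `ops` table in favour of direct functions (gpucontext_property, gpudata_alloc, gpublas_*, ...) and compares context.kind against bytes (b'cuda', b'opencl').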

Merge pull request #4542 from abergeron/api_changes
b895c6e8 · Frédéric Bastien · 19619e9a · c59491c7 · b895c6e8 · b895c6e8
--- a/theano/gpuarray/__init__.py
+++ b/theano/gpuarray/__init__.py
@@ -42,7 +42,7 @@ register_transfer(transfer)

 def init_dev(dev, name=None):
    v = pygpu.gpuarray.api_version()
-    expected = -9998
+    expected = -9997
    if v[0] != expected:
        raise RuntimeError("Wrong major API version for gpuarray:", v[0],
                           "Make sure Theano and libgpuarray/pygpu "

--- a/theano/gpuarray/basic_ops.py
+++ b/theano/gpuarray/basic_ops.py
@@ -259,14 +259,14 @@ class GpuKernelBase(object):
  int types[%(numargs)u] = {%(types)s};
  const char *bcode = %(bvar)s;
  size_t sz = sizeof(%(bvar)s);
-  if (GpuKernel_init(&%(ovar)s, %(ctx)s->ops, %(ctx)s->ctx, 1, &bcode, &sz,
+  if (GpuKernel_init(&%(ovar)s, %(ctx)s->ctx, 1, &bcode, &sz,
                     "%(kname)s", %(numargs)u, types, GA_USE_BINARY, NULL)
      != GA_NO_ERROR) {
-    if ((err = GpuKernel_init(&%(ovar)s, %(ctx)s->ops, %(ctx)s->ctx, 1,
+    if ((err = GpuKernel_init(&%(ovar)s, %(ctx)s->ctx, 1,
                              &%(cname)s, NULL, "%(kname)s", %(numargs)u,
                              types, %(flags)s, NULL)) != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError, "GpuKernel_init error %%d: %%s",
-                   err, Gpu_error(%(ctx)s->ops, %(ctx)s->ctx, err));
+                   err, gpucontext_error(%(ctx)s->ctx, err));
      %(fail)s
    }
  }
@@ -310,7 +310,7 @@ class GpuKernelBase(object):
            The node that we need the cache version for.

        """
-        return (3, self.get_params(node).bin_id)
+        return (4, self.get_params(node).bin_id)


 class HostFromGpu(Op):

--- a/theano/gpuarray/blockgemv.c
+++ b/theano/gpuarray/blockgemv.c
@@ -24,16 +24,9 @@ int APPLY_SPECIFIC(blockgemv)(PyGpuArrayObject *o, PyGpuArrayObject *W,
  size_t *offW = NULL;
  size_t *offInp = NULL;
  size_t *offOut = NULL;
-  gpuarray_blas_ops *blas_ops;
  int err;

-  err = ctx->ops->property(ctx->ctx, NULL, NULL,
-                           GA_CTX_PROP_BLAS_OPS, &blas_ops);
-  if (err != GA_NO_ERROR) {
-    PyErr_SetString(PyExc_RuntimeError, "Can't get blas ops");
-    return -1;
-  }
-  err = blas_ops->setup(ctx->ctx);
+  err = gpublas_setup(ctx->ctx);
  if (err != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "Can't setup blas");
    return -1;
@@ -93,29 +86,29 @@ int APPLY_SPECIFIC(blockgemv)(PyGpuArrayObject *o, PyGpuArrayObject *W,
  }

  if (out->ga.typecode == GA_FLOAT) {
-    err = blas_ops->sgemvBatch(cb_fortran, transA,
-                               PyGpuArray_DIMS(out)[2],
-                               PyGpuArray_DIMS(h)[2], 1,
-                               W_list, offW, lda,
-                               inp_list, offInp, PyGpuArray_STRIDES(h)[2] / gpuarray_get_elsize(h->ga.typecode),
-                               1, out_list, offOut, PyGpuArray_STRIDES(out)[2] / gpuarray_get_elsize(out->ga.typecode),
-                               PyGpuArray_DIMS(out)[1] * PyGpuArray_DIMS(h)[1] * PyGpuArray_DIMS(out)[0], 0);
+    err = gpublas_sgemvBatch(cb_fortran, transA,
+                             PyGpuArray_DIMS(out)[2],
+                             PyGpuArray_DIMS(h)[2], 1,
+                             W_list, offW, lda,
+                             inp_list, offInp, PyGpuArray_STRIDES(h)[2] / gpuarray_get_elsize(h->ga.typecode),
+                             1, out_list, offOut, PyGpuArray_STRIDES(out)[2] / gpuarray_get_elsize(out->ga.typecode),
+                             PyGpuArray_DIMS(out)[1] * PyGpuArray_DIMS(h)[1] * PyGpuArray_DIMS(out)[0], 0);
  } else if (out->ga.typecode == GA_DOUBLE) {
-    err = blas_ops->dgemvBatch(cb_fortran, transA,
-                               PyGpuArray_DIMS(out)[2],
-                               PyGpuArray_DIMS(h)[2], 1,
-                               W_list, offW, lda,
-                               inp_list, offInp, PyGpuArray_STRIDES(h)[2] / gpuarray_get_elsize(h->ga.typecode),
-                               1, out_list, offOut, PyGpuArray_STRIDES(out)[2] / gpuarray_get_elsize(out->ga.typecode),
-                               PyGpuArray_DIMS(out)[1] * PyGpuArray_DIMS(h)[1] * PyGpuArray_DIMS(out)[0], 0);
+    err = gpublas_dgemvBatch(cb_fortran, transA,
+                             PyGpuArray_DIMS(out)[2],
+                             PyGpuArray_DIMS(h)[2], 1,
+                             W_list, offW, lda,
+                             inp_list, offInp, PyGpuArray_STRIDES(h)[2] / gpuarray_get_elsize(h->ga.typecode),
+                             1, out_list, offOut, PyGpuArray_STRIDES(out)[2] / gpuarray_get_elsize(out->ga.typecode),
+                             PyGpuArray_DIMS(out)[1] * PyGpuArray_DIMS(h)[1] * PyGpuArray_DIMS(out)[0], 0);
  } else if (out->ga.typecode == GA_HALF) {
-    err = blas_ops->sgemvBatch(cb_fortran, transA,
-                               PyGpuArray_DIMS(out)[2],
-                               PyGpuArray_DIMS(h)[2], 1,
-                               W_list, offW, lda,
-                               inp_list, offInp, PyGpuArray_STRIDES(h)[2] / gpuarray_get_elsize(h->ga.typecode),
-                               1, out_list, offOut, PyGpuArray_STRIDES(out)[2] / gpuarray_get_elsize(out->ga.typecode),
-                               PyGpuArray_DIMS(out)[1] * PyGpuArray_DIMS(h)[1] * PyGpuArray_DIMS(out)[0], 0);
+    err = gpublas_sgemvBatch(cb_fortran, transA,
+                             PyGpuArray_DIMS(out)[2],
+                             PyGpuArray_DIMS(h)[2], 1,
+                             W_list, offW, lda,
+                             inp_list, offInp, PyGpuArray_STRIDES(h)[2] / gpuarray_get_elsize(h->ga.typecode),
+                             1, out_list, offOut, PyGpuArray_STRIDES(out)[2] / gpuarray_get_elsize(out->ga.typecode),
+                             PyGpuArray_DIMS(out)[1] * PyGpuArray_DIMS(h)[1] * PyGpuArray_DIMS(out)[0], 0);
  } else {
    err = GA_INVALID_ERROR;
  }

--- a/theano/gpuarray/blockger.c
+++ b/theano/gpuarray/blockger.c
@@ -12,16 +12,9 @@ int APPLY_SPECIFIC(blockger)(PyGpuArrayObject *o, PyGpuArrayObject *x,
  size_t *offOut = NULL;
  size_t *offX = NULL;
  size_t *offY = NULL;
-  gpuarray_blas_ops *blas_ops;
  int err;

-  err = ctx->ops->property(ctx->ctx, NULL, NULL,
-                           GA_CTX_PROP_BLAS_OPS, &blas_ops);
-  if (err != GA_NO_ERROR) {
-    PyErr_SetString(PyExc_RuntimeError, "Can't get blas ops");
-    return -1;
-  }
-  err = blas_ops->setup(ctx->ctx);
+  err = gpublas_setup(ctx->ctx);
  if (err != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "Can't setup blas");
    return -1;
@@ -84,26 +77,26 @@ int APPLY_SPECIFIC(blockger)(PyGpuArrayObject *o, PyGpuArrayObject *x,
  ssize_t str_out = PyGpuArray_STRIDES(out)[2] / gpuarray_get_elsize(out->ga.typecode);

  if (out->ga.typecode == GA_FLOAT) {
-    err = blas_ops->sgerBatch(cb_fortran,
-                              PyGpuArray_DIMS(y)[2], PyGpuArray_DIMS(x)[2],
-                              *(float *)PyArray_GETPTR1(alpha, 0),
-                              y_list, offY, str_y, x_list, offX, str_x,
-                              o_list, offOut, str_out,
-                              PyGpuArray_DIMS(x)[0] * PyGpuArray_DIMS(x)[1] * PyGpuArray_DIMS(y)[1], 0);
+    err = gpublas_sgerBatch(cb_fortran,
+                            PyGpuArray_DIMS(y)[2], PyGpuArray_DIMS(x)[2],
+                            *(float *)PyArray_GETPTR1(alpha, 0),
+                            y_list, offY, str_y, x_list, offX, str_x,
+                            o_list, offOut, str_out,
+                            PyGpuArray_DIMS(x)[0] * PyGpuArray_DIMS(x)[1] * PyGpuArray_DIMS(y)[1], 0);
  } else if (out->ga.typecode == GA_DOUBLE) {
-    err = blas_ops->dgerBatch(cb_fortran,
-                              PyGpuArray_DIMS(y)[2], PyGpuArray_DIMS(x)[2],
-                              *(double *)PyArray_GETPTR1(alpha, 0),
-                              y_list, offY, str_y, x_list, offX, str_x,
-                              o_list, offOut, str_out,
-                              PyGpuArray_DIMS(x)[0] * PyGpuArray_DIMS(x)[1] * PyGpuArray_DIMS(y)[1], 0);
+    err = gpublas_dgerBatch(cb_fortran,
+                            PyGpuArray_DIMS(y)[2], PyGpuArray_DIMS(x)[2],
+                            *(double *)PyArray_GETPTR1(alpha, 0),
+                            y_list, offY, str_y, x_list, offX, str_x,
+                            o_list, offOut, str_out,
+                            PyGpuArray_DIMS(x)[0] * PyGpuArray_DIMS(x)[1] * PyGpuArray_DIMS(y)[1], 0);
  } else if (out->ga.typecode == GA_HALF) {
-    err = blas_ops->hgerBatch(cb_fortran,
-                              PyGpuArray_DIMS(y)[2], PyGpuArray_DIMS(x)[2],
-                              *(float *)PyArray_GETPTR1(alpha, 0),
-                              y_list, offY, str_y, x_list, offX, str_x,
-                              o_list, offOut, str_out,
-                              PyGpuArray_DIMS(x)[0] * PyGpuArray_DIMS(x)[1] * PyGpuArray_DIMS(y)[1], 0);
+    err = gpublas_hgerBatch(cb_fortran,
+                            PyGpuArray_DIMS(y)[2], PyGpuArray_DIMS(x)[2],
+                            *(float *)PyArray_GETPTR1(alpha, 0),
+                            y_list, offY, str_y, x_list, offX, str_x,
+                            o_list, offOut, str_out,
+                            PyGpuArray_DIMS(x)[0] * PyGpuArray_DIMS(x)[1] * PyGpuArray_DIMS(y)[1], 0);
  } else {
    err = GA_INVALID_ERROR;
  }

--- a/theano/gpuarray/dnn.py
+++ b/theano/gpuarray/dnn.py
@@ -125,7 +125,7 @@ def dnn_available(context_name):

    ctx = get_context(context_name)

-    if not ctx.kind == 'cuda':
+    if not ctx.kind == b'cuda':
        dnn_available.msg = "Not on a CUDA device."
        return False


--- a/theano/gpuarray/dnn_fwd.c
+++ b/theano/gpuarray/dnn_fwd.c
@@ -105,7 +105,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
    algo = choice.algo;
 #else
    size_t free;
-    int err2 = c->ops->property(c->ctx, NULL, NULL, GA_CTX_PROP_FREE_GMEM, &free);
+    int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_FREE_GMEM, &free);

    if (err2 != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
@@ -234,7 +234,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
     * to place a nice get_work_mem() function in.
     */
    if (worksize != 0) {
-      workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL);
+      workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
      if (workspace == NULL) {
        PyErr_SetString(PyExc_RuntimeError,
                        "Could not allocate working memory");
@@ -258,7 +258,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
      APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output));

    if (worksize != 0)
-      c->ops->buffer_release(workspace);
+      gpudata_release(workspace);

    cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
    cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);

--- a/theano/gpuarray/dnn_gi.c
+++ b/theano/gpuarray/dnn_gi.c
@@ -106,7 +106,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
    algo = choice.algo;
 #else
    size_t free;
-    int err2 = c->ops->property(c->ctx, NULL, NULL, GA_CTX_PROP_FREE_GMEM, &free);
+    int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_FREE_GMEM, &free);

    if (err2 != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
@@ -204,7 +204,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
  }

  if (worksize != 0) {
-    workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL);
+    workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
    if (workspace == NULL) {
      PyErr_SetString(PyExc_RuntimeError,
                      "Could not allocate working memory");
@@ -227,7 +227,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
    APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input));

  if (worksize != 0)
-    c->ops->buffer_release(workspace);
+    gpudata_release(workspace);

  cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_record(output->ga.data, GPUARRAY_CUDA_WAIT_READ);

--- a/theano/gpuarray/dnn_gw.c
+++ b/theano/gpuarray/dnn_gw.c
@@ -107,7 +107,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
    algo = choice.algo;
 #else
    size_t free;
-    int err2 = c->ops->property(c->ctx, NULL, NULL, GA_CTX_PROP_FREE_GMEM, &free);
+    int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_FREE_GMEM, &free);

    if (err2 != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
@@ -192,7 +192,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
  }

  if (worksize != 0) {
-    workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL);
+    workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
    if (workspace == NULL) {
      PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory");
      cuda_exit(c->ctx);
@@ -214,7 +214,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
    APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns));

  if (worksize != 0)
-    c->ops->buffer_release(workspace);
+    gpudata_release(workspace);

  cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
  cuda_record(output->ga.data, GPUARRAY_CUDA_WAIT_READ);

--- a/theano/gpuarray/elemwise.py
+++ b/theano/gpuarray/elemwise.py
@@ -199,7 +199,7 @@ class GpuElemwise(HideC, Elemwise):
                           typecode=o.type.typecode)

        res += """
-        ge = GpuElemwise_new(%(ctx)s->ops, %(ctx)s->ctx, %(support)s, %(kop)s, %(nargs)s, args, %(nd)s, 0);
+        ge = GpuElemwise_new(%(ctx)s->ctx, %(support)s, %(kop)s, %(nargs)s, args, %(nd)s, 0);
        if (ge == NULL) {
           PyErr_SetString(PyExc_RuntimeError, "Could not initialize elemwise support");
           %(fail)s
@@ -360,7 +360,7 @@ class GpuElemwise(HideC, Elemwise):
    def c_code_cache_version(self):
        ver = self.scalar_op.c_code_cache_version()
        if ver:
-            return (6, ver)
+            return (7, ver)
        else:
            return ver

@@ -554,7 +554,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):

    def make_node(self, x):
        x = as_gpuarray_variable(x, infer_context_name(x))
-        if x.type.context.kind != 'cuda':
+        if x.type.context.kind != b'cuda':
            raise TypeError("GpuCAReduceCuda doesn't work for non-cuda devices")
        ret = super(GpuCAReduceCuda, self).make_node(x)
        self = copy.copy(self)

--- a/theano/gpuarray/extra_ops.py
+++ b/theano/gpuarray/extra_ops.py
@@ -26,11 +26,8 @@ class GpuCumsum(GpuKernelBase, Op):
    def __init__(self, axis):
        self.axis = axis

-    def __str__(self):
-        return "%s{%s}" % (self.__class__.__name__, self.axis)
-
-    def c_code_cache_version_apply(self, node):
-        return (1,)
+    def c_code_cache_version(self):
+        return (3,)

    def c_headers(self):
        return ['<numpy_compat.h>', '<gpuarray/types.h>', '<gpuarray_helper.h>']
@@ -221,7 +218,7 @@ class GpuCumsum(GpuKernelBase, Op):
        return kernels

    def c_code(self, node, nodename, inp, out, sub):
-        if node.inputs[0].type.context.kind != 'cuda':
+        if node.inputs[0].type.context.kind != b'cuda':
            raise NotImplementedError("cuda only")
        x, = inp
        z, = out
@@ -249,17 +246,17 @@ class GpuCumsum(GpuKernelBase, Op):
                size_t max_grid_size1;
                size_t max_grid_size2;
                int err;
-                err = %(ctx)s->ops->property(%(ctx)s->ctx, NULL, NULL, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim0);
+                err = gpucontext_property(%(ctx)s->ctx, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim0);
                if (err != GA_NO_ERROR){
                    PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims0");
                    %(fail)s;
                }
-                err = %(ctx)s->ops->property(%(ctx)s->ctx, NULL, NULL, GA_CTX_PROP_MAXGSIZE1, &max_grid_size1);
+                err = gpucontext_property(%(ctx)s->ctx, GA_CTX_PROP_MAXGSIZE1, &max_grid_size1);
                if (err != GA_NO_ERROR){
                    PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_grid_size1");
                    %(fail)s;
                }
-                err = %(ctx)s->ops->property(%(ctx)s->ctx, NULL, NULL, GA_CTX_PROP_MAXGSIZE2, &max_grid_size2);
+                err = gpucontext_property(%(ctx)s->ctx, GA_CTX_PROP_MAXGSIZE2, &max_grid_size2);
                if (err != GA_NO_ERROR){
                    PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_grid_size2");
                    %(fail)s;

--- a/theano/gpuarray/gemm16.c
+++ b/theano/gpuarray/gemm16.c
@@ -117,7 +117,7 @@ int gemm16(PyGpuArrayObject *C, float alpha,
        if (48 < n128 && n128 <= 64) {
          n64 = n / 64;
          if (nprocs == 0)
-            if (A->ga.ops->property(A->context->ctx, NULL, NULL,
+            if (gpucontext_property(A->context->ctx,
                                    GA_CTX_PROP_NUMPROCS, &nprocs)) {
              nprocs = 0;
              res = 1;

--- a/theano/gpuarray/neighbours.py
+++ b/theano/gpuarray/neighbours.py
@@ -243,7 +243,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
        return kernels

    def c_code(self, node, name, inp, out, sub):
-        if node.inputs[0].type.context.kind != 'cuda':
+        if node.inputs[0].type.context.kind != b'cuda':
            raise NotImplementedError("cuda only")
        dtype_ten4 = node.inputs[0].dtype
        dtype_neib_shape = node.inputs[1].dtype

--- a/theano/gpuarray/nerv.py
+++ b/theano/gpuarray/nerv.py
@@ -105,7 +105,7 @@ class Gemm16(COp):
        return """
 bcode = bin_%(name)s;
 sz = sizeof(bin_%(name)s);
-if (GpuKernel_init(&k_%(name)s, c->ops, c->ctx, 1, &bcode, &sz,
+if (GpuKernel_init(&k_%(name)s, c->ctx, 1, &bcode, &sz,
                   "hgemm_%(name)s", 13, types, GA_USE_BINARY, NULL)
    != GA_NO_ERROR) {
  PyErr_SetString(PyExc_RuntimeError, "Could not initialize kernel %(name)s");

--- a/theano/gpuarray/nnet.py
+++ b/theano/gpuarray/nnet.py
@@ -189,7 +189,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
                       flags=flags, objvar=k_var)]

    def c_code(self, node, nodename, inp, out, sub):
-        if node.inputs[0].type.context.kind != 'cuda':
+        if node.inputs[0].type.context.kind != b'cuda':
            raise NotImplementedError('cuda only')
        typecode_x = pygpu.gpuarray.dtype_to_typecode(node.inputs[0].dtype)
        typecode_b = pygpu.gpuarray.dtype_to_typecode(node.inputs[1].dtype)
@@ -375,7 +375,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
        return ['<numpy_compat.h>', '<gpuarray/types.h>']

    def c_code(self, node, nodename, inp, out, sub):
-        if node.inputs[0].type.context.kind != 'cuda':
+        if node.inputs[0].type.context.kind != b'cuda':
            raise NotImplementedError("cuda only")
        typecode_dx = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
        itemsize_dnll = numpy.dtype(node.inputs[0].dtype).itemsize
@@ -584,7 +584,7 @@ class GpuSoftmax(GpuKernelBase, Op):
        return ['<numpy_compat.h>', '<gpuarray/types.h>']

    def c_code(self, node, nodename, inp, out, sub):
-        if node.inputs[0].type.context.kind != 'cuda':
+        if node.inputs[0].type.context.kind != b'cuda':
            raise NotImplementedError("cuda only")
        dtype_x = node.inputs[0].dtype
        work_x = work_dtype(dtype_x)
@@ -783,7 +783,7 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op):
        return ['<numpy_compat.h>', '<gpuarray/types.h>']

    def c_code(self, node, nodename, inp, out, sub):
-        if node.inputs[0].type.context.kind != 'cuda':
+        if node.inputs[0].type.context.kind != b'cuda':
            raise NotImplementedError('cuda only')
        dtype_x = node.inputs[0].dtype
        dtype_b = node.inputs[1].dtype

--- a/theano/gpuarray/opt.py
+++ b/theano/gpuarray/opt.py
@@ -146,7 +146,7 @@ def op_lifter(OP, cuda_only=False):
                # Check if we should replace
                if (not replace or
                    (cuda_only and
-                     get_context(context_name).kind != 'cuda')):
+                     get_context(context_name).kind != b'cuda')):
                    return False

                # tag the inputs with the context in case
@@ -643,7 +643,7 @@ def local_gpua_advanced_subtensor(node, context_name):
 def local_gpua_advanced_incsubtensor(node, context_name):
    context = get_context(context_name)
    # This is disabled on non-cuda contexts
-    if context.kind != 'cuda':
+    if context.kind != b'cuda':
        return None

    x, y, ilist = node.inputs
@@ -674,12 +674,12 @@ def local_gpua_careduce(node, context_name):
    if isinstance(node.op.scalar_op, (scalar.Add, scalar.Mul,
                                      scalar.Maximum, scalar.Minimum)):
        ctx = get_context(context_name)
-        if ctx.kind == 'opencl':
+        if ctx.kind == b'opencl':
            op = GpuCAReduceCPY
            if node.op.scalar_op not in [scalar.add, scalar.mul]:
                # We don't support yet all reduction with cpy code.
                return
-        elif ctx.kind == 'cuda':
+        elif ctx.kind == b'cuda':
            op = GpuCAReduceCuda
        else:
            return False

--- a/theano/gpuarray/subtensor.py
+++ b/theano/gpuarray/subtensor.py
@@ -340,7 +340,7 @@ class GpuIncSubtensor(IncSubtensor):
        args[1].name = "b";
        args[1].typecode = %(type2)s;
        args[1].flags = GE_READ;
-        iadd = GpuElemwise_new(%(ctx)s->ops, %(ctx)s->ctx, "", "a += b",
+        iadd = GpuElemwise_new(%(ctx)s->ctx, "", "a += b",
                               2, args, %(nd)s, 0);
        if (iadd == NULL) {
          PyErr_SetString(PyExc_RuntimeError, "Could not intialize inplace add support");
@@ -369,7 +369,7 @@ class GpuIncSubtensor(IncSubtensor):
        parent_version = super(GpuIncSubtensor, self).c_code_cache_version()
        if not parent_version:
            return
-        return parent_version + (5,)
+        return parent_version + (6,)


 class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1):
@@ -437,8 +437,7 @@ if (err != GA_NO_ERROR) {
  if (err == GA_VALUE_ERROR) {
    PyErr_SetString(PyExc_IndexError, "Index out of bounds.");
  } else {
-    PyErr_SetString(PyExc_RuntimeError, Gpu_error(%(v)s->context->ops,
-                                                  %(v)s->context->ctx, err));
+    PyErr_SetString(PyExc_RuntimeError, GpuArray_error(&%(v)s->ga, err));
  }
  %(fail)s
 }
@@ -589,7 +588,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, GpuAdvancedIncSubtensor1):
        return super(GpuAdvancedIncSubtensor1_dev20, self).perform(node, inp, out)

    def c_code_cache_version(self):
-        return (6,)
+        return (8,)

    def c_headers(self):
        return ['<numpy_compat.h>', '<gpuarray_helper.h>',
@@ -600,7 +599,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, GpuAdvancedIncSubtensor1):

    def c_code(self, node, name, inputs, outputs, sub):
        ctx = self.get_params(node)
-        if ctx.kind != 'cuda':
+        if ctx.kind != b'cuda':
            raise NotImplementedError("cuda only")
        if (self.set_instead_of_inc or
                node.inputs[0].ndim != node.inputs[1].ndim or
@@ -757,8 +756,8 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
            int err, kerr = 0;

            if (threads_per_block[0] > 0 && n_blocks[0] > 0) {
-              err = py_self->ga.ops->property(NULL, py_self->ga.data, NULL,
-                                              GA_CTX_PROP_ERRBUF, &errbuf);
+              err = gpudata_property(py_self->ga.data,
+                                     GA_CTX_PROP_ERRBUF, &errbuf);
              if (err != GA_NO_ERROR) {
                PyErr_SetString(PyExc_RuntimeError, "Can't fetch error buffer");
                return 1;
@@ -793,7 +792,7 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
                             GpuKernel_error(&%(k_var)s, err));
                return 1;
              }
-              err = py_self->ga.ops->buffer_read(&kerr, errbuf, 0, sizeof(int));
+              err = gpudata_read(&kerr, errbuf, 0, sizeof(int));
              if (err != GA_NO_ERROR) {
                PyErr_SetString(PyExc_RuntimeError, "Can't read error buffer");
                return 1;
@@ -801,7 +800,7 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
              if (kerr != 0) {
                PyErr_SetString(PyExc_IndexError, "Index out of bounds");
                kerr = 0;
-                py_self->ga.ops->buffer_write(errbuf, 0, &kerr, sizeof(int));
+                gpudata_write(errbuf, 0, &kerr, sizeof(int));
                return 1;
              }
            }

--- a/theano/gpuarray/tests/test_elemwise.py
+++ b/theano/gpuarray/tests/test_elemwise.py
@@ -197,7 +197,7 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):

    def setUp(self):
        super(test_GpuCAReduceCuda, self).setUp()
-        if get_context(test_ctx_name).kind != 'cuda':
+        if get_context(test_ctx_name).kind != b'cuda':
            raise SkipTest("Cuda specific tests")


@@ -212,7 +212,7 @@ class T_gpureduce_dtype(test_elemwise.T_reduce_dtype):
              'float32', 'float64']

    def setUp(self):
-        if get_context(test_ctx_name).kind != 'cuda':
+        if get_context(test_ctx_name).kind != b'cuda':
            raise SkipTest("Cuda specific tests")



--- a/theano/gpuarray/tests/test_extra_ops.py
+++ b/theano/gpuarray/tests/test_extra_ops.py
@@ -24,7 +24,7 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
    def setUp(self):
        super(TestGpuCumsum, self).setUp()
        test_ctx = get_context(test_ctx_name)
-        if test_ctx.kind != 'cuda':
+        if test_ctx.kind != b'cuda':
            raise SkipTest("Cuda specific tests")
        self.max_threads_dim0 = test_ctx.maxlsize0
        self.max_grid_size1 = test_ctx.maxgsize2

--- a/theano/gpuarray/tests/test_opt.py
+++ b/theano/gpuarray/tests/test_opt.py
@@ -125,7 +125,7 @@ def test_reduce():
        topo = f.maker.fgraph.toposort()
        ops = [type(node.op) for node in topo]

-        if kind == 'opencl' and method in ["max", "min"]:
+        if kind == b'opencl' and method in ["max", "min"]:
            assert not(GpuCAReduceCuda in ops or GpuCAReduceCPY in ops)
        else:
            assert GpuCAReduceCuda in ops or GpuCAReduceCPY in ops